瀏覽代碼

merge trunk + branch sched_ctx part 2

Andra Hugo 13 年之前
父節點
當前提交
131269917d
共有 96 個文件被更改,包括 6704 次插入和 1476 次删除
  1. 1 0
      src/.gitignore
  2. 3 0
      src/common/.gitignore
  3. 47 13
      src/common/barrier.c
  4. 4 2
      src/common/barrier.h
  5. 48 18
      src/common/fxt.c
  6. 50 22
      src/common/fxt.h
  7. 3 3
      src/common/starpu_spinlock.c
  8. 972 0
      src/common/uthash.h
  9. 4 4
      src/common/utils.h
  10. 3 1
      src/core/combined_workers.c
  11. 9 3
      src/core/debug.c
  12. 4 1
      src/core/debug.h
  13. 5 3
      src/core/dependencies/cg.c
  14. 10 4
      src/core/dependencies/implicit_data_deps.c
  15. 1 1
      src/core/dependencies/implicit_data_deps.h
  16. 1 3
      src/core/dependencies/tags.c
  17. 14 6
      src/core/jobs.c
  18. 2 1
      src/core/jobs.h
  19. 29 10
      src/core/perfmodel/perfmodel.c
  20. 2 2
      src/core/perfmodel/perfmodel.h
  21. 71 36
      src/core/perfmodel/perfmodel_bus.c
  22. 19 30
      src/core/perfmodel/perfmodel_history.c
  23. 12 16
      src/core/sched_policy.c
  24. 9 1
      src/core/task.c
  25. 22 9
      src/core/topology.c
  26. 57 10
      src/core/workers.c
  27. 2 2
      src/core/workers.h
  28. 256 142
      src/datawizard/coherency.c
  29. 8 3
      src/datawizard/coherency.h
  30. 68 41
      src/datawizard/copy_driver.c
  31. 8 3
      src/datawizard/copy_driver.h
  32. 27 36
      src/datawizard/data_request.c
  33. 2 6
      src/datawizard/data_request.h
  34. 40 12
      src/datawizard/filters.c
  35. 6 1
      src/datawizard/footprint.c
  36. 1 1
      src/datawizard/footprint.h
  37. 2 2
      src/datawizard/interfaces/bcsr_filters.c
  38. 55 55
      src/datawizard/interfaces/bcsr_interface.c
  39. 2 2
      src/datawizard/interfaces/block_filters.c
  40. 72 60
      src/datawizard/interfaces/block_interface.c
  41. 2 2
      src/datawizard/interfaces/csr_filters.c
  42. 240 50
      src/datawizard/interfaces/csr_interface.c
  43. 140 5
      src/datawizard/interfaces/data_interface.c
  44. 10 2
      src/datawizard/interfaces/data_interface.h
  45. 3 3
      src/datawizard/interfaces/matrix_filters.c
  46. 201 110
      src/datawizard/interfaces/matrix_interface.c
  47. 136 60
      src/datawizard/interfaces/variable_interface.c
  48. 5 5
      src/datawizard/interfaces/vector_filters.c
  49. 154 71
      src/datawizard/interfaces/vector_interface.c
  50. 32 30
      src/datawizard/interfaces/void_interface.c
  51. 41 3
      src/datawizard/memalloc.c
  52. 8 1
      src/datawizard/memory_nodes.c
  53. 5 1
      src/datawizard/memory_nodes.h
  54. 2 1
      src/datawizard/sort_data_handles.c
  55. 23 4
      src/datawizard/user_interactions.c
  56. 12 21
      src/datawizard/write_back.c
  57. 7 6
      src/debug/structures_size.c
  58. 1252 0
      src/debug/traces/starpu_fxt.c
  59. 63 0
      src/debug/traces/starpu_fxt.h
  60. 107 0
      src/debug/traces/starpu_fxt_dag.c
  61. 239 0
      src/debug/traces/starpu_fxt_mpi.c
  62. 157 0
      src/debug/traces/starpu_paje.c
  63. 11 52
      src/drivers/cpu/driver_cpu.c
  64. 65 38
      src/drivers/cuda/driver_cuda.c
  65. 65 3
      src/drivers/driver_common/driver_common.c
  66. 4 3
      src/drivers/driver_common/driver_common.h
  67. 5 3
      src/drivers/gordon/driver_gordon.c
  68. 62 63
      src/drivers/opencl/driver_opencl.c
  69. 175 6
      src/drivers/opencl/driver_opencl_utils.c
  70. 6 10
      src/profiling/bound.c
  71. 4 3
      src/profiling/profiling.c
  72. 3 1
      src/profiling/profiling.h
  73. 15 2
      src/profiling/profiling_helpers.c
  74. 15 35
      src/sched_policies/deque_modeling_policy_data_aware.c
  75. 39 5
      src/sched_policies/detect_combined_workers.c
  76. 1 25
      src/sched_policies/eager_central_policy.c
  77. 1 2
      src/sched_policies/eager_central_priority_policy.c
  78. 4 15
      src/sched_policies/fifo_queues.c
  79. 0 1
      src/sched_policies/fifo_queues.h
  80. 39 23
      src/sched_policies/heft.c
  81. 2 2
      src/sched_policies/parallel_greedy.c
  82. 129 190
      src/sched_policies/parallel_heft.c
  83. 2 10
      src/sched_policies/random_policy.c
  84. 5 16
      src/sched_policies/stack_queues.c
  85. 0 1
      src/sched_policies/stack_queues.h
  86. 4 3
      src/sched_policies/work_stealing_policy.c
  87. 756 0
      src/top/starpu_top.c
  88. 168 0
      src/top/starputop_connection.c
  89. 44 0
      src/top/starputop_connection.h
  90. 109 0
      src/top/starputop_message_queue.c
  91. 50 0
      src/top/starputop_message_queue.h
  92. 97 0
      src/top/starputop_task.c
  93. 7 7
      src/util/malloc.c
  94. 6 4
      src/util/starpu_cublas.c
  95. 2 2
      src/util/starpu_insert_task.c
  96. 19 11
      src/util/starpu_insert_task_utils.c

+ 1 - 0
src/.gitignore

@@ -0,0 +1 @@
+/.deps

+ 3 - 0
src/common/.gitignore

@@ -0,0 +1,3 @@
+/stamp-h1
+/config.h
+/config.h.in

+ 47 - 13
src/common/barrier.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010,2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,20 +15,46 @@
  */
 
 #include <common/barrier.h>
+#include <common/utils.h>
 
 int _starpu_barrier_init(_starpu_barrier_t *barrier, int count)
 {
 	barrier->count = count;
-	barrier->reached = 0;
-	pthread_mutex_init(&barrier->mutex,NULL);
-	pthread_cond_init(&barrier->cond,NULL);
+	barrier->reached_start = 0;
+	barrier->reached_exit = 0;
+	PTHREAD_MUTEX_INIT(&barrier->mutex, NULL);
+	PTHREAD_MUTEX_INIT(&barrier->mutex_exit, NULL);
+	PTHREAD_COND_INIT(&barrier->cond, NULL);
 	return 0;
 }
 
+static
+int _starpu_barrier_test(_starpu_barrier_t *barrier)
+{
+    /*
+     * Check whether every thread has left the barrier: report EBUSY
+     * as long as reached_exit differs from count, 0 otherwise.
+     */
+        PTHREAD_MUTEX_LOCK(&barrier->mutex_exit);
+        if (barrier->reached_exit != barrier->count) {
+                PTHREAD_MUTEX_UNLOCK(&barrier->mutex_exit);
+                return EBUSY;
+        }
+        PTHREAD_MUTEX_UNLOCK(&barrier->mutex_exit);
+        return 0;
+}
+
 int _starpu_barrier_destroy(_starpu_barrier_t *barrier)
 {
-	pthread_mutex_destroy(&barrier->mutex);
-	pthread_cond_destroy(&barrier->cond);
+	int ret = _starpu_barrier_test(barrier);
+	while (ret == EBUSY) {
+		ret = _starpu_barrier_test(barrier);
+	}
+	_STARPU_DEBUG("reached_exit %d\n", barrier->reached_exit);
+
+	PTHREAD_MUTEX_DESTROY(&barrier->mutex);
+	PTHREAD_MUTEX_DESTROY(&barrier->mutex_exit);
+	PTHREAD_COND_DESTROY(&barrier->cond);
 	return 0;
 }
 
@@ -36,18 +62,26 @@ int _starpu_barrier_wait(_starpu_barrier_t *barrier)
 {
 	int ret=0;
 
-	pthread_mutex_lock(&barrier->mutex);
-	barrier->reached++;
-	if (barrier->reached == barrier->count)
+        // Wait until all threads enter the barrier
+	PTHREAD_MUTEX_LOCK(&barrier->mutex);
+	barrier->reached_exit=0;
+	barrier->reached_start++;
+	if (barrier->reached_start == barrier->count)
 	{
-		barrier->reached = 0;
-		pthread_cond_broadcast(&barrier->cond);
+		barrier->reached_start = 0;
+		PTHREAD_COND_BROADCAST(&barrier->cond);
 		ret = PTHREAD_BARRIER_SERIAL_THREAD;
 	}
 	else
 	{
-		pthread_cond_wait(&barrier->cond,&barrier->mutex);
+                PTHREAD_COND_WAIT(&barrier->cond,&barrier->mutex);
 	}
-	pthread_mutex_unlock(&barrier->mutex);
+	PTHREAD_MUTEX_UNLOCK(&barrier->mutex);
+
+        // Count number of threads that exit the barrier
+	PTHREAD_MUTEX_LOCK(&barrier->mutex_exit);
+	barrier->reached_exit ++;
+	PTHREAD_MUTEX_UNLOCK(&barrier->mutex_exit);
+
 	return ret;
 }

+ 4 - 2
src/common/barrier.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,8 +21,10 @@
 
 typedef struct {
 	int count;
-	int reached;
+	int reached_start;
+	int reached_exit;
 	pthread_mutex_t mutex;
+	pthread_mutex_t mutex_exit;
 	pthread_cond_t cond;
 } _starpu_barrier_t;
 

+ 48 - 18
src/common/fxt.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,8 +17,11 @@
 
 #include <starpu.h>
 #include <common/config.h>
-#ifdef STARPU_USE_FXT
+#include <common/utils.h>
+#include <starpu_util.h>
+#include <starpu_profiling.h>
 
+#ifdef STARPU_USE_FXT
 #include <common/fxt.h>
 
 #ifdef STARPU_HAVE_WINDOWS
@@ -32,39 +35,44 @@ static int fxt_started = 0;
 
 static int written = 0;
 
-static void profile_set_tracefile(char *fmt, ...)
+static int id;
+
+static void _profile_set_tracefile(void *last, ...)
 {
 	va_list vl;
 	char *user;
-	
-	va_start(vl, fmt);
-	vsprintf(PROF_FILE_USER, fmt, vl);
+
+        char *fxt_prefix = getenv("STARPU_FXT_PREFIX");
+        if (!fxt_prefix)
+			fxt_prefix = "/tmp/";
+
+	va_start(vl, last);
+	vsprintf(PROF_FILE_USER, fxt_prefix, vl);
 	va_end(vl);
 
 	user = getenv("USER");
 	if (!user)
 		user = "";
 
-	int pid = getpid();
-
 	char suffix[128];
-	snprintf(suffix, 128, "prof_file_%s_%d", user, pid);
+	snprintf(suffix, 128, "prof_file_%s_%d", user, id);
 
 	strcat(PROF_FILE_USER, suffix);
 }
 
+void starpu_set_profiling_id(int new_id) {
+        _STARPU_DEBUG("Set id to <%d>\n", new_id);
+	id = new_id;
+        _profile_set_tracefile(NULL);
+}
+
 void _starpu_start_fxt_profiling(void)
 {
 	unsigned threadid;
 
 	if (!fxt_started) {
 		fxt_started = 1;
-
-		char *fxt_prefix = getenv("STARPU_FXT_PREFIX");
-		if (!fxt_prefix)
-			fxt_prefix = "/tmp/";
-
-		profile_set_tracefile(fxt_prefix);
+		_profile_set_tracefile(NULL);
 	}
 
 	threadid = syscall(SYS_gettid);
@@ -81,6 +89,23 @@ void _starpu_start_fxt_profiling(void)
 	return;
 }
 
+static void generate_paje_trace(char *input_fxt_filename, char *output_paje_filename)
+{
+	/* We take default options */
+	struct starpu_fxt_options options;
+	starpu_fxt_options_init(&options);
+
+	/* TODO parse some STARPU_GENERATE_TRACE_OPTIONS env variable */
+
+	options.ninputfiles = 1;
+	options.filenames[0] = input_fxt_filename;
+	options.out_paje_path = output_paje_filename;
+	options.file_prefix = "";
+	options.file_rank = -1;
+
+	starpu_fxt_generate_trace(&options);
+}
+
 void _starpu_stop_fxt_profiling(void)
 {
 	if (!written)
@@ -92,6 +117,11 @@ void _starpu_stop_fxt_profiling(void)
 #endif
 		fut_endup(PROF_FILE_USER);
 
+		/* Should we generate a Paje trace directly ? */
+		int generate_trace = starpu_get_env_number("STARPU_GENERATE_TRACE");
+		if (generate_trace == 1)
+			generate_paje_trace(PROF_FILE_USER, "paje.trace");
+
 		int ret = fut_done();
 		if (ret < 0)
 		{
@@ -109,9 +139,9 @@ void _starpu_fxt_register_thread(unsigned cpuid)
 	FUT_DO_PROBE2(FUT_NEW_LWP_CODE, cpuid, syscall(SYS_gettid));
 }
 
-#endif
+#endif // STARPU_USE_FXT
 
-void starpu_trace_user_event(unsigned long code __attribute__((unused)))
+void starpu_trace_user_event(unsigned long code STARPU_ATTRIBUTE_UNUSED)
 {
 #ifdef STARPU_USE_FXT
 	STARPU_TRACE_USER_EVENT(code);

+ 50 - 22
src/common/fxt.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -117,33 +117,58 @@ void _starpu_fxt_register_thread(unsigned);
 /* Sometimes we need something a little more specific than the wrappers from
  * FxT: these macros make it possible to add an event with 3 (or 4) numbers followed
  * by a string. */
-#define STARPU_FUT_DO_PROBE3STR(CODE, P1, P2, P3, str)				\
+#define STARPU_FUT_DO_PROBE3STR(CODE, P1, P2, P3, str)			\
 do {									\
+	/* No more than FXT_MAX_PARAMS args are allowed */		\
 	/* we add a \0 just in case ... */				\
-	size_t len = strlen((str)) + 1;					\
-	unsigned nbargs = 3 + (len + sizeof(unsigned long) - 1)/(sizeof(unsigned long));\
+	size_t len = STARPU_MIN(strlen(str)+1, (FXT_MAX_PARAMS - 3)*sizeof(unsigned long));\
+	unsigned nbargs_str = (len + sizeof(unsigned long) - 1)/(sizeof(unsigned long));\
+	unsigned nbargs = 3 + nbargs_str;				\
 	size_t total_len = FUT_SIZE(nbargs);				\
-	unsigned long *args =						\
+	unsigned long *futargs =					\
 		fut_getstampedbuffer(FUT_CODE(CODE, nbargs), total_len);\
-	*(args++) = (unsigned long)(P1);				\
-	*(args++) = (unsigned long)(P2);				\
-	*(args++) = (unsigned long)(P3);				\
-	sprintf((char *)args, "%s", str);				\
+	*(futargs++) = (unsigned long)(P1);				\
+	*(futargs++) = (unsigned long)(P2);				\
+	*(futargs++) = (unsigned long)(P3);				\
+	snprintf((char *)futargs, len, "%s", str);			\
+	((char *)futargs)[len - 1] = '\0';				\
 } while (0);
 
 #define STARPU_FUT_DO_PROBE4STR(CODE, P1, P2, P3, P4, str)		\
 do {									\
+	/* No more than FXT_MAX_PARAMS args are allowed */		\
 	/* we add a \0 just in case ... */				\
-	size_t len = strlen((str)) + 1;					\
-	unsigned nbargs = 4 + (len + sizeof(unsigned long) - 1)/(sizeof(unsigned long));\
+	size_t len = STARPU_MIN(strlen(str)+1, (FXT_MAX_PARAMS - 4)*sizeof(unsigned long));\
+	unsigned nbargs_str = (len + sizeof(unsigned long) - 1)/(sizeof(unsigned long));\
+	unsigned nbargs = 4 + nbargs_str;				\
 	size_t total_len = FUT_SIZE(nbargs);				\
-	unsigned long *args =						\
+	unsigned long *futargs =						\
 		fut_getstampedbuffer(FUT_CODE(CODE, nbargs), total_len);\
-	*(args++) = (unsigned long)(P1);				\
-	*(args++) = (unsigned long)(P2);				\
-	*(args++) = (unsigned long)(P3);				\
-	*(args++) = (unsigned long)(P4);				\
-	sprintf((char *)args, "%s", str);				\
+	*(futargs++) = (unsigned long)(P1);				\
+	*(futargs++) = (unsigned long)(P2);				\
+	*(futargs++) = (unsigned long)(P3);				\
+	*(futargs++) = (unsigned long)(P4);				\
+	snprintf((char *)futargs, len, "%s", str);			\
+	((char *)futargs)[len - 1] = '\0';				\
+} while (0);
+
+#define STARPU_FUT_DO_PROBE5STR(CODE, P1, P2, P3, P4, P5, str)		\
+do {									\
+	/* No more than FXT_MAX_PARAMS args are allowed */		\
+	/* we add a \0 just in case ... */				\
+	size_t len = STARPU_MIN(strlen(str)+1, (FXT_MAX_PARAMS - 5)*sizeof(unsigned long));\
+	unsigned nbargs_str = (len + sizeof(unsigned long) - 1)/(sizeof(unsigned long));\
+	unsigned nbargs = 5 + nbargs_str;				\
+	size_t total_len = FUT_SIZE(nbargs);				\
+	unsigned long *futargs =					\
+		fut_getstampedbuffer(FUT_CODE(CODE, nbargs), total_len);\
+	*(futargs++) = (unsigned long)(P1);				\
+	*(futargs++) = (unsigned long)(P2);				\
+	*(futargs++) = (unsigned long)(P3);				\
+	*(futargs++) = (unsigned long)(P4);				\
+	*(futargs++) = (unsigned long)(P5);				\
+	snprintf((char *)futargs, len, "%s", str);			\
+	((char *)futargs)[len - 1] = '\0';				\
 } while (0);
 
 
@@ -160,7 +185,7 @@ do {									\
 
 #define STARPU_TRACE_START_CODELET_BODY(job)				\
 do {									\
-        const char *model_name = _starpu_get_model_name((job));               \
+        const char *model_name = _starpu_get_model_name((job));         \
 	if (model_name)                                                 \
 	{								\
 		/* we include the symbol name */			\
@@ -171,9 +196,12 @@ do {									\
 	}								\
 } while(0);
 
-
-#define STARPU_TRACE_END_CODELET_BODY(job)	\
-	FUT_DO_PROBE2(STARPU_FUT_END_CODELET_BODY, job, syscall(SYS_gettid));
+#define STARPU_TRACE_END_CODELET_BODY(job, archtype)			\
+do {									\
+	const size_t job_size = _starpu_job_get_data_size((job));	\
+	const uint32_t job_hash = _starpu_compute_buffers_footprint(job);\
+	FUT_DO_PROBE5(STARPU_FUT_END_CODELET_BODY, job, (job_size), (job_hash), (archtype), syscall(SYS_gettid));	\
+} while(0);
 
 #define STARPU_TRACE_START_CALLBACK(job)	\
 	FUT_DO_PROBE2(STARPU_FUT_START_CALLBACK, job, syscall(SYS_gettid));
@@ -315,7 +343,7 @@ do {										\
 #define STARPU_TRACE_WORKER_INIT_START(a,b,c)	do {} while(0);
 #define STARPU_TRACE_WORKER_INIT_END		do {} while(0);
 #define STARPU_TRACE_START_CODELET_BODY(job)	do {} while(0);
-#define STARPU_TRACE_END_CODELET_BODY(job)	do {} while(0);
+#define STARPU_TRACE_END_CODELET_BODY(job, a)	do {} while(0);
 #define STARPU_TRACE_START_CALLBACK(job)	do {} while(0);
 #define STARPU_TRACE_END_CALLBACK(job)		do {} while(0);
 #define STARPU_TRACE_JOB_PUSH(task, prio)	do {} while(0);

+ 3 - 3
src/common/starpu_spinlock.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -44,7 +44,7 @@ int _starpu_spin_init(starpu_spinlock_t *lock)
 #endif
 }
 
-int _starpu_spin_destroy(starpu_spinlock_t *lock)
+int _starpu_spin_destroy(starpu_spinlock_t *lock STARPU_ATTRIBUTE_UNUSED)
 {
 #ifdef STARPU_SPINLOCK_CHECK
 	pthread_mutexattr_destroy(&lock->errcheck_attr);
@@ -101,7 +101,7 @@ int _starpu_spin_trylock(starpu_spinlock_t *lock)
 #endif
 }
 
-int _starpu_spin_unlock(starpu_spinlock_t *lock)
+int _starpu_spin_unlock(starpu_spinlock_t *lock STARPU_ATTRIBUTE_UNUSED)
 {
 #ifdef STARPU_SPINLOCK_CHECK
 	int ret = pthread_mutex_unlock(&lock->errcheck_lock);

+ 972 - 0
src/common/uthash.h

@@ -0,0 +1,972 @@
+/*
+Copyright (c) 2003-2010, Troy D. Hanson     http://uthash.sourceforge.net
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef UTHASH_H
+#define UTHASH_H 
+
+#include <string.h>   /* memcmp,strlen */
+#include <stddef.h>   /* ptrdiff_t */
+
+/* These macros use decltype or the earlier __typeof GNU extension.
+   As decltype is only available in newer compilers (VS2010 or gcc 4.3+
+   when compiling c++ source) this code uses whatever method is needed
+   or, for VS2008 where neither is available, uses casting workarounds. */
+#ifdef _MSC_VER         /* MS compiler */
+#if _MSC_VER >= 1600 && defined(__cplusplus)  /* VS2010 or newer in C++ mode */
+#define DECLTYPE(x) (decltype(x))
+#else                   /* VS2008 or older (or VS2010 in C mode) */
+#define NO_DECLTYPE
+#define DECLTYPE(x)
+#endif
+#else                   /* GNU, Sun and other compilers */
+#define DECLTYPE(x) (__typeof(x))
+#endif
+
+#ifdef NO_DECLTYPE
+#define DECLTYPE_ASSIGN(dst,src)                                                 \
+do {                                                                             \
+  char **_da_dst = (char**)(&(dst));                                             \
+  *_da_dst = (char*)(src);                                                       \
+} while(0)
+#else 
+#define DECLTYPE_ASSIGN(dst,src)                                                 \
+do {                                                                             \
+  (dst) = DECLTYPE(dst)(src);                                                    \
+} while(0)
+#endif
+
+/* a number of the hash functions use uint32_t, which isn't defined on win32 */
+#ifdef _MSC_VER
+typedef unsigned int uint32_t;
+#else
+#include <inttypes.h>   /* uint32_t */
+#endif
+
+#define UTHASH_VERSION 1.9.3
+
+#define uthash_fatal(msg) exit(-1)        /* fatal error (out of memory,etc) */
+#define uthash_malloc(sz) malloc(sz)      /* malloc fcn                      */
+#define uthash_free(ptr,sz) free(ptr)     /* free fcn                        */
+
+#define uthash_noexpand_fyi(tbl)          /* can be defined to log noexpand  */
+#define uthash_expand_fyi(tbl)            /* can be defined to log expands   */
+
+/* initial number of buckets */
+#define HASH_INITIAL_NUM_BUCKETS 32      /* initial number of buckets        */
+#define HASH_INITIAL_NUM_BUCKETS_LOG2 5  /* lg2 of initial number of buckets */
+#define HASH_BKT_CAPACITY_THRESH 10      /* expand when bucket count reaches */
+
+/* calculate the element whose hash handle address is hhp */
+#define ELMT_FROM_HH(tbl,hhp) ((void*)(((char*)(hhp)) - ((tbl)->hho)))
+
+#define HASH_FIND(hh,head,keyptr,keylen,out)                                     \
+do {                                                                             \
+  unsigned _hf_bkt,_hf_hashv;                                                    \
+  out=NULL;                                                                      \
+  if (head) {                                                                    \
+     HASH_FCN(keyptr,keylen, (head)->hh.tbl->num_buckets, _hf_hashv, _hf_bkt);   \
+     if (HASH_BLOOM_TEST((head)->hh.tbl, _hf_hashv)) {                           \
+       HASH_FIND_IN_BKT((head)->hh.tbl, hh, (head)->hh.tbl->buckets[ _hf_bkt ],  \
+                        keyptr,keylen,out);                                      \
+     }                                                                           \
+  }                                                                              \
+} while (0)
+
+#ifdef HASH_BLOOM
+#define HASH_BLOOM_BITLEN (1ULL << HASH_BLOOM)
+#define HASH_BLOOM_BYTELEN (HASH_BLOOM_BITLEN/8) + ((HASH_BLOOM_BITLEN%8) ? 1:0)
+#define HASH_BLOOM_MAKE(tbl)                                                     \
+do {                                                                             \
+  (tbl)->bloom_nbits = HASH_BLOOM;                                               \
+  (tbl)->bloom_bv = (uint8_t*)uthash_malloc(HASH_BLOOM_BYTELEN);                 \
+  if (!((tbl)->bloom_bv))  { uthash_fatal( "out of memory"); }                   \
+  memset((tbl)->bloom_bv, 0, HASH_BLOOM_BYTELEN);                                \
+  (tbl)->bloom_sig = HASH_BLOOM_SIGNATURE;                                       \
+} while (0);
+
+#define HASH_BLOOM_FREE(tbl)                                                     \
+do {                                                                             \
+  uthash_free((tbl)->bloom_bv, HASH_BLOOM_BYTELEN);                              \
+} while (0);
+
+#define HASH_BLOOM_BITSET(bv,idx) (bv[(idx)/8] |= (1U << ((idx)%8)))
+#define HASH_BLOOM_BITTEST(bv,idx) (bv[(idx)/8] & (1U << ((idx)%8)))
+
+#define HASH_BLOOM_ADD(tbl,hashv)                                                \
+  HASH_BLOOM_BITSET((tbl)->bloom_bv, (hashv & (uint32_t)((1ULL << (tbl)->bloom_nbits) - 1)))
+
+#define HASH_BLOOM_TEST(tbl,hashv)                                               \
+  HASH_BLOOM_BITTEST((tbl)->bloom_bv, (hashv & (uint32_t)((1ULL << (tbl)->bloom_nbits) - 1)))
+
+#else
+#define HASH_BLOOM_MAKE(tbl) 
+#define HASH_BLOOM_FREE(tbl) 
+#define HASH_BLOOM_ADD(tbl,hashv) 
+#define HASH_BLOOM_TEST(tbl,hashv) (1)
+#endif
+
+#define HASH_MAKE_TABLE(hh,head)                                                 \
+do {                                                                             \
+  (head)->hh.tbl = (UT_hash_table*)uthash_malloc(                                \
+                  sizeof(UT_hash_table));                                        \
+  if (!((head)->hh.tbl))  { uthash_fatal( "out of memory"); }                    \
+  memset((head)->hh.tbl, 0, sizeof(UT_hash_table));                              \
+  (head)->hh.tbl->tail = &((head)->hh);                                          \
+  (head)->hh.tbl->num_buckets = HASH_INITIAL_NUM_BUCKETS;                        \
+  (head)->hh.tbl->log2_num_buckets = HASH_INITIAL_NUM_BUCKETS_LOG2;              \
+  (head)->hh.tbl->hho = (char*)(&(head)->hh) - (char*)(head);                    \
+  (head)->hh.tbl->buckets = (UT_hash_bucket*)uthash_malloc(                      \
+          HASH_INITIAL_NUM_BUCKETS*sizeof(struct UT_hash_bucket));               \
+  if (! (head)->hh.tbl->buckets) { uthash_fatal( "out of memory"); }             \
+  memset((head)->hh.tbl->buckets, 0,                                             \
+          HASH_INITIAL_NUM_BUCKETS*sizeof(struct UT_hash_bucket));               \
+  HASH_BLOOM_MAKE((head)->hh.tbl);                                               \
+  (head)->hh.tbl->signature = HASH_SIGNATURE;                                    \
+} while(0)
+
+#define HASH_ADD(hh,head,fieldname,keylen_in,add)                                \
+        HASH_ADD_KEYPTR(hh,head,&add->fieldname,keylen_in,add)
+ 
+#define HASH_ADD_KEYPTR(hh,head,keyptr,keylen_in,add)                            \
+do {                                                                             \
+ unsigned _ha_bkt;                                                               \
+ (add)->hh.next = NULL;                                                          \
+ (add)->hh.key = (char*)keyptr;                                                  \
+ (add)->hh.keylen = keylen_in;                                                   \
+ if (!(head)) {                                                                  \
+    head = (add);                                                                \
+    (head)->hh.prev = NULL;                                                      \
+    HASH_MAKE_TABLE(hh,head);                                                    \
+ } else {                                                                        \
+    (head)->hh.tbl->tail->next = (add);                                          \
+    (add)->hh.prev = ELMT_FROM_HH((head)->hh.tbl, (head)->hh.tbl->tail);         \
+    (head)->hh.tbl->tail = &((add)->hh);                                         \
+ }                                                                               \
+ (head)->hh.tbl->num_items++;                                                    \
+ (add)->hh.tbl = (head)->hh.tbl;                                                 \
+ HASH_FCN(keyptr,keylen_in, (head)->hh.tbl->num_buckets,                         \
+         (add)->hh.hashv, _ha_bkt);                                              \
+ HASH_ADD_TO_BKT((head)->hh.tbl->buckets[_ha_bkt],&(add)->hh);                   \
+ HASH_BLOOM_ADD((head)->hh.tbl,(add)->hh.hashv);                                 \
+ HASH_EMIT_KEY(hh,head,keyptr,keylen_in);                                        \
+ HASH_FSCK(hh,head);                                                             \
+} while(0)
+
+#define HASH_TO_BKT( hashv, num_bkts, bkt )                                      \
+do {                                                                             \
+  bkt = ((hashv) & ((num_bkts) - 1));                                            \
+} while(0)
+
+/* delete "delptr" from the hash table.
+ * "the usual" patch-up process for the app-order doubly-linked-list.
+ * The use of _hd_hh_del below deserves special explanation.
+ * These used to be expressed using (delptr) but that led to a bug
+ * if someone used the same symbol for the head and deletee, like
+ *  HASH_DELETE(hh,users,users);
+ * We want that to work, but by changing the head (users) below
+ * we were forfeiting our ability to further refer to the deletee (users)
+ * in the patch-up process. Solution: use scratch space to
+ * copy the deletee pointer, then the latter references are via that
+ * scratch pointer rather than through the repointed (users) symbol.
+ */
+#define HASH_DELETE(hh,head,delptr)                                              \
+do {                                                                             \
+    unsigned _hd_bkt;                                                            \
+    struct UT_hash_handle *_hd_hh_del;                                           \
+    if ( ((delptr)->hh.prev == NULL) && ((delptr)->hh.next == NULL) )  {         \
+        uthash_free((head)->hh.tbl->buckets,                                     \
+                    (head)->hh.tbl->num_buckets*sizeof(struct UT_hash_bucket) ); \
+        HASH_BLOOM_FREE((head)->hh.tbl);                                         \
+        uthash_free((head)->hh.tbl, sizeof(UT_hash_table));                      \
+        head = NULL;                                                             \
+    } else {                                                                     \
+        _hd_hh_del = &((delptr)->hh);                                            \
+        if ((delptr) == ELMT_FROM_HH((head)->hh.tbl,(head)->hh.tbl->tail)) {     \
+            (head)->hh.tbl->tail =                                               \
+                (UT_hash_handle*)((char*)((delptr)->hh.prev) +                   \
+                (head)->hh.tbl->hho);                                            \
+        }                                                                        \
+        if ((delptr)->hh.prev) {                                                 \
+            ((UT_hash_handle*)((char*)((delptr)->hh.prev) +                      \
+                    (head)->hh.tbl->hho))->next = (delptr)->hh.next;             \
+        } else {                                                                 \
+            DECLTYPE_ASSIGN(head,(delptr)->hh.next);                             \
+        }                                                                        \
+        if (_hd_hh_del->next) {                                                  \
+            ((UT_hash_handle*)((char*)_hd_hh_del->next +                         \
+                    (head)->hh.tbl->hho))->prev =                                \
+                    _hd_hh_del->prev;                                            \
+        }                                                                        \
+        HASH_TO_BKT( _hd_hh_del->hashv, (head)->hh.tbl->num_buckets, _hd_bkt);   \
+        HASH_DEL_IN_BKT(hh,(head)->hh.tbl->buckets[_hd_bkt], _hd_hh_del);        \
+        (head)->hh.tbl->num_items--;                                             \
+    }                                                                            \
+    HASH_FSCK(hh,head);                                                          \
+} while (0)
+
+
+/* Convenience forms of HASH_FIND/HASH_ADD/HASH_DEL. Each one forwards to the
+ * generic macro with "hh" as the hash-handle argument and a fixed key length:
+ *   *_STR  key length is strlen() of the string key
+ *   *_INT  key length is sizeof(int)
+ *   *_PTR  key length is sizeof(void *)
+ * HASH_DEL forwards to HASH_DELETE. */
+#define HASH_FIND_STR(head,findstr,out)                                          \
+    HASH_FIND(hh,head,findstr,strlen(findstr),out)
+/* "add" is parenthesized before the member access so that an expression
+ * (not only a plain identifier) can be passed as the item pointer. */
+#define HASH_ADD_STR(head,strfield,add)                                          \
+    HASH_ADD(hh,head,strfield,strlen((add)->strfield),add)
+#define HASH_FIND_INT(head,findint,out)                                          \
+    HASH_FIND(hh,head,findint,sizeof(int),out)
+#define HASH_ADD_INT(head,intfield,add)                                          \
+    HASH_ADD(hh,head,intfield,sizeof(int),add)
+#define HASH_FIND_PTR(head,findptr,out)                                          \
+    HASH_FIND(hh,head,findptr,sizeof(void *),out)
+#define HASH_ADD_PTR(head,ptrfield,add)                                          \
+    HASH_ADD(hh,head,ptrfield,sizeof(void *),add)
+#define HASH_DEL(head,delptr)                                                    \
+    HASH_DELETE(hh,head,delptr)
+
+/* HASH_FSCK checks hash integrity on every add/delete when HASH_DEBUG is defined.
+ * This is for uthash developer only; it compiles away if HASH_DEBUG isn't defined.
+ *
+ * Two passes are made over the table reachable from head:
+ *  1. every bucket chain is walked through hh_next, checking each hh_prev
+ *     link, the per-bucket count, and the sum of all chains against num_items;
+ *  2. the application-order list is walked through next, checking each prev
+ *     link and the number of items against num_items.
+ * Any mismatch is reported through HASH_OOPS, which prints to stderr and exits.
+ * HASH_OOPS uses fprintf() and exit(), so <stdio.h> and <stdlib.h> must be in
+ * scope when HASH_DEBUG is defined.
+ *
+ * Counters are printed with %u (they are compared against unsigned locals) and
+ * pointers are cast to void* as required by %p.
+ */
+#ifdef HASH_DEBUG
+#define HASH_OOPS(...) do { fprintf(stderr,__VA_ARGS__); exit(-1); } while (0)
+#define HASH_FSCK(hh,head)                                                       \
+do {                                                                             \
+    unsigned _bkt_i;                                                             \
+    unsigned _count, _bkt_count;                                                 \
+    char *_prev;                                                                 \
+    struct UT_hash_handle *_thh;                                                 \
+    if (head) {                                                                  \
+        /* pass 1: walk every bucket chain */                                    \
+        _count = 0;                                                              \
+        for( _bkt_i = 0; _bkt_i < (head)->hh.tbl->num_buckets; _bkt_i++) {       \
+            _bkt_count = 0;                                                      \
+            _thh = (head)->hh.tbl->buckets[_bkt_i].hh_head;                      \
+            _prev = NULL;                                                        \
+            while (_thh) {                                                       \
+               if (_prev != (char*)(_thh->hh_prev)) {                            \
+                   HASH_OOPS("invalid hh_prev %p, actual %p\n",                  \
+                    (void*)(_thh->hh_prev), (void*)_prev );                      \
+               }                                                                 \
+               _bkt_count++;                                                     \
+               _prev = (char*)(_thh);                                            \
+               _thh = _thh->hh_next;                                             \
+            }                                                                    \
+            _count += _bkt_count;                                                \
+            if ((head)->hh.tbl->buckets[_bkt_i].count !=  _bkt_count) {          \
+               HASH_OOPS("invalid bucket count %u, actual %u\n",                 \
+                (head)->hh.tbl->buckets[_bkt_i].count, _bkt_count);              \
+            }                                                                    \
+        }                                                                        \
+        if (_count != (head)->hh.tbl->num_items) {                               \
+            HASH_OOPS("invalid hh item count %u, actual %u\n",                   \
+                (head)->hh.tbl->num_items, _count );                             \
+        }                                                                        \
+        /* traverse hh in app order; check next/prev integrity, count */         \
+        _count = 0;                                                              \
+        _prev = NULL;                                                            \
+        _thh =  &(head)->hh;                                                     \
+        while (_thh) {                                                           \
+           _count++;                                                             \
+           if (_prev !=(char*)(_thh->prev)) {                                    \
+              HASH_OOPS("invalid prev %p, actual %p\n",                          \
+                    (void*)(_thh->prev), (void*)_prev );                         \
+           }                                                                     \
+           _prev = (char*)ELMT_FROM_HH((head)->hh.tbl, _thh);                    \
+           /* next points at the element; add hho to reach its hash handle */    \
+           _thh = ( _thh->next ?  (UT_hash_handle*)((char*)(_thh->next) +        \
+                                  (head)->hh.tbl->hho) : NULL );                 \
+        }                                                                        \
+        if (_count != (head)->hh.tbl->num_items) {                               \
+            HASH_OOPS("invalid app item count %u, actual %u\n",                  \
+                (head)->hh.tbl->num_items, _count );                             \
+        }                                                                        \
+    }                                                                            \
+} while (0)
+#else
+#define HASH_FSCK(hh,head) 
+#endif
+
+/* When compiled with -DHASH_EMIT_KEYS=<fd>, HASH_EMIT_KEY writes every key,
+ * prefixed by its length (stored as an unsigned), to the file descriptor that
+ * HASH_EMIT_KEYS expands to. The resulting dump is meant for tuning the hash
+ * function. The app can #include <unistd.h> to get the prototype for write(2).
+ * When HASH_EMIT_KEYS is not defined the macro expands to nothing.
+ * NOTE(review): the results of write() are ignored and fieldlen is evaluated
+ * twice (once for the prefix, once for the payload length). */
+#ifdef HASH_EMIT_KEYS
+#define HASH_EMIT_KEY(hh,head,keyptr,fieldlen)                                   \
+do {                                                                             \
+    unsigned _klen = fieldlen;                                                   \
+    write(HASH_EMIT_KEYS, &_klen, sizeof(_klen));                                \
+    write(HASH_EMIT_KEYS, keyptr, fieldlen);                                     \
+} while (0)
+#else 
+#define HASH_EMIT_KEY(hh,head,keyptr,fieldlen)                    
+#endif
+
+/* HASH_FCN names the hash function macro used by the table. It defaults to
+ * the Jenkins hash (HASH_JEN) unless overridden at compile time by defining
+ * HASH_FUNCTION, e.g. -DHASH_FUNCTION=HASH_SAX */
+#ifdef HASH_FUNCTION 
+#define HASH_FCN HASH_FUNCTION
+#else
+#define HASH_FCN HASH_JEN
+#endif
+
+/* Bernstein's hash (the one Perl used before v5.6): starting from zero, the
+ * running value is multiplied by 33 and the next key byte is added, for each
+ * of the keylen bytes at key. The result is stored in hashv and the bucket
+ * index (hashv masked with num_bkts-1) in bkt. */
+#define HASH_BER(key,keylen,num_bkts,hashv,bkt)                                  \
+do {                                                                             \
+  unsigned _hb_left=keylen;                                                      \
+  char *_hb_cur=(char*)(key);                                                    \
+  for ((hashv) = 0; _hb_left != 0; _hb_left--) {                                 \
+    (hashv) = ((hashv) * 33) + *_hb_cur;                                         \
+    _hb_cur++;                                                                   \
+  }                                                                              \
+  bkt = (hashv) & (num_bkts-1);                                                  \
+} while (0)
+
+
+/* The SAX, FNV, OAT and JEN hash functions are macro variants of the ones
+ * described at
+ * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx */
+/* HASH_SAX: for every key byte, XOR the running value with the sum of itself
+ * shifted left by 5, itself shifted right by 2, and the byte. The result goes
+ * to hashv, and hashv masked with num_bkts-1 goes to bkt. */
+#define HASH_SAX(key,keylen,num_bkts,hashv,bkt)                                  \
+do {                                                                             \
+  char *_hs_key=(char*)(key);                                                    \
+  unsigned _sx_i=0;                                                              \
+  hashv = 0;                                                                     \
+  while (_sx_i < keylen) {                                                       \
+      hashv ^= (hashv << 5) + (hashv >> 2) + _hs_key[_sx_i];                     \
+      _sx_i++;                                                                   \
+  }                                                                              \
+  bkt = hashv & (num_bkts-1);                                                    \
+} while (0)
+
+/* HASH_FNV: starting from 2166136261, multiply the running value by 16777619
+ * and XOR in the next key byte, for each of the keylen bytes at key. The
+ * result goes to hashv, and hashv masked with num_bkts-1 goes to bkt.
+ * The macro deliberately ends without a semicolon, like the other hash
+ * macros, so that "HASH_FNV(...);" is exactly one statement and stays usable
+ * in an unbraced if/else. */
+#define HASH_FNV(key,keylen,num_bkts,hashv,bkt)                                  \
+do {                                                                             \
+  unsigned _fn_i;                                                                \
+  char *_hf_key=(char*)(key);                                                    \
+  hashv = 2166136261UL;                                                          \
+  for(_fn_i=0; _fn_i < keylen; _fn_i++)                                          \
+      hashv = (hashv * 16777619) ^ _hf_key[_fn_i];                               \
+  bkt = hashv & (num_bkts-1);                                                    \
+} while(0)
+ 
+/* HASH_OAT: one-at-a-time style hash. Every key byte is added to the running
+ * value, which is then stirred with a left shift by 10 (added) and a right
+ * shift by 6 (XORed). After the last byte three more shift steps (3, 11, 15)
+ * are applied. The result goes to hashv, and hashv masked with num_bkts-1
+ * goes to bkt. */
+#define HASH_OAT(key,keylen,num_bkts,hashv,bkt)                                  \
+do {                                                                             \
+  char *_ho_key=(char*)(key);                                                    \
+  unsigned _ho_i=0;                                                              \
+  hashv = 0;                                                                     \
+  while (_ho_i < keylen) {                                                       \
+      hashv += _ho_key[_ho_i];                                                   \
+      hashv += (hashv << 10);                                                    \
+      hashv ^= (hashv >> 6);                                                     \
+      _ho_i++;                                                                   \
+  }                                                                              \
+  /* final stirring once all bytes are consumed */                               \
+  hashv += (hashv << 3);                                                         \
+  hashv ^= (hashv >> 11);                                                        \
+  hashv += (hashv << 15);                                                        \
+  bkt = hashv & (num_bkts-1);                                                    \
+} while(0)
+
+/* HASH_JEN_MIX stirs the three values a, b and c together in place. Each
+ * line subtracts the other two values from one of them and then XORs in a
+ * shifted copy of a neighbour; three such rounds are applied with different
+ * shift amounts. All three arguments are modified, so each must be an lvalue.
+ * Used by HASH_JEN after every 12-byte chunk and once more at the end. */
+#define HASH_JEN_MIX(a,b,c)                                                      \
+do {                                                                             \
+  a -= b; a -= c; a ^= ( c >> 13 );                                              \
+  b -= c; b -= a; b ^= ( a << 8 );                                               \
+  c -= a; c -= b; c ^= ( b >> 13 );                                              \
+  a -= b; a -= c; a ^= ( c >> 12 );                                              \
+  b -= c; b -= a; b ^= ( a << 16 );                                              \
+  c -= a; c -= b; c ^= ( b >> 5 );                                               \
+  a -= b; a -= c; a ^= ( c >> 3 );                                               \
+  b -= c; b -= a; b ^= ( a << 10 );                                              \
+  c -= a; c -= b; c ^= ( b >> 15 );                                              \
+} while (0)
+
+/* HASH_JEN: the hash that HASH_FCN selects when HASH_FUNCTION is not defined.
+ *   key, keylen  bytes to hash and their count
+ *   num_bkts     number of buckets; only used as the mask (num_bkts-1)
+ *   hashv        lvalue receiving the hash value
+ *   bkt          lvalue receiving hashv & (num_bkts-1)
+ * The key is consumed 12 bytes at a time: three words are assembled from four
+ * bytes each (lowest index in the low bits), added to _hj_i, _hj_j and hashv,
+ * and stirred with HASH_JEN_MIX. The total key length is then added to hashv
+ * and the remaining 0..11 bytes are folded in by a switch whose cases fall
+ * through on purpose, followed by a final HASH_JEN_MIX.
+ * NOTE(review): bytes are read through a plain char pointer, so where char is
+ * signed, bytes >= 0x80 contribute sign-extended values. */
+#define HASH_JEN(key,keylen,num_bkts,hashv,bkt)                                  \
+do {                                                                             \
+  unsigned _hj_i,_hj_j,_hj_k;                                                    \
+  char *_hj_key=(char*)(key);                                                    \
+  hashv = 0xfeedbeef;                                                            \
+  _hj_i = _hj_j = 0x9e3779b9;                                                    \
+  _hj_k = keylen;                                                                \
+  /* full 12-byte chunks; _hj_k counts the bytes still to be consumed */         \
+  while (_hj_k >= 12) {                                                          \
+    _hj_i +=    (_hj_key[0] + ( (unsigned)_hj_key[1] << 8 )                      \
+        + ( (unsigned)_hj_key[2] << 16 )                                         \
+        + ( (unsigned)_hj_key[3] << 24 ) );                                      \
+    _hj_j +=    (_hj_key[4] + ( (unsigned)_hj_key[5] << 8 )                      \
+        + ( (unsigned)_hj_key[6] << 16 )                                         \
+        + ( (unsigned)_hj_key[7] << 24 ) );                                      \
+    hashv += (_hj_key[8] + ( (unsigned)_hj_key[9] << 8 )                         \
+        + ( (unsigned)_hj_key[10] << 16 )                                        \
+        + ( (unsigned)_hj_key[11] << 24 ) );                                     \
+                                                                                 \
+     HASH_JEN_MIX(_hj_i, _hj_j, hashv);                                          \
+                                                                                 \
+     _hj_key += 12;                                                              \
+     _hj_k -= 12;                                                                \
+  }                                                                              \
+  hashv += keylen;                                                               \
+  /* trailing bytes: every case intentionally falls through to the next */       \
+  switch ( _hj_k ) {                                                             \
+     case 11: hashv += ( (unsigned)_hj_key[10] << 24 );                          \
+     case 10: hashv += ( (unsigned)_hj_key[9] << 16 );                           \
+     case 9:  hashv += ( (unsigned)_hj_key[8] << 8 );                            \
+     case 8:  _hj_j += ( (unsigned)_hj_key[7] << 24 );                           \
+     case 7:  _hj_j += ( (unsigned)_hj_key[6] << 16 );                           \
+     case 6:  _hj_j += ( (unsigned)_hj_key[5] << 8 );                            \
+     case 5:  _hj_j += _hj_key[4];                                               \
+     case 4:  _hj_i += ( (unsigned)_hj_key[3] << 24 );                           \
+     case 3:  _hj_i += ( (unsigned)_hj_key[2] << 16 );                           \
+     case 2:  _hj_i += ( (unsigned)_hj_key[1] << 8 );                            \
+     case 1:  _hj_i += _hj_key[0];                                               \
+  }                                                                              \
+  HASH_JEN_MIX(_hj_i, _hj_j, hashv);                                             \
+  bkt = hashv & (num_bkts-1);                                                    \
+} while(0)
+
+/* The Paul Hsieh hash function */
+/* get16bits(d) yields the two bytes at d as one 16-bit quantity. On the
+ * compilers/targets listed in the first #if it is a direct uint16_t load
+ * through a cast pointer; everywhere else it is assembled from individual
+ * bytes with d[0] in the low byte and d[1] in the high byte. Any earlier
+ * definition of get16bits is discarded first. */
+#undef get16bits
+#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__)             \
+  || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__)
+#define get16bits(d) (*((const uint16_t *) (d)))
+#endif
+
+#if !defined (get16bits)
+#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)             \
+                       +(uint32_t)(((const uint8_t *)(d))[0]) )
+#endif
+/* HASH_SFH: Paul Hsieh's hash.
+ *   key, keylen  bytes to hash and their count
+ *   num_bkts     number of buckets; only used as the mask (num_bkts-1)
+ *   hashv        lvalue receiving the hash value
+ *   bkt          lvalue receiving hashv & (num_bkts-1)
+ * The key is consumed four bytes per iteration as two 16-bit halves read with
+ * get16bits; the 1..3 leftover bytes are handled by the switch, and a fixed
+ * sequence of shift/XOR/add steps finishes the value.
+ * The macro deliberately ends without a semicolon, like the other hash
+ * macros, so that "HASH_SFH(...);" is exactly one statement and stays usable
+ * in an unbraced if/else. */
+#define HASH_SFH(key,keylen,num_bkts,hashv,bkt)                                  \
+do {                                                                             \
+  char *_sfh_key=(char*)(key);                                                   \
+  uint32_t _sfh_tmp, _sfh_len = keylen;                                          \
+                                                                                 \
+  /* bytes left over after the 4-byte main loop */                               \
+  int _sfh_rem = _sfh_len & 3;                                                   \
+  _sfh_len >>= 2;                                                                \
+  hashv = 0xcafebabe;                                                            \
+                                                                                 \
+  /* Main loop */                                                                \
+  for (;_sfh_len > 0; _sfh_len--) {                                              \
+    hashv    += get16bits (_sfh_key);                                            \
+    _sfh_tmp       = (get16bits (_sfh_key+2) << 11) ^ hashv;                     \
+    hashv     = (hashv << 16) ^ _sfh_tmp;                                        \
+    _sfh_key += 2*sizeof (uint16_t);                                             \
+    hashv    += hashv >> 11;                                                     \
+  }                                                                              \
+                                                                                 \
+  /* Handle end cases */                                                         \
+  switch (_sfh_rem) {                                                            \
+    case 3: hashv += get16bits (_sfh_key);                                       \
+            hashv ^= hashv << 16;                                                \
+            hashv ^= _sfh_key[sizeof (uint16_t)] << 18;                          \
+            hashv += hashv >> 11;                                                \
+            break;                                                               \
+    case 2: hashv += get16bits (_sfh_key);                                       \
+            hashv ^= hashv << 11;                                                \
+            hashv += hashv >> 17;                                                \
+            break;                                                               \
+    case 1: hashv += *_sfh_key;                                                  \
+            hashv ^= hashv << 10;                                                \
+            hashv += hashv >> 1;                                                 \
+  }                                                                              \
+                                                                                 \
+    /* Force "avalanching" of final 127 bits */                                  \
+    hashv ^= hashv << 3;                                                         \
+    hashv += hashv >> 5;                                                         \
+    hashv ^= hashv << 4;                                                         \
+    hashv += hashv >> 17;                                                        \
+    hashv ^= hashv << 25;                                                        \
+    hashv += hashv >> 6;                                                         \
+    bkt = hashv & (num_bkts-1);                                                  \
+} while(0)
+
+#ifdef HASH_USING_NO_STRICT_ALIASING
+/* The MurmurHash exploits some CPU's (e.g. x86) tolerance for unaligned reads.
+ * For other types of CPU's (e.g. Sparc) an unaligned read causes a bus error.
+ * So MurmurHash comes in two versions, the faster unaligned one and the slower
+ * aligned one. We only use the faster one on CPU's where we know it's safe. 
+ *
+ * Note the preprocessor built-in defines can be emitted using:
+ *
+ *   gcc -m64 -dM -E - < /dev/null                  (on gcc)
+ *   cc -## a.c (where a.c is a simple test file)   (Sun Studio)
+ */
+/* HASH_MUR resolves to the unaligned-read variant on i386/x86_64 and to the
+ * alignment-aware variant on every other target. */
+#if (defined(__i386__) || defined(__x86_64__)) 
+#define HASH_MUR HASH_MUR_UNALIGNED
+#else
+#define HASH_MUR HASH_MUR_ALIGNED
+#endif
+
+/* Appleby's MurmurHash fast version for unaligned-tolerant archs like i386.
+ *   key, keylen  bytes to hash and their count
+ *   num_bkts     number of buckets; only used as the mask (num_bkts-1)
+ *   hashv        lvalue receiving the hash value
+ *   bkt          lvalue receiving hashv & (num_bkts-1)
+ * The key is read four bytes at a time straight through a uint32_t pointer,
+ * whatever its alignment; the 1..3 trailing bytes are folded in by a switch
+ * whose cases fall through on purpose.
+ * keylen is parenthesized in the seed expression, exactly as in
+ * HASH_MUR_ALIGNED, so both variants treat an expression argument the same
+ * way. */
+#define HASH_MUR_UNALIGNED(key,keylen,num_bkts,hashv,bkt)                        \
+do {                                                                             \
+  const unsigned int _mur_m = 0x5bd1e995;                                        \
+  const int _mur_r = 24;                                                         \
+  hashv = 0xcafebabe ^ (keylen);                                                 \
+  char *_mur_key = (char *)(key);                                                \
+  uint32_t _mur_tmp, _mur_len = (keylen);                                        \
+                                                                                 \
+  for (;_mur_len >= 4; _mur_len-=4) {                                            \
+    _mur_tmp = *(uint32_t *)_mur_key;                                            \
+    _mur_tmp *= _mur_m;                                                          \
+    _mur_tmp ^= _mur_tmp >> _mur_r;                                              \
+    _mur_tmp *= _mur_m;                                                          \
+    hashv *= _mur_m;                                                             \
+    hashv ^= _mur_tmp;                                                           \
+    _mur_key += 4;                                                               \
+  }                                                                              \
+                                                                                 \
+  /* trailing bytes: cases intentionally fall through */                         \
+  switch(_mur_len)                                                               \
+  {                                                                              \
+    case 3: hashv ^= _mur_key[2] << 16;                                          \
+    case 2: hashv ^= _mur_key[1] << 8;                                           \
+    case 1: hashv ^= _mur_key[0];                                                \
+            hashv *= _mur_m;                                                     \
+  }                                                                              \
+                                                                                 \
+  hashv ^= hashv >> 13;                                                          \
+  hashv *= _mur_m;                                                               \
+  hashv ^= hashv >> 15;                                                          \
+                                                                                 \
+  bkt = hashv & (num_bkts-1);                                                    \
+} while(0)
+
+/* Appleby's MurmurHash version for alignment-sensitive archs like Sparc.
+ *   key, keylen  bytes to hash and their count
+ *   num_bkts     number of buckets; only used as the mask (num_bkts-1)
+ *   hashv        lvalue receiving the hash value
+ *   bkt          lvalue receiving hashv & (num_bkts-1)
+ *
+ * When the key address is not a multiple of four (and the key has at least
+ * four bytes), the leading 4-align bytes are pre-loaded into _mur_t so that
+ * every later word read happens at an aligned address; each mixed word is then
+ * stitched together from the previous and the current aligned word. Otherwise
+ * the key is read word by word directly. All switch cases fall through on
+ * purpose.
+ *
+ * Two defects of the previous revision are fixed here:
+ *  - after the leftover "align" bytes were mixed in, the word _mur_k was
+ *    incremented instead of the key pointer, so the tail switch re-read bytes
+ *    that had already been consumed and never saw the real tail bytes;
+ *  - the alignment was computed by casting the pointer to int, which truncates
+ *    the address where pointers are wider than int; it now goes through
+ *    size_t. */
+#define HASH_MUR_ALIGNED(key,keylen,num_bkts,hashv,bkt)                          \
+do {                                                                             \
+  const unsigned int _mur_m = 0x5bd1e995;                                        \
+  const int _mur_r = 24;                                                         \
+  hashv = 0xcafebabe ^ (keylen);                                                 \
+  char *_mur_key = (char *)(key);                                                \
+  uint32_t _mur_len = keylen;                                                    \
+  int _mur_align = (int)((size_t)_mur_key & 3);                                  \
+                                                                                 \
+  if (_mur_align && (_mur_len >= 4)) {                                           \
+    /* pre-load the bytes in front of the first aligned word */                  \
+    unsigned _mur_t = 0, _mur_d = 0;                                             \
+    switch(_mur_align) {                                                         \
+      case 1: _mur_t |= _mur_key[2] << 16;                                       \
+      case 2: _mur_t |= _mur_key[1] << 8;                                        \
+      case 3: _mur_t |= _mur_key[0];                                             \
+    }                                                                            \
+    _mur_t <<= (8 * _mur_align);                                                 \
+    _mur_key += 4-_mur_align;                                                    \
+    _mur_len -= 4-_mur_align;                                                    \
+    int _mur_sl = 8 * (4-_mur_align);                                            \
+    int _mur_sr = 8 * _mur_align;                                                \
+                                                                                 \
+    /* mix: combine the carried-over bytes with each aligned word */             \
+    for (;_mur_len >= 4; _mur_len-=4) {                                          \
+      _mur_d = *(unsigned *)_mur_key;                                            \
+      _mur_t = (_mur_t >> _mur_sr) | (_mur_d << _mur_sl);                        \
+      unsigned _mur_k = _mur_t;                                                  \
+      _mur_k *= _mur_m;                                                          \
+      _mur_k ^= _mur_k >> _mur_r;                                                \
+      _mur_k *= _mur_m;                                                          \
+      hashv *= _mur_m;                                                           \
+      hashv ^= _mur_k;                                                           \
+      _mur_t = _mur_d;                                                           \
+      _mur_key += 4;                                                             \
+    }                                                                            \
+    /* handle the bytes still held in _mur_t plus the key tail */                \
+    _mur_d = 0;                                                                  \
+    if(_mur_len >= _mur_align) {                                                 \
+      switch(_mur_align) {                                                       \
+        case 3: _mur_d |= _mur_key[2] << 16;                                     \
+        case 2: _mur_d |= _mur_key[1] << 8;                                      \
+        case 1: _mur_d |= _mur_key[0];                                           \
+      }                                                                          \
+      unsigned _mur_k = (_mur_t >> _mur_sr) | (_mur_d << _mur_sl);               \
+      _mur_k *= _mur_m;                                                          \
+      _mur_k ^= _mur_k >> _mur_r;                                                \
+      _mur_k *= _mur_m;                                                          \
+      hashv *= _mur_m;                                                           \
+      hashv ^= _mur_k;                                                           \
+      /* skip the bytes just consumed so the tail switch sees the rest */        \
+      _mur_key += _mur_align;                                                    \
+      _mur_len -= _mur_align;                                                    \
+                                                                                 \
+      switch(_mur_len)                                                           \
+      {                                                                          \
+        case 3: hashv ^= _mur_key[2] << 16;                                      \
+        case 2: hashv ^= _mur_key[1] << 8;                                       \
+        case 1: hashv ^= _mur_key[0];                                            \
+                hashv *= _mur_m;                                                 \
+      }                                                                          \
+    } else {                                                                     \
+      switch(_mur_len)                                                           \
+      {                                                                          \
+        case 3: _mur_d ^= _mur_key[2] << 16;                                     \
+        case 2: _mur_d ^= _mur_key[1] << 8;                                      \
+        case 1: _mur_d ^= _mur_key[0];                                           \
+        case 0: hashv ^= (_mur_t >> _mur_sr) | (_mur_d << _mur_sl);              \
+        hashv *= _mur_m;                                                         \
+      }                                                                          \
+    }                                                                            \
+                                                                                 \
+    hashv ^= hashv >> 13;                                                        \
+    hashv *= _mur_m;                                                             \
+    hashv ^= hashv >> 15;                                                        \
+  } else {                                                                       \
+    /* key already aligned (or shorter than four bytes): read words directly */  \
+    for (;_mur_len >= 4; _mur_len-=4) {                                          \
+      unsigned _mur_k = *(unsigned*)_mur_key;                                    \
+      _mur_k *= _mur_m;                                                          \
+      _mur_k ^= _mur_k >> _mur_r;                                                \
+      _mur_k *= _mur_m;                                                          \
+      hashv *= _mur_m;                                                           \
+      hashv ^= _mur_k;                                                           \
+      _mur_key += 4;                                                             \
+    }                                                                            \
+    switch(_mur_len)                                                             \
+    {                                                                            \
+      case 3: hashv ^= _mur_key[2] << 16;                                        \
+      case 2: hashv ^= _mur_key[1] << 8;                                         \
+      case 1: hashv ^= _mur_key[0];                                              \
+      hashv *= _mur_m;                                                           \
+    }                                                                            \
+                                                                                 \
+    hashv ^= hashv >> 13;                                                        \
+    hashv *= _mur_m;                                                             \
+    hashv ^= hashv >> 15;                                                        \
+  }                                                                              \
+  bkt = hashv & (num_bkts-1);                                                    \
+} while(0)
+#endif  /* HASH_USING_NO_STRICT_ALIASING */
+
+/* key comparison function; return 0 if keys equal.
+ * Compares the first len bytes of a and b with memcmp(). */
+#define HASH_KEYCMP(a,b,len) memcmp(a,b,len) 
+
+/* Search one bucket for a key.
+ *   tbl        table passed on to ELMT_FROM_HH to turn a handle into an element
+ *   hh         name of the hash-handle member inside the element
+ *   head       the bucket whose chain (starting at hh_head) is scanned
+ *   keyptr     key to look for
+ *   keylen_in  length of that key
+ *   out        receives the matching element, or NULL when none matches
+ * An element matches when its stored key length equals keylen_in and
+ * HASH_KEYCMP reports its key equal to keyptr. */
+#define HASH_FIND_IN_BKT(tbl,hh,head,keyptr,keylen_in,out)                       \
+do {                                                                             \
+ if (head.hh_head) {                                                             \
+    DECLTYPE_ASSIGN(out,ELMT_FROM_HH(tbl,head.hh_head));                         \
+ } else {                                                                        \
+    out=NULL;                                                                    \
+ }                                                                               \
+ while (out) {                                                                   \
+    /* the key bytes are compared only when the lengths agree */                 \
+    if ((out->hh.keylen == keylen_in) &&                                         \
+        ((HASH_KEYCMP(out->hh.key,keyptr,keylen_in)) == 0)) {                    \
+        break;                                                                   \
+    }                                                                            \
+    if (out->hh.hh_next) {                                                       \
+        DECLTYPE_ASSIGN(out,ELMT_FROM_HH(tbl,out->hh.hh_next));                  \
+    } else {                                                                     \
+        out = NULL;                                                              \
+    }                                                                            \
+ }                                                                               \
+} while(0)
+
+/* Add an item to a bucket.
+ *   head   the bucket (a structure lvalue, not a pointer)
+ *   addhh  pointer to the hash handle of the item being added
+ * The handle is pushed at the front of the bucket chain and the bucket's
+ * count is incremented. If the count then reaches
+ * (expand_mult+1) * HASH_BKT_CAPACITY_THRESH and the table's noexpand flag
+ * is not 1, HASH_EXPAND_BUCKETS is run on the handle's table.
+ * Both arguments are parenthesized at every use so that expressions can be
+ * passed for them. */
+#define HASH_ADD_TO_BKT(head,addhh)                                              \
+do {                                                                             \
+ (head).count++;                                                                 \
+ (addhh)->hh_next = (head).hh_head;                                              \
+ (addhh)->hh_prev = NULL;                                                        \
+ if ((head).hh_head) { (head).hh_head->hh_prev = (addhh); }                      \
+ (head).hh_head = (addhh);                                                       \
+ if ((head).count >= (((head).expand_mult+1) * HASH_BKT_CAPACITY_THRESH)         \
+     && (addhh)->tbl->noexpand != 1) {                                           \
+       HASH_EXPAND_BUCKETS((addhh)->tbl);                                        \
+ }                                                                               \
+} while(0)
+
+/* Remove the hash handle `hh_del` from the bucket `head`: decrement the
+ * bucket's item count, move the chain head forward when `hh_del` was the
+ * first handle, and unlink `hh_del` from its chain neighbours.
+ * The hh_prev/hh_next fields of `hh_del` itself are left untouched.
+ * The `hh` argument is not used by the expansion.
+ * The body is wrapped in do { } while(0) so that the macro expands to a
+ * single statement (safe under an unbraced if/else); it must be invoked
+ * as a statement followed by a semicolon. */
+#define HASH_DEL_IN_BKT(hh,head,hh_del)                                          \
+do {                                                                             \
+    (head).count--;                                                              \
+    if ((head).hh_head == (hh_del)) {                                            \
+      (head).hh_head = (hh_del)->hh_next;                                        \
+    }                                                                            \
+    if ((hh_del)->hh_prev) {                                                     \
+        (hh_del)->hh_prev->hh_next = (hh_del)->hh_next;                          \
+    }                                                                            \
+    if ((hh_del)->hh_next) {                                                     \
+        (hh_del)->hh_next->hh_prev = (hh_del)->hh_prev;                          \
+    }                                                                            \
+} while(0)
+
+/* Bucket expansion has the effect of doubling the number of buckets
+ * and redistributing the items into the new buckets. Ideally the
+ * items will distribute more or less evenly into the new buckets
+ * (the extent to which this is true is a measure of the quality of
+ * the hash function as it applies to the key domain). 
+ * 
+ * With the items distributed into more buckets, the chain length
+ * (item count) in each bucket is reduced. Thus by expanding buckets
+ * the hash keeps a bound on the chain length. This bounded chain 
+ * length is the essence of how a hash provides constant time lookup.
+ * 
+ * The calculation of tbl->ideal_chain_maxlen below deserves some
+ * explanation. First, keep in mind that we're calculating the ideal
+ * maximum chain length based on the *new* (doubled) bucket count.
+ * In fractions this is just n/b (n=number of items,b=new num buckets).
+ * Since the ideal chain length is an integer, we want to calculate 
+ * ceil(n/b). We don't depend on floating point arithmetic in this
+ * hash, so to calculate ceil(n/b) with integers we could write
+ * 
+ *      ceil(n/b) = (n/b) + ((n%b)?1:0)
+ * 
+ * and in fact a previous version of this hash did just that.
+ * But now we have improved things a bit by recognizing that b is
+ * always a power of two. We keep its base 2 log handy (call it lb),
+ * so now we can write this with a bit shift and logical AND:
+ * 
+ *      ceil(n/b) = (n>>lb) + ( (n & (b-1)) ? 1:0)
+ * 
+ * Step by step, the macro allocates a zeroed array of 2*num_buckets
+ * buckets (calling uthash_fatal if the allocation fails), walks every
+ * chain of the old array, recomputes each handle's bucket with
+ * HASH_TO_BKT from its stored hashv, and pushes the handle onto the
+ * front of its new chain. The old array is then released and
+ * num_buckets / log2_num_buckets are updated.
+ * 
+ * The expansion heuristics are refreshed along the way: each time an
+ * item lands beyond ideal_chain_maxlen in its new bucket it is counted
+ * in nonideal_items and that bucket's expand_mult is set to
+ * count / ideal_chain_maxlen. ineff_expands is incremented when more
+ * than half of the items are nonideal and reset to 0 otherwise; once it
+ * exceeds 1, noexpand is set to 1 and uthash_noexpand_fyi is invoked.
+ * uthash_expand_fyi is invoked at the end of every expansion.
+ */
+#define HASH_EXPAND_BUCKETS(tbl)                                                 \
+do {                                                                             \
+    unsigned _he_bkt;                                                            \
+    unsigned _he_bkt_i;                                                          \
+    struct UT_hash_handle *_he_thh, *_he_hh_nxt;                                 \
+    UT_hash_bucket *_he_new_buckets, *_he_newbkt;                                \
+    _he_new_buckets = (UT_hash_bucket*)uthash_malloc(                            \
+             2 * tbl->num_buckets * sizeof(struct UT_hash_bucket));              \
+    if (!_he_new_buckets) { uthash_fatal( "out of memory"); }                    \
+    memset(_he_new_buckets, 0,                                                   \
+            2 * tbl->num_buckets * sizeof(struct UT_hash_bucket));               \
+    tbl->ideal_chain_maxlen =                                                    \
+       (tbl->num_items >> (tbl->log2_num_buckets+1)) +                           \
+       ((tbl->num_items & ((tbl->num_buckets*2)-1)) ? 1 : 0);                    \
+    tbl->nonideal_items = 0;                                                     \
+    for(_he_bkt_i = 0; _he_bkt_i < tbl->num_buckets; _he_bkt_i++)                \
+    {                                                                            \
+        _he_thh = tbl->buckets[ _he_bkt_i ].hh_head;                             \
+        while (_he_thh) {                                                        \
+           _he_hh_nxt = _he_thh->hh_next;                                        \
+           HASH_TO_BKT( _he_thh->hashv, tbl->num_buckets*2, _he_bkt);            \
+           _he_newbkt = &(_he_new_buckets[ _he_bkt ]);                           \
+           if (++(_he_newbkt->count) > tbl->ideal_chain_maxlen) {                \
+             tbl->nonideal_items++;                                              \
+             _he_newbkt->expand_mult = _he_newbkt->count /                       \
+                                        tbl->ideal_chain_maxlen;                 \
+           }                                                                     \
+           _he_thh->hh_prev = NULL;                                              \
+           _he_thh->hh_next = _he_newbkt->hh_head;                               \
+           if (_he_newbkt->hh_head) _he_newbkt->hh_head->hh_prev =               \
+                _he_thh;                                                         \
+           _he_newbkt->hh_head = _he_thh;                                        \
+           _he_thh = _he_hh_nxt;                                                 \
+        }                                                                        \
+    }                                                                            \
+    uthash_free( tbl->buckets, tbl->num_buckets*sizeof(struct UT_hash_bucket) ); \
+    tbl->num_buckets *= 2;                                                       \
+    tbl->log2_num_buckets++;                                                     \
+    tbl->buckets = _he_new_buckets;                                              \
+    tbl->ineff_expands = (tbl->nonideal_items > (tbl->num_items >> 1)) ?         \
+        (tbl->ineff_expands+1) : 0;                                              \
+    if (tbl->ineff_expands > 1) {                                                \
+        tbl->noexpand=1;                                                         \
+        uthash_noexpand_fyi(tbl);                                                \
+    }                                                                            \
+    uthash_expand_fyi(tbl);                                                      \
+} while(0)
+
+
+/* This is an adaptation of Simon Tatham's O(n log(n)) mergesort.
+ * It reorders the application-order list, i.e. the prev/next links stored
+ * in the hash handles; the bucket chains are not touched. cmpfcn is called
+ * with two element pointers and the first one is emitted when the result
+ * is <= 0. Each pass merges runs of _hs_insize elements and the run size
+ * doubles until a pass performs at most one merge; the table's tail and
+ * the caller's head are then updated to the new last and first elements
+ * and HASH_FSCK is invoked. Nothing is done when head is NULL. */
+/* Note that HASH_SORT assumes the hash handle name to be hh. 
+ * HASH_SRT was added to allow the hash handle name to be passed in. */
+#define HASH_SORT(head,cmpfcn) HASH_SRT(hh,head,cmpfcn)
+#define HASH_SRT(hh,head,cmpfcn)                                                 \
+do {                                                                             \
+  unsigned _hs_i;                                                                \
+  unsigned _hs_looping,_hs_nmerges,_hs_insize,_hs_psize,_hs_qsize;               \
+  struct UT_hash_handle *_hs_p, *_hs_q, *_hs_e, *_hs_list, *_hs_tail;            \
+  if (head) {                                                                    \
+      _hs_insize = 1;                                                            \
+      _hs_looping = 1;                                                           \
+      _hs_list = &((head)->hh);                                                  \
+      while (_hs_looping) {                                                      \
+          _hs_p = _hs_list;                                                      \
+          _hs_list = NULL;                                                       \
+          _hs_tail = NULL;                                                       \
+          _hs_nmerges = 0;                                                       \
+          while (_hs_p) {                                                        \
+              _hs_nmerges++;                                                     \
+              _hs_q = _hs_p;                                                     \
+              _hs_psize = 0;                                                     \
+              for ( _hs_i = 0; _hs_i  < _hs_insize; _hs_i++ ) {                  \
+                  _hs_psize++;                                                   \
+                  _hs_q = (UT_hash_handle*)((_hs_q->next) ?                      \
+                          ((void*)((char*)(_hs_q->next) +                        \
+                          (head)->hh.tbl->hho)) : NULL);                         \
+                  if (! (_hs_q) ) break;                                         \
+              }                                                                  \
+              _hs_qsize = _hs_insize;                                            \
+              while ((_hs_psize > 0) || ((_hs_qsize > 0) && _hs_q )) {           \
+                  if (_hs_psize == 0) {                                          \
+                      _hs_e = _hs_q;                                             \
+                      _hs_q = (UT_hash_handle*)((_hs_q->next) ?                  \
+                              ((void*)((char*)(_hs_q->next) +                    \
+                              (head)->hh.tbl->hho)) : NULL);                     \
+                      _hs_qsize--;                                               \
+                  } else if ( (_hs_qsize == 0) || !(_hs_q) ) {                   \
+                      _hs_e = _hs_p;                                             \
+                      _hs_p = (UT_hash_handle*)((_hs_p->next) ?                  \
+                              ((void*)((char*)(_hs_p->next) +                    \
+                              (head)->hh.tbl->hho)) : NULL);                     \
+                      _hs_psize--;                                               \
+                  } else if ((                                                   \
+                      cmpfcn(DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl,_hs_p)), \
+                             DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl,_hs_q))) \
+                             ) <= 0) {                                           \
+                      _hs_e = _hs_p;                                             \
+                      _hs_p = (UT_hash_handle*)((_hs_p->next) ?                  \
+                              ((void*)((char*)(_hs_p->next) +                    \
+                              (head)->hh.tbl->hho)) : NULL);                     \
+                      _hs_psize--;                                               \
+                  } else {                                                       \
+                      _hs_e = _hs_q;                                             \
+                      _hs_q = (UT_hash_handle*)((_hs_q->next) ?                  \
+                              ((void*)((char*)(_hs_q->next) +                    \
+                              (head)->hh.tbl->hho)) : NULL);                     \
+                      _hs_qsize--;                                               \
+                  }                                                              \
+                  if ( _hs_tail ) {                                              \
+                      _hs_tail->next = ((_hs_e) ?                                \
+                            ELMT_FROM_HH((head)->hh.tbl,_hs_e) : NULL);          \
+                  } else {                                                       \
+                      _hs_list = _hs_e;                                          \
+                  }                                                              \
+                  _hs_e->prev = ((_hs_tail) ?                                    \
+                     ELMT_FROM_HH((head)->hh.tbl,_hs_tail) : NULL);              \
+                  _hs_tail = _hs_e;                                              \
+              }                                                                  \
+              _hs_p = _hs_q;                                                     \
+          }                                                                      \
+          _hs_tail->next = NULL;                                                 \
+          if ( _hs_nmerges <= 1 ) {                                              \
+              _hs_looping=0;                                                     \
+              (head)->hh.tbl->tail = _hs_tail;                                   \
+              DECLTYPE_ASSIGN(head,ELMT_FROM_HH((head)->hh.tbl, _hs_list));      \
+          }                                                                      \
+          _hs_insize *= 2;                                                       \
+      }                                                                          \
+      HASH_FSCK(hh,head);                                                        \
+ }                                                                               \
+} while (0)
+
+/* This function selects items from one hash into another hash.
+ * The end result is that the selected items have dual presence
+ * in both hashes. There is no copy of the items made; rather
+ * they are added into the new hash through a secondary
+ * hash handle that must be present in the structure.
+ *
+ * The source hash is walked bucket by bucket. Each element is passed
+ * (as a void*) to cond; when cond yields non-zero, the element's hh_dst
+ * handle receives the key, keylen and hashv of its hh_src handle, is
+ * appended to the destination's app-order list and is inserted into the
+ * destination bucket chosen by HASH_TO_BKT. If dst is NULL when an
+ * element is selected, that element becomes dst and HASH_MAKE_TABLE
+ * creates the destination table. HASH_FSCK is run on dst at the end.
+ * Nothing is selected when src is NULL. */
+#define HASH_SELECT(hh_dst, dst, hh_src, src, cond)                              \
+do {                                                                             \
+  unsigned _src_bkt, _dst_bkt;                                                   \
+  void *_last_elt=NULL, *_elt;                                                   \
+  UT_hash_handle *_src_hh, *_dst_hh, *_last_elt_hh=NULL;                         \
+  ptrdiff_t _dst_hho = ((char*)(&(dst)->hh_dst) - (char*)(dst));                 \
+  if (src) {                                                                     \
+    for(_src_bkt=0; _src_bkt < (src)->hh_src.tbl->num_buckets; _src_bkt++) {     \
+      for(_src_hh = (src)->hh_src.tbl->buckets[_src_bkt].hh_head;                \
+          _src_hh;                                                               \
+          _src_hh = _src_hh->hh_next) {                                          \
+          _elt = ELMT_FROM_HH((src)->hh_src.tbl, _src_hh);                       \
+          if (cond(_elt)) {                                                      \
+            _dst_hh = (UT_hash_handle*)(((char*)_elt) + _dst_hho);               \
+            _dst_hh->key = _src_hh->key;                                         \
+            _dst_hh->keylen = _src_hh->keylen;                                   \
+            _dst_hh->hashv = _src_hh->hashv;                                     \
+            _dst_hh->prev = _last_elt;                                           \
+            _dst_hh->next = NULL;                                                \
+            if (_last_elt_hh) { _last_elt_hh->next = _elt; }                     \
+            if (!dst) {                                                          \
+              DECLTYPE_ASSIGN(dst,_elt);                                         \
+              HASH_MAKE_TABLE(hh_dst,dst);                                       \
+            } else {                                                             \
+              _dst_hh->tbl = (dst)->hh_dst.tbl;                                  \
+            }                                                                    \
+            HASH_TO_BKT(_dst_hh->hashv, _dst_hh->tbl->num_buckets, _dst_bkt);    \
+            HASH_ADD_TO_BKT(_dst_hh->tbl->buckets[_dst_bkt],_dst_hh);            \
+            (dst)->hh_dst.tbl->num_items++;                                      \
+            _last_elt = _elt;                                                    \
+            _last_elt_hh = _dst_hh;                                              \
+          }                                                                      \
+      }                                                                          \
+    }                                                                            \
+  }                                                                              \
+  HASH_FSCK(hh_dst,dst);                                                         \
+} while (0)
+
+/* Release the bookkeeping of a hash: free the bucket array and the table
+ * structure reached through head's hash handle, then set head to NULL.
+ * The elements themselves are neither freed nor modified. Nothing is done
+ * when head is already NULL.
+ * NOTE(review): when HASH_BLOOM is defined the table also holds bloom_bv;
+ * nothing in this macro releases it -- confirm whether that is intended. */
+#define HASH_CLEAR(hh,head)                                                      \
+do {                                                                             \
+  if (head) {                                                                    \
+    uthash_free((head)->hh.tbl->buckets,                                         \
+                (head)->hh.tbl->num_buckets*sizeof(struct UT_hash_bucket));      \
+    uthash_free((head)->hh.tbl, sizeof(UT_hash_table));                          \
+    (head)=NULL;                                                                 \
+  }                                                                              \
+} while(0)
+
+/* Loop header iterating over the elements of a hash in app order (following
+ * the hash handle's next pointer). el is the current element; tmp is loaded
+ * with el's successor before the loop body runs and el is advanced from tmp
+ * afterwards, so the body never needs el's links to continue the iteration
+ * (presumably so that el may be removed inside the body -- confirm).
+ * The NO_DECLTYPE variant stores into tmp through a char** cast instead of
+ * casting the value with DECLTYPE(el). */
+#ifdef NO_DECLTYPE
+#define HASH_ITER(hh,head,el,tmp)                                                \
+for((el)=(head), (*(char**)(&(tmp)))=(char*)((head)?(head)->hh.next:NULL);       \
+  el; (el)=(tmp),(*(char**)(&(tmp)))=(char*)((tmp)?(tmp)->hh.next:NULL)) 
+#else
+#define HASH_ITER(hh,head,el,tmp)                                                \
+for((el)=(head),(tmp)=DECLTYPE(el)((head)?(head)->hh.next:NULL);                 \
+  el; (el)=(tmp),(tmp)=DECLTYPE(el)((tmp)?(tmp)->hh.next:NULL))
+#endif
+
+/* Obtain a count of items in the hash: the table's num_items field, or 0
+ * when head is NULL. HASH_COUNT assumes the hash handle is named hh;
+ * HASH_CNT takes the hash handle name as its first argument. */
+#define HASH_COUNT(head) HASH_CNT(hh,head) 
+#define HASH_CNT(hh,head) ((head)?((head)->hh.tbl->num_items):0)
+
+typedef struct UT_hash_bucket {
+   struct UT_hash_handle *hh_head; /* first hash handle of this bucket's chain */
+   unsigned count;                 /* number of hash handles in the chain      */
+
+   /* expand_mult is normally set to 0. In this situation, the max chain length
+    * threshold is enforced at its default value, HASH_BKT_CAPACITY_THRESH. (If
+    * the bucket's chain exceeds this length, bucket expansion is triggered).
+    * However, setting expand_mult to a non-zero value delays bucket expansion
+    * (that would be triggered by additions to this particular bucket)
+    * until its chain length reaches a *multiple* of HASH_BKT_CAPACITY_THRESH.
+    * (The multiplier is simply expand_mult+1). The whole idea of this
+    * multiplier is to reduce bucket expansions, since they are expensive, in
+    * situations where we know that a particular bucket tends to be overused.
+    * It is better to let its chain length grow to a longer yet-still-bounded
+    * value, than to do an O(n) bucket expansion too often.
+    */
+   unsigned expand_mult;
+
+} UT_hash_bucket;
+
+/* random signature used only to find hash tables in external analysis */
+#define HASH_SIGNATURE 0xa0111fe1
+#define HASH_BLOOM_SIGNATURE 0xb12220f2
+
+typedef struct UT_hash_table {
+   UT_hash_bucket *buckets; /* array of num_buckets buckets */
+   unsigned num_buckets, log2_num_buckets; /* bucket count and its base 2 log */
+   unsigned num_items; /* number of items in the hash */
+   struct UT_hash_handle *tail; /* tail hh in app order, for fast append    */
+   ptrdiff_t hho; /* hash handle offset (byte pos of hash handle in element) */
+
+   /* in an ideal situation (all buckets used equally), no bucket would have
+    * more than ceil(#items/#buckets) items. that's the ideal chain length. */
+   unsigned ideal_chain_maxlen;
+
+   /* nonideal_items is the number of items in the hash whose chain position
+    * exceeds the ideal chain maxlen. these items pay the penalty for an uneven
+    * hash distribution; reaching them in a chain traversal takes >ideal steps */
+   unsigned nonideal_items;
+
+   /* ineffective expands occur when a bucket doubling was performed, but
+    * afterward, more than half the items in the hash had nonideal chain
+    * positions. If this happens on two consecutive expansions we inhibit any
+    * further expansion, as it's not helping; this happens when the hash
+    * function isn't a good fit for the key domain. When expansion is inhibited
+    * the hash will still work, albeit no longer in constant time. */
+   unsigned ineff_expands, noexpand;
+
+   uint32_t signature; /* used only to find hash tables in external analysis */
+#ifdef HASH_BLOOM
+   uint32_t bloom_sig; /* used only to test bloom exists in external analysis */
+   uint8_t *bloom_bv;
+   char bloom_nbits;
+#endif
+
+} UT_hash_table;
+
+typedef struct UT_hash_handle {
+   struct UT_hash_table *tbl;        /* table this handle belongs to   */
+   void *prev;                       /* prev element in app order      */
+   void *next;                       /* next element in app order      */
+   struct UT_hash_handle *hh_prev;   /* previous hh in bucket order    */
+   struct UT_hash_handle *hh_next;   /* next hh in bucket order        */
+   void *key;                        /* ptr to enclosing struct's key  */
+   unsigned keylen;                  /* enclosing struct's key len     */
+   unsigned hashv;                   /* result of hash-fcn(key)        */
+} UT_hash_handle;
+
+#endif /* UTHASH_H */

+ 4 - 4
src/common/utils.h

@@ -27,15 +27,15 @@
 #include <stdlib.h>
 
 #ifdef STARPU_VERBOSE
-#  define _STARPU_DEBUG(fmt, args ...) do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, "[starpu][%s] " fmt ,__func__ ,##args); }} while(0)
+#  define _STARPU_DEBUG(fmt, args ...) do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, "[starpu][%s] " fmt ,__func__ ,##args); fflush(stderr); }} while(0)
 #else
 #  define _STARPU_DEBUG(fmt, args ...)
 #endif
 
 #ifdef STARPU_VERBOSE0
-#  define _STARPU_LOG_IN()             fprintf(stderr, "[starpu][%ld][%s] -->\n", pthread_self(), __func__ );
-#  define _STARPU_LOG_OUT()            fprintf(stderr, "[starpu][%ld][%s] <--\n", pthread_self(), __func__ );
-#  define _STARPU_LOG_OUT_TAG(outtag)  fprintf(stderr, "[starpu][%ld][%s] <-- (%s)\n", pthread_self(), __func__, outtag);
+#  define _STARPU_LOG_IN()             do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, "[starpu][%ld][%s] -->\n", pthread_self(), __func__ ); }} while(0)
+#  define _STARPU_LOG_OUT()            do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, "[starpu][%ld][%s] <--\n", pthread_self(), __func__ ); }} while(0)
+#  define _STARPU_LOG_OUT_TAG(outtag)  do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, "[starpu][%ld][%s] <-- (%s)\n", pthread_self(), __func__, outtag); }} while(0)
 #else
 #  define _STARPU_LOG_IN()
 #  define _STARPU_LOG_OUT()

+ 3 - 1
src/core/combined_workers.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -123,6 +123,7 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 
 	for (i = 0; i < nworkers; i++)
 	{
+#if defined(__GLIBC__) || defined(STARPU_HAVE_HWLOC)
 		int id = workerid_array[i];
 #ifdef __GLIBC__
 #ifdef CPU_OR
@@ -143,6 +144,7 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 				combined_worker->hwloc_cpu_set,
 				config->workers[id].initial_hwloc_cpu_set);
 #endif
+#endif
 	}
 
 	return new_workerid;

+ 9 - 3
src/core/debug.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,6 +25,12 @@ static pthread_mutex_t logfile_mutex = PTHREAD_MUTEX_INITIALIZER;
 static FILE *logfile;
 #endif
 
+int _starpu_use_fxt
+#ifdef STARPU_USE_FXT
+	= 1
+#endif
+	;
+
 void _starpu_open_debug_logfile(void)
 {
 #ifdef STARPU_VERBOSE
@@ -49,7 +55,7 @@ void _starpu_close_debug_logfile(void)
 #endif
 }
 
-void _starpu_print_to_logfile(const char *format __attribute__((unused)), ...)
+void _starpu_print_to_logfile(const char *format STARPU_ATTRIBUTE_UNUSED, ...)
 {
 #ifdef STARPU_VERBOSE
 	va_list args;

+ 4 - 1
src/core/debug.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -34,4 +34,7 @@ void _starpu_close_debug_logfile(void);
 /* Write into StarPU's log file */
 void _starpu_print_to_logfile(const char *format, ...);
 
+/* Tell gdb whether FXT is compiled in or not */
+extern int _starpu_use_fxt;
+
 #endif // __DEBUG_H__

+ 5 - 3
src/core/dependencies/cg.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -111,7 +111,9 @@ void _starpu_notify_cg(starpu_cg_t *cg)
 	
 				tag_successors->ndeps_completed++;
 
+#ifdef STARPU_DEVEL
 #warning FIXME: who locks this?
+#endif
 				if ((tag->state == STARPU_BLOCKED) &&
 					(tag_successors->ndeps == tag_successors->ndeps_completed)) {
 					/* reset the counter so that we can reuse the completion group */
@@ -158,7 +160,7 @@ void _starpu_notify_cg_list(struct starpu_cg_list_s *successors)
 		struct starpu_cg_s *cg = successors->succ[succ];
 		STARPU_ASSERT(cg);
 
-		struct starpu_tag_s *cgtag;
+		struct starpu_tag_s *cgtag = NULL;
 
 		unsigned cg_type = cg->cg_type;
 

+ 10 - 4
src/core/dependencies/implicit_data_deps.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -182,7 +182,7 @@ static void disable_last_writer_callback(void *cl_arg)
  * */
 /* NB : handle->sequential_consistency_mutex must be hold by the caller */
 void _starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task,
-						   starpu_data_handle handle, starpu_access_mode mode)
+						starpu_data_handle handle, starpu_access_mode mode)
 {
 	STARPU_ASSERT(!(mode & STARPU_SCRATCH));
         _STARPU_LOG_IN();
@@ -233,7 +233,7 @@ void _starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_
 	
 		}
 		else {
-			_STARPU_DEP_DEBUG("R %p\n", handle);
+			_STARPU_DEP_DEBUG("R %p %d -> %d\n", handle, previous_mode, mode);
 			/* Add a reader, after a writer or a reader. */
 			STARPU_ASSERT(pre_sync_task);
 			STARPU_ASSERT(post_sync_task);
@@ -255,7 +255,10 @@ void _starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_
 				new_sync_task->cl = NULL;
 				new_sync_task->callback_func = disable_last_writer_callback;
 				new_sync_task->callback_arg = handle;
-				
+#ifdef STARPU_USE_FXT
+				_starpu_get_job_associated_to_task(new_sync_task)->model_name = "sync_task_redux";
+#endif
+
 				_starpu_add_writer_after_readers(handle, new_sync_task, new_sync_task);
 
 				starpu_task_submit(new_sync_task);
@@ -461,6 +464,9 @@ int _starpu_data_wait_until_available(starpu_data_handle handle, starpu_access_m
 		sync_task = starpu_task_create();
 		sync_task->detach = 0;
 		sync_task->destroy = 1;
+#ifdef STARPU_USE_FXT
+		_starpu_get_job_associated_to_task(sync_task)->model_name = "sync_task";
+#endif
 
 		/* It is not really a RW access, but we want to make sure that
 		 * all previous accesses are done */

+ 1 - 1
src/core/dependencies/implicit_data_deps.h

@@ -22,7 +22,7 @@
 #include <common/config.h>
 
 void _starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task,
-						   starpu_data_handle handle, starpu_access_mode mode);
+						starpu_data_handle handle, starpu_access_mode mode);
 void _starpu_detect_implicit_data_deps(struct starpu_task *task);
 void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *task, starpu_data_handle handle);
 

+ 1 - 3
src/core/dependencies/tags.c

@@ -328,10 +328,8 @@ int starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id)
 
 	PTHREAD_MUTEX_LOCK(&cg->succ.succ_apps.cg_mutex);
 
-	while (!cg->succ.succ_apps.completed){
-	  //	  printf("cond wait\n");
+	while (!cg->succ.succ_apps.completed)
 		PTHREAD_COND_WAIT(&cg->succ.succ_apps.cg_cond, &cg->succ.succ_apps.cg_mutex);
-	}
 
 	PTHREAD_MUTEX_UNLOCK(&cg->succ.succ_apps.cg_mutex);
 

+ 14 - 6
src/core/jobs.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,6 +24,7 @@
 #include <common/utils.h>
 #include <profiling/profiling.h>
 #include <profiling/bound.h>
+#include <starpu_top.h>
 
 size_t _starpu_job_get_data_size(starpu_job_t j)
 {
@@ -68,7 +69,7 @@ starpu_job_t __attribute__((malloc)) _starpu_job_create(struct starpu_task *task
 	job->terminated = 0;
 
 #ifndef STARPU_USE_FXT
-	if (_starpu_bound_recording)
+	if (_starpu_bound_recording || starpu_top_status_get())
 #endif
 		job->job_id = STARPU_ATOMIC_ADD(&job_cnt, 1);
 #ifdef STARPU_USE_FXT
@@ -227,7 +228,6 @@ void _starpu_handle_job_termination(starpu_job_t j, unsigned job_is_already_lock
 		/* We reuse the same job structure */
 		int ret = _starpu_submit_job(j, 1);
 		STARPU_ASSERT(!ret);
-		printf("did not decrement\n");
 	}	
 	else {
 		_starpu_decrement_nsubmitted_tasks();
@@ -270,7 +270,9 @@ static unsigned _starpu_not_all_tag_deps_are_fulfilled(starpu_job_t j)
 	return ret;
 }
 
+#ifdef STARPU_DEVEL
 #warning TODO remove the job_is_already_locked parameter
+#endif
 static unsigned _starpu_not_all_task_deps_are_fulfilled(starpu_job_t j, unsigned job_is_already_locked)
 {
 	unsigned ret;
@@ -304,7 +306,9 @@ static unsigned _starpu_not_all_task_deps_are_fulfilled(starpu_job_t j, unsigned
  *	In order, we enforce tag, task and data dependencies. The task is
  *	passed to the scheduler only once all these constraints are fulfilled.
  */
+#ifdef STARPU_DEVEL
 #warning TODO remove the job_is_already_locked parameter
+#endif
 unsigned _starpu_enforce_deps_and_schedule(starpu_job_t j, unsigned job_is_already_locked)
 {
 	unsigned ret;
@@ -335,7 +339,9 @@ unsigned _starpu_enforce_deps_and_schedule(starpu_job_t j, unsigned job_is_alrea
 }
 
 /* Tag deps are already fulfilled */
+#ifdef STARPU_DEVEL
 #warning TODO remove the job_is_already_locked parameter
+#endif
 unsigned _starpu_enforce_deps_starting_from_task(starpu_job_t j, unsigned job_is_already_locked)
 {
 	unsigned ret;
@@ -372,6 +378,7 @@ int _starpu_push_local_task(struct starpu_worker_s *worker, struct starpu_task *
 		return -ENODEV;
 
 	PTHREAD_MUTEX_LOCK(worker->sched_mutex);
+
 	if (back)
 		starpu_task_list_push_back(&worker->local_tasks, task);
 	else
@@ -393,10 +400,11 @@ const char *_starpu_get_model_name(starpu_job_t j)
             && task->cl->model
             && task->cl->model->symbol)
                 return task->cl->model->symbol;
-#ifdef STARPU_USE_FXT
         else {
+#ifdef STARPU_USE_FXT
                 return j->model_name;
-        }
+#else
+                return NULL;
 #endif
-        return NULL;
+        }
 }

+ 2 - 1
src/core/jobs.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -35,6 +35,7 @@
 #include <datawizard/datawizard.h>
 #include <core/perfmodel/perfmodel.h>
 #include <core/errorcheck.h>
+#include <common/barrier.h>
 
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>

+ 29 - 10
src/core/perfmodel/perfmodel.c

@@ -72,14 +72,6 @@ static double per_arch_task_expected_perf(struct starpu_perfmodel_t *model, enum
 	double exp = -1.0;
 	double (*per_arch_cost_model)(struct starpu_buffer_descr_t *);
 	
-	if (!model->is_loaded)
-	{
-		model->benchmarking = _starpu_get_calibrate_flag();
-		
-		_starpu_register_model(model);
-		model->is_loaded = 1;
-	}
-
 	per_arch_cost_model = model->per_arch[arch].cost_model;
 
 	if (per_arch_cost_model)
@@ -134,6 +126,33 @@ static double common_task_expected_perf(struct starpu_perfmodel_t *model, enum s
 	return -1.0;
 }
 
+void _starpu_load_perfmodel(struct starpu_perfmodel_t *model) /* Load `model` once: dispatch on its type, register it, then mark it as loaded. */
+{
+	if (!model || model->is_loaded) /* no model, or already loaded: nothing to do */
+		return;
+
+	switch (model->type) {
+		case STARPU_PER_ARCH:
+		case STARPU_COMMON: /* these two types load nothing in this function */
+			break;
+
+		case STARPU_HISTORY_BASED:
+		case STARPU_NL_REGRESSION_BASED:
+			_starpu_load_history_based_model(model, 1); /* second argument is scan_history: enabled here */
+			break;
+
+		case STARPU_REGRESSION_BASED:
+			_starpu_load_history_based_model(model, 0); /* scan_history disabled for this type */
+			break;
+
+		default: /* any other model type is treated as a fatal error */
+			STARPU_ABORT();
+	}
+
+	_starpu_register_model(model);
+	model->is_loaded = 1; /* later calls return early; NOTE(review): is_loaded is tested and set without a lock in this function - confirm callers serialize */
+}
+
 static double starpu_model_expected_perf(struct starpu_task *task, struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch)
 {
 	if (model) {
@@ -146,8 +165,8 @@ static double starpu_model_expected_perf(struct starpu_task *task, struct starpu
 				return common_task_expected_perf(model, arch, task);
 
 			case STARPU_HISTORY_BASED:
-			    return _starpu_history_based_job_expected_perf(model, arch, j);
-			  
+				return _starpu_history_based_job_expected_perf(model, arch, j);
+
 			case STARPU_REGRESSION_BASED:
 				return _starpu_regression_based_job_expected_perf(model, arch, j);
 

+ 2 - 2
src/core/perfmodel/perfmodel.h

@@ -93,6 +93,8 @@ void _starpu_get_perf_model_dir_debug(char *path, size_t maxlen);
 
 double _starpu_history_based_job_expected_perf(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_job_s *j);
 void _starpu_register_model(struct starpu_perfmodel_t *model);
+void _starpu_load_history_based_model(struct starpu_perfmodel_t *model, unsigned scan_history);
+void _starpu_load_perfmodel(struct starpu_perfmodel_t *model);
 void _starpu_initialize_registered_performance_models(void);
 void _starpu_deinitialize_registered_performance_models(void);
 
@@ -111,8 +113,6 @@ double _starpu_predict_transfer_time(unsigned src_node, unsigned dst_node, size_
 void _starpu_set_calibrate_flag(unsigned val);
 unsigned _starpu_get_calibrate_flag(void);
 
-enum starpu_perf_archtype starpu_worker_get_perf_archtype(int workerid);
-
 #if defined(STARPU_USE_CUDA)
 int *_starpu_get_cuda_affinity_vector(unsigned gpuid);
 #endif

+ 71 - 36
src/core/perfmodel/perfmodel_bus.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -27,6 +27,7 @@
 #include <math.h>
 
 #include <starpu.h>
+#include <starpu_cuda.h>
 #include <starpu_opencl.h>
 #include <common/config.h>
 #include <core/workers.h>
@@ -65,6 +66,7 @@ static int cuda_affinity_matrix[STARPU_MAXCUDADEVS][MAXCPUS];
 static double cudadev_timing_htod[STARPU_MAXNODES] = {0.0};
 static double cudadev_timing_dtoh[STARPU_MAXNODES] = {0.0};
 static struct dev_timing cudadev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
+static size_t cuda_size = SIZE;
 #endif
 #ifdef STARPU_USE_OPENCL
 static int opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][MAXCPUS];
@@ -98,10 +100,16 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(config, cpu);
 
+        /* Get the maximum size which can be allocated on the device */
+	struct cudaDeviceProp prop;
+	cudaError_t cures;
+	cures = cudaGetDeviceProperties(&prop, dev);
+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
+        if (cuda_size > prop.totalGlobalMem/4) cuda_size = prop.totalGlobalMem/4;
 
 	/* Allocate a buffer on the device */
 	unsigned char *d_buffer;
-	cudaMalloc((void **)&d_buffer, SIZE);
+	cudaMalloc((void **)&d_buffer, cuda_size);
 	assert(d_buffer);
 
 	/* hack to avoid third party libs to rebind threads */
@@ -110,7 +118,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 
 	/* Allocate a buffer on the host */
 	unsigned char *h_buffer;
-	cudaHostAlloc((void **)&h_buffer, SIZE, 0);
+	cudaHostAlloc((void **)&h_buffer, cuda_size, 0);
 	assert(h_buffer);
 
 	/* hack to avoid third party libs to rebind threads */
@@ -118,8 +126,8 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 
 
 	/* Fill them */
-	memset(h_buffer, 0, SIZE);
-	cudaMemset(d_buffer, 0, SIZE);
+	memset(h_buffer, 0, cuda_size);
+	cudaMemset(d_buffer, 0, cuda_size);
 
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(config, cpu);
@@ -134,7 +142,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 	gettimeofday(&start, NULL);
 	for (iter = 0; iter < NITER; iter++)
 	{
-		cudaMemcpy(d_buffer, h_buffer, SIZE, cudaMemcpyHostToDevice);
+		cudaMemcpy(d_buffer, h_buffer, cuda_size, cudaMemcpyHostToDevice);
 		cudaThreadSynchronize();
 	}
 	gettimeofday(&end, NULL);
@@ -146,7 +154,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 	gettimeofday(&start, NULL);
 	for (iter = 0; iter < NITER; iter++)
 	{
-		cudaMemcpy(h_buffer, d_buffer, SIZE, cudaMemcpyDeviceToHost);
+		cudaMemcpy(h_buffer, d_buffer, cuda_size, cudaMemcpyDeviceToHost);
 		cudaThreadSynchronize();
 	}
 	gettimeofday(&end, NULL);
@@ -183,7 +191,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, i
         starpu_opencl_get_device(dev, &device);
 	err = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(maxMemAllocSize), &maxMemAllocSize, NULL);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-        if (opencl_size > (size_t)maxMemAllocSize) opencl_size = maxMemAllocSize;
+        if (opencl_size > (size_t)maxMemAllocSize/4) opencl_size = maxMemAllocSize/4;
 
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(config, cpu);
@@ -378,12 +386,12 @@ static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_h
 
 		double bandwidth_sum2 = bandwidth_dtoh*bandwidth_dtoh + bandwidth_htod*bandwidth_htod;
 
-		_STARPU_DISP("BANDWIDTH GPU %d CPU %d - htod %lf - dtoh %lf - %lf\n", dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
+		_STARPU_DISP("BANDWIDTH GPU %d CPU %u - htod %f - dtoh %f - %f\n", dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
 	}
 
 	unsigned best_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].cpu_id;
 
-	_STARPU_DISP("BANDWIDTH GPU %d BEST CPU %d\n", dev, best_cpu);
+	_STARPU_DISP("BANDWIDTH GPU %d BEST CPU %u\n", dev, best_cpu);
 #endif
 
 	/* The results are sorted in a decreasing order, so that the best
@@ -396,7 +404,7 @@ static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_h
 static void benchmark_all_gpu_devices(void)
 {
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
-	int i, ret;
+	int i;
 
 	_STARPU_DEBUG("Benchmarking the speed of the bus\n");
 
@@ -409,6 +417,7 @@ static void benchmark_all_gpu_devices(void)
 #ifdef __linux__
 	/* Save the current cpu binding */
 	cpu_set_t former_process_affinity;
+	int ret;
 	ret = sched_getaffinity(0, sizeof(former_process_affinity), &former_process_affinity);
 	if (ret)
 	{
@@ -466,7 +475,11 @@ static void get_bus_path(const char *type, char *path, size_t maxlen)
 	strncat(path, type, maxlen);
 
 	char hostname[32];
-	gethostname(hostname, 32);
+	char *forced_hostname = getenv("STARPU_HOSTNAME"); /* optional override of the host name appended to the path */
+	if (forced_hostname && forced_hostname[0])
+		snprintf(hostname, sizeof(hostname), "%s", forced_hostname); /* copy as data: the environment value must never be used as the format string */
+	else
+		gethostname(hostname, sizeof(hostname));
 	strncat(path, ".", maxlen);
 	strncat(path, hostname, maxlen);
 }
@@ -574,7 +587,7 @@ static void write_bus_affinity_file_content(void)
 
         fprintf(f, "# GPU\t");
 	for (cpu = 0; cpu < ncpus; cpu++)
-		fprintf(f, "CPU%d\t", cpu);
+		fprintf(f, "CPU%u\t", cpu);
 	fprintf(f, "\n");
 
 #ifdef STARPU_USE_CUDA
@@ -656,7 +669,7 @@ static void get_latency_path(char *path, size_t maxlen)
 	get_bus_path("latency", path, maxlen);
 }
 
-static void load_bus_latency_file_content(void)
+static int load_bus_latency_file_content(void)
 {
 	int n;
 	unsigned src, dst;
@@ -675,17 +688,29 @@ static void load_bus_latency_file_content(void)
 		{
 			double latency;
 
-			n = fscanf(f, "%lf\t", &latency);
-			STARPU_ASSERT(n == 1);
+			n = fscanf(f, "%lf", &latency);
+			if (n != 1) {
+				fclose(f);
+				return 0;
+			}
+			n = getc(f);
+			if (n != '\t') {
+				fclose(f);
+				return 0;
+			}
 
 			latency_matrix[src][dst] = latency;
 		}
 
-		n = fscanf(f, "\n");
-		STARPU_ASSERT(n == 0);
+		n = getc(f);
+		if (n != '\n') {
+			fclose(f);
+			return 0;
+		}
 	}
 
 	fclose(f);
+	return 1;
 }
 
 static void write_bus_latency_file_content(void)
@@ -735,7 +760,7 @@ static void write_bus_latency_file_content(void)
                                 latency = ((src && dst)?2000.0:500.0);
 			}
 
-			fprintf(f, "%lf\t", latency);
+			fprintf(f, "%f\t", latency);
 		}
 
 		fprintf(f, "\n");
@@ -760,13 +785,12 @@ static void load_bus_latency_file(void)
 	get_latency_path(path, 256);
 
 	res = access(path, F_OK);
-	if (res)
+	if (res || !load_bus_latency_file_content())
 	{
-		/* File does not exist yet */
+		/* File does not exist yet or is bogus */
 		generate_bus_latency_file();
 	}
 
-	load_bus_latency_file_content();
 }
 
 
@@ -778,7 +802,7 @@ static void get_bandwidth_path(char *path, size_t maxlen)
 	get_bus_path("bandwidth", path, maxlen);
 }
 
-static void load_bus_bandwidth_file_content(void)
+static int load_bus_bandwidth_file_content(void)
 {
 	int n;
 	unsigned src, dst;
@@ -803,17 +827,30 @@ static void load_bus_bandwidth_file_content(void)
 		{
 			double bandwidth;
 
-			n = fscanf(f, "%lf\t", &bandwidth);
-			STARPU_ASSERT(n == 1);
+			n = fscanf(f, "%lf", &bandwidth);
+			if (n != 1) {
+				fprintf(stderr,"didn't get a number\n");
+				fclose(f);
+				return 0;
+			}
+			n = getc(f);
+			if (n != '\t') {
+				fclose(f);
+				return 0;
+			}
 
 			bandwidth_matrix[src][dst] = bandwidth;
 		}
 
-		n = fscanf(f, "\n");
-		STARPU_ASSERT(n == 0);
+		n = getc(f);
+		if (n != '\n') {
+			fclose(f);
+			return 0;
+		}
 	}
 
 	fclose(f);
+	return 1;
 }
 
 static void write_bus_bandwidth_file_content(void)
@@ -858,7 +895,7 @@ static void write_bus_bandwidth_file_content(void)
 				time_src_to_ram = (src==0)?0.0:cudadev_timing_dtoh[src];
                                 time_ram_to_dst = (dst==0)?0.0:cudadev_timing_htod[dst];
 				timing =time_src_to_ram + time_ram_to_dst;
-				bandwidth = 1.0*SIZE/timing;
+				bandwidth = 1.0*cuda_size/timing;
 #endif
 #ifdef STARPU_USE_OPENCL
                                 if (src > ncuda)
@@ -875,7 +912,7 @@ static void write_bus_bandwidth_file_content(void)
 			        bandwidth = 0.0;
 			}
 
-			fprintf(f, "%lf\t", bandwidth);
+			fprintf(f, "%f\t", bandwidth);
 		}
 
 		fprintf(f, "\n");
@@ -900,13 +937,11 @@ static void load_bus_bandwidth_file(void)
 	get_bandwidth_path(path, 256);
 
 	res = access(path, F_OK);
-	if (res)
+	if (res || !load_bus_bandwidth_file_content())
 	{
-		/* File does not exist yet */
+		/* File does not exist yet or is bogus */
 		generate_bus_bandwidth_file();
 	}
-
-	load_bus_bandwidth_file_content();
 }
 
 /*
@@ -961,17 +996,17 @@ static void check_bus_config_file()
 
                 // Checking if both configurations match
                 if (read_cpus != ncpus) {
-			fprintf(stderr, "Current configuration does not match the performance model (CPUS: (stored) %u != (current) %u), recalibrating...", read_cpus, ncpus);
+			fprintf(stderr, "Current configuration does not match the bus performance model (CPUS: (stored) %u != (current) %u), recalibrating...", read_cpus, ncpus);
                         starpu_force_bus_sampling();
 			fprintf(stderr, "done\n");
                 }
                 else if (read_cuda != ncuda) {
-                        fprintf(stderr, "Current configuration does not match the performance model (CUDA: (stored) %d != (current) %d), recalibrating...", read_cuda, ncuda);
+                        fprintf(stderr, "Current configuration does not match the bus performance model (CUDA: (stored) %d != (current) %d), recalibrating...", read_cuda, ncuda);
                         starpu_force_bus_sampling();
 			fprintf(stderr, "done\n");
                 }
                 else if (read_opencl != nopencl) {
-                        fprintf(stderr, "Current configuration does not match the performance model (OpenCL: (stored) %d != (current) %d), recalibrating...", read_opencl, nopencl);
+                        fprintf(stderr, "Current configuration does not match the bus performance model (OpenCL: (stored) %d != (current) %d), recalibrating...", read_opencl, nopencl);
                         starpu_force_bus_sampling();
 			fprintf(stderr, "done\n");
                 }

+ 19 - 30
src/core/perfmodel/perfmodel_history.c

@@ -282,7 +282,11 @@ static void get_model_debug_path(struct starpu_perfmodel_t *model, const char *a
 	strncat(path, model->symbol, maxlen);
 	
 	char hostname[32];
-	gethostname(hostname, 32);
+	char *forced_hostname = getenv("STARPU_HOSTNAME"); /* optional override of the host name appended to the path */
+	if (forced_hostname && forced_hostname[0])
+		snprintf(hostname, sizeof(hostname), "%s", forced_hostname); /* copy as data: the environment value must never be used as the format string */
+	else
+		gethostname(hostname, sizeof(hostname));
 	strncat(path, ".", maxlen);
 	strncat(path, hostname, maxlen);
 	strncat(path, ".", maxlen);
@@ -326,7 +330,11 @@ static void get_model_path(struct starpu_perfmodel_t *model, char *path, size_t
 	strncat(path, model->symbol, maxlen);
 	
 	char hostname[32];
-	gethostname(hostname, 32);
+	char *forced_hostname = getenv("STARPU_HOSTNAME"); /* optional override of the host name appended to the path */
+	if (forced_hostname && forced_hostname[0])
+		snprintf(hostname, sizeof(hostname), "%s", forced_hostname); /* copy as data: the environment value must never be used as the format string */
+	else
+		gethostname(hostname, sizeof(hostname));
 	strncat(path, ".", maxlen);
 	strncat(path, hostname, maxlen);
 }
@@ -392,7 +400,7 @@ void _starpu_deinitialize_registered_performance_models(void)
  * was loaded or not (this is very likely to have been already loaded). If the
  * model was not loaded yet, we take the lock in write mode, and if the model
  * is still not loaded once we have the lock, we do load it.  */
-static void load_history_based_model(struct starpu_perfmodel_t *model, unsigned scan_history)
+void _starpu_load_history_based_model(struct starpu_perfmodel_t *model, unsigned scan_history)
 {
 
 	STARPU_ASSERT(model);
@@ -496,15 +504,8 @@ int starpu_list_models(void)
         dp = opendir(path);
         if (dp != NULL) {
                 while ((ep = readdir(dp))) {
-#ifdef DT_REG
-                        if (ep->d_type == DT_REG)
-#else
-			if (strcmp(ep->d_name, ".")
-			 && strcmp(ep->d_name, ".."))
-#endif
-			{
+                        if (strcmp(ep->d_name, ".") && strcmp(ep->d_name, ".."))
                                 fprintf(stdout, "file: <%s>\n", ep->d_name);
-                        }
                 }
                 closedir (dp);
                 return 0;
@@ -608,12 +609,10 @@ double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel_t *mod
 	size_t size = _starpu_job_get_data_size(j);
 	struct starpu_regression_model_t *regmodel;
 
-	load_history_based_model(model, 0);
-
 	regmodel = &model->per_arch[arch].regression;
 
 	if (regmodel->valid)
-		exp = regmodel->alpha*pow(size, regmodel->beta);
+                exp = regmodel->alpha*pow((double)size, regmodel->beta);
 
 	return exp;
 }
@@ -624,12 +623,10 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 	size_t size = _starpu_job_get_data_size(j);
 	struct starpu_regression_model_t *regmodel;
 
-	load_history_based_model(model, 0);
-
 	regmodel = &model->per_arch[arch].regression;
 
 	if (regmodel->nl_valid)
-		exp = regmodel->a*pow(size, regmodel->b) + regmodel->c;
+		exp = regmodel->a*pow((double)size, regmodel->b) + regmodel->c;
 
 	return exp;
 }
@@ -641,19 +638,13 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel_t *model,
 	struct starpu_history_entry_t *entry;
 	struct starpu_htbl32_node_s *history;
 
-	load_history_based_model(model, 1);
-
-	if (STARPU_UNLIKELY(!j->footprint_is_computed))
-		_starpu_compute_buffers_footprint(j);
-		
-	uint32_t key = j->footprint;
+	uint32_t key = _starpu_compute_buffers_footprint(j);
 
 	per_arch_model = &model->per_arch[arch];
 
 	history = per_arch_model->history;
 	if (!history)
 		return -1.0;
-    
 
 	PTHREAD_RWLOCK_RDLOCK(&model->model_rwlock);
 	entry = _starpu_htbl_search_32(history, key);
@@ -662,17 +653,15 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel_t *model,
 	exp = entry?entry->mean:-1.0;
 
 	if (entry && entry->nsample < STARPU_CALIBRATION_MINIMUM)
-	  {
 		/* TODO: report differently if we've scheduled really enough
 		 * of that task and the scheduler should perhaps put it aside */
 		/* Not calibrated enough */
 		return -1.0;
-	  }
 
 	return exp;
 }
 
-void _starpu_update_perfmodel_history(starpu_job_t j, struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, unsigned cpuid __attribute__((unused)), double measured)
+void _starpu_update_perfmodel_history(starpu_job_t j, struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, unsigned cpuid STARPU_ATTRIBUTE_UNUSED, double measured)
 {
 	if (model)
 	{
@@ -682,7 +671,7 @@ void _starpu_update_perfmodel_history(starpu_job_t j, struct starpu_perfmodel_t
 
 		if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
 		{
-			uint32_t key = j->footprint;
+			uint32_t key = _starpu_compute_buffers_footprint(j);
 			struct starpu_history_entry_t *entry;
 
 			struct starpu_htbl32_node_s *history;
@@ -738,7 +727,7 @@ void _starpu_update_perfmodel_history(starpu_job_t j, struct starpu_perfmodel_t
 			/* update the regression model */
 			size_t job_size = _starpu_job_get_data_size(j);
 			double logy, logx;
-			logx = log(job_size);
+			logx = log((double)job_size);
 			logy = log(measured);
 
 			reg_model->sumlnx += logx;
@@ -767,7 +756,7 @@ void _starpu_update_perfmodel_history(starpu_job_t j, struct starpu_perfmodel_t
 
 		STARPU_ASSERT(j->footprint_is_computed);
 
-		fprintf(debug_file, "0x%x\t%lu\t%lf\t%lf\t%d\t\t", j->footprint, (unsigned long) _starpu_job_get_data_size(j), measured, task->predicted, cpuid);
+		fprintf(debug_file, "0x%x\t%lu\t%f\t%f\t%d\t\t", j->footprint, (unsigned long) _starpu_job_get_data_size(j), measured, task->predicted, cpuid);
 		unsigned i;
 			
 		for (i = 0; i < task->cl->nbuffers; i++)

+ 12 - 16
src/core/sched_policy.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,8 +22,7 @@
 #include <common/utils.h>
 #include <core/sched_policy.h>
 #include <profiling/profiling.h>
-
-//static struct starpu_sched_policy_s policy;
+#include <common/barrier.h>
 
 static int use_prefetch = 0;
 
@@ -38,7 +37,6 @@ int starpu_get_prefetch_flag(void)
 
 extern struct starpu_sched_policy_s _starpu_sched_ws_policy;
 extern struct starpu_sched_policy_s _starpu_sched_prio_policy;
-extern struct starpu_sched_policy_s _starpu_sched_no_prio_policy;
 extern struct starpu_sched_policy_s _starpu_sched_random_policy;
 extern struct starpu_sched_policy_s _starpu_sched_dm_policy;
 extern struct starpu_sched_policy_s _starpu_sched_dmda_policy;
@@ -49,12 +47,9 @@ extern struct starpu_sched_policy_s _starpu_sched_parallel_heft_policy;
 extern struct starpu_sched_policy_s _starpu_sched_pgreedy_policy;
 extern struct starpu_sched_policy_s heft_policy;
 
-#define NPREDEFINED_POLICIES	12
-
-static struct starpu_sched_policy_s *predefined_policies[NPREDEFINED_POLICIES] = {
+static struct starpu_sched_policy_s *predefined_policies[] = {
 	&_starpu_sched_ws_policy,
 	&_starpu_sched_prio_policy,
-	&_starpu_sched_no_prio_policy,
 	&_starpu_sched_dm_policy,
 	&_starpu_sched_dmda_policy,
 	&heft_policy,
@@ -94,9 +89,8 @@ static void load_sched_policy(struct starpu_sched_policy_s *sched_policy, struct
 	policy->init_sched = sched_policy->init_sched;
 	policy->deinit_sched = sched_policy->deinit_sched;
 	policy->push_task = sched_policy->push_task;
-	policy->push_prio_task = sched_policy->push_prio_task;
 	policy->pop_task = sched_policy->pop_task;
-        policy->post_exec_hook = sched_policy->post_exec_hook;
+    policy->post_exec_hook = sched_policy->post_exec_hook;
 	policy->pop_every_task = sched_policy->pop_every_task;
 	policy->push_task_notify = sched_policy->push_task_notify;
 	policy->policy_name = sched_policy->policy_name;
@@ -110,7 +104,7 @@ static struct starpu_sched_policy_s *find_sched_policy_from_name(const char *pol
 		return NULL;
 
 	unsigned i;
-	for (i = 0; i < NPREDEFINED_POLICIES; i++)
+	for (i = 0; i < sizeof(predefined_policies)/sizeof(predefined_policies[0]); i++)
 	{
 		struct starpu_sched_policy_s *p;
 		p = predefined_policies[i];
@@ -122,6 +116,7 @@ static struct starpu_sched_policy_s *find_sched_policy_from_name(const char *pol
 			}
 		}
 	}
+	fprintf(stderr, "Warning: scheduling policy \"%s\" was not found, try \"help\" to get a list\n", policy_name);
 
 	/* nothing was found */
 	return NULL;
@@ -135,7 +130,7 @@ static void display_sched_help_message(void)
 
 		/* display the description of all predefined policies */
 		unsigned i;
-		for (i = 0; i < NPREDEFINED_POLICIES; i++)
+		for (i = 0; i < sizeof(predefined_policies)/sizeof(predefined_policies[0]); i++)
 		{
 			struct starpu_sched_policy_s *p;
 			p = predefined_policies[i];
@@ -187,7 +182,6 @@ void _starpu_init_sched_policy(struct starpu_machine_config_s *config, struct st
 	use_prefetch = starpu_get_env_number("STARPU_PREFETCH");
 	if (use_prefetch == -1)
 		use_prefetch = 1;
-  
 
 	/* By default, we don't calibrate */
 	unsigned do_calibrate = 0;
@@ -228,8 +222,8 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 	int is_basic_worker = (workerid < nbasic_workers);
 
 	unsigned memory_node; 
-	struct starpu_worker_s *worker;
-	struct starpu_combined_worker_s *combined_worker;
+	struct starpu_worker_s *worker = NULL;
+	struct starpu_combined_worker_s *combined_worker = NULL;
 
 	if (is_basic_worker)
 	{
@@ -447,3 +441,5 @@ int starpu_push_local_task(int workerid, struct starpu_task *task, int back)
 
 	return _starpu_push_local_task(worker, task, back);
 }
+
+

+ 9 - 1
src/core/task.c

@@ -246,6 +246,7 @@ int starpu_task_submit_to_ctx(struct starpu_task *task, unsigned sched_ctx)
                         _STARPU_LOG_OUT_TAG("ENODEV");
 			return -ENODEV;
                 }
+		assert(task->cl->nbuffers <= STARPU_NMAXBUFS);
 
 		/* In case we require that a task should be explicitely
 		 * executed on a specific worker, we make sure that the worker
@@ -256,6 +257,12 @@ int starpu_task_submit_to_ctx(struct starpu_task *task, unsigned sched_ctx)
                 }
 
 		_starpu_detect_implicit_data_deps(task);
+
+		if (task->cl->model)
+			_starpu_load_perfmodel(task->cl->model);
+
+		if (task->cl->power_model)
+			_starpu_load_perfmodel(task->cl->power_model);
 	}
 
 	/* If profiling is activated, we allocate a structure to store the
@@ -268,7 +275,8 @@ int starpu_task_submit_to_ctx(struct starpu_task *task, unsigned sched_ctx)
 	/* The task is considered as block until we are sure there remains not
 	 * dependency. */
 	task->status = STARPU_TASK_BLOCKED;
-	
+
+
 	if (profiling)
 		starpu_clock_gettime(&info->submit_time);
 

+ 22 - 9
src/core/topology.c

@@ -269,7 +269,7 @@ unsigned _starpu_topology_get_nhwcpu(struct starpu_machine_config_s *config)
 static int _starpu_init_machine_config(struct starpu_machine_config_s *config,
 				struct starpu_conf *user_conf)
 {
-	int explicitval __attribute__((unused));
+	int explicitval STARPU_ATTRIBUTE_UNUSED;
 	unsigned use_accelerator = 0;
 
 	int i;
@@ -440,23 +440,26 @@ static int _starpu_init_machine_config(struct starpu_machine_config_s *config,
 /* we put the CPU section after the accelerator : in case there was an
  * accelerator found, we devote one cpu */
 #ifdef STARPU_USE_CPU
-	explicitval = -1;
 	if (user_conf && (user_conf->ncpus != -1)) {
 		explicitval = user_conf->ncpus;
 	}
 	else {
 		explicitval = starpu_get_env_number("STARPU_NCPUS");
 	}
+
 	if (explicitval < 0) {
-		unsigned already_busy_cpus = (topology->ngordon_spus?1:0) + topology->ncudagpus;
+		unsigned already_busy_cpus = (topology->ngordon_spus?1:0) + topology->ncudagpus + topology->nopenclgpus;
 		long avail_cpus = topology->nhwcpus - (use_accelerator?already_busy_cpus:0);
-		topology->ncpus = STARPU_MIN(avail_cpus, STARPU_NMAXCPUS);
+		if (avail_cpus < 0)
+			avail_cpus = 0;
+		topology->ncpus = STARPU_MIN(avail_cpus, STARPU_MAXCPUS);
 	} else {
 		/* use the specified value */
 		topology->ncpus = (unsigned)explicitval;
-		STARPU_ASSERT(topology->ncpus <= STARPU_NMAXCPUS);
+		STARPU_ASSERT(topology->ncpus <= STARPU_MAXCPUS);
 	}
 	STARPU_ASSERT(topology->ncpus + topology->nworkers <= STARPU_NMAXWORKERS);
+
 	unsigned cpu;
 	for (cpu = 0; cpu < topology->ncpus; cpu++)
 	{
@@ -596,7 +599,7 @@ static inline int _starpu_get_next_bindid(struct starpu_machine_config_s *config
 	return (int)topology->workers_bindid[i];
 }
 
-void _starpu_bind_thread_on_cpu(struct starpu_machine_config_s *config __attribute__((unused)), unsigned cpuid)
+void _starpu_bind_thread_on_cpu(struct starpu_machine_config_s *config STARPU_ATTRIBUTE_UNUSED, unsigned cpuid)
 {
 #ifdef STARPU_HAVE_HWLOC
 	int ret;
@@ -649,7 +652,7 @@ static void _starpu_init_workers_binding(struct starpu_machine_config_s *config)
 
 	/* note that even if the CPU cpu are not used, we always have a RAM node */
 	/* TODO : support NUMA  ;) */
-	ram_memory_node = _starpu_register_memory_node(STARPU_CPU_RAM);
+	ram_memory_node = _starpu_register_memory_node(STARPU_CPU_RAM, -1);
 
 	/* We will store all the busid of the different (src, dst) combinations
 	 * in a matrix which we initialize here. */
@@ -688,7 +691,7 @@ static void _starpu_init_workers_binding(struct starpu_machine_config_s *config)
 					npreferred = config->topology.nhwcpus;
 				}
 				is_a_set_of_accelerators = 0;
-				memory_node = _starpu_register_memory_node(STARPU_CUDA_RAM);
+				memory_node = _starpu_register_memory_node(STARPU_CUDA_RAM, workerarg->devid);
 
 				_starpu_register_bus(0, memory_node);
 				_starpu_register_bus(memory_node, 0);
@@ -704,7 +707,7 @@ static void _starpu_init_workers_binding(struct starpu_machine_config_s *config)
 					npreferred = config->topology.nhwcpus;
 				}
 				is_a_set_of_accelerators = 0;
-				memory_node = _starpu_register_memory_node(STARPU_OPENCL_RAM);
+				memory_node = _starpu_register_memory_node(STARPU_OPENCL_RAM, workerarg->devid);
 				_starpu_register_bus(0, memory_node);
 				_starpu_register_bus(memory_node, 0);
 				break;
@@ -774,6 +777,16 @@ void _starpu_destroy_topology(struct starpu_machine_config_s *config __attribute
 	/* cleanup StarPU internal data structures */
 	_starpu_deinit_memory_nodes();
 
+	unsigned worker;
+	for (worker = 0; worker < config->topology.nworkers; worker++)
+	{
+#ifdef STARPU_HAVE_HWLOC
+		struct starpu_worker_s *workerarg = &config->workers[worker];
+		hwloc_bitmap_free(workerarg->initial_hwloc_cpu_set);
+		hwloc_bitmap_free(workerarg->current_hwloc_cpu_set);
+#endif
+	}
+
 #ifdef STARPU_HAVE_HWLOC
 	hwloc_topology_destroy(config->topology.hwtopology);
 #endif

+ 57 - 10
src/core/workers.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- * Copyright (C) 2010  Institut National de Recherche en Informatique et Automatique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Institut National de Recherche en Informatique et Automatique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -155,7 +155,7 @@ static void _starpu_launch_drivers(struct starpu_machine_config_s *config)
 
 		workerarg->worker_size = 1;
 		workerarg->combined_workerid = workerarg->workerid;
-		workerarg->current_rank = 1;
+		workerarg->current_rank = 0;
 
 		/* mutex + cond only for the local list */
 		/* we have a single local list */
@@ -175,7 +175,7 @@ static void _starpu_launch_drivers(struct starpu_machine_config_s *config)
 	
 		workerarg->status = STATUS_INITIALIZING;
 
-		_STARPU_DEBUG("initialising worker %d\n", worker);
+		_STARPU_DEBUG("initialising worker %u\n", worker);
 
 		_starpu_init_worker_queue(workerarg);
 
@@ -297,8 +297,10 @@ int starpu_conf_init(struct starpu_conf *conf)
 	conf->use_explicit_workers_cuda_gpuid = 0; /* TODO */
 	conf->use_explicit_workers_opencl_gpuid = 0; /* TODO */
 
+	conf->single_combined_worker = starpu_get_env_number("STARPU_SINGLE_COMBINED_WORKER");
+
 	return 0;
-};
+}
 
 int starpu_init(struct starpu_conf *user_conf)
 {
@@ -331,6 +333,8 @@ int starpu_init(struct starpu_conf *user_conf)
 	
 	_starpu_open_debug_logfile();
 
+	_starpu_data_interface_init();
+
 	_starpu_timing_init();
 
 	_starpu_profiling_init();
@@ -364,8 +368,6 @@ int starpu_init(struct starpu_conf *user_conf)
 	else
 	  _starpu_create_sched_ctx(user_conf->sched_policy_name, NULL, -1, 1, "init");
 
-	//_starpu_init_sched_policy(&config, &sched_ctx);
-
 	_starpu_initialize_registered_performance_models();
 
 	/* Launch "basic" workers (ie. non-combined workers) */
@@ -386,7 +388,7 @@ int starpu_init(struct starpu_conf *user_conf)
 
 static void _starpu_terminate_workers(struct starpu_machine_config_s *config)
 {
-	int status __attribute__((unused));
+	int status STARPU_ATTRIBUTE_UNUSED;
 	unsigned workerid;
 
 	for (workerid = 0; workerid < config->topology.nworkers; workerid++)
@@ -426,7 +428,7 @@ static void _starpu_terminate_workers(struct starpu_machine_config_s *config)
 #endif
 			}
 		}
-		//		worker->status = STATUS_JOINED;
+
 		STARPU_ASSERT(starpu_task_list_empty(&worker->local_tasks));
 		starpu_job_list_delete(worker->terminated_jobs);
 	}
@@ -437,7 +439,7 @@ unsigned _starpu_machine_is_running(void)
 	return config.running;
 }
 
-unsigned _starpu_worker_can_block(unsigned memnode __attribute__((unused)))
+unsigned _starpu_worker_can_block(unsigned memnode STARPU_ATTRIBUTE_UNUSED)
 {
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 	return 0;
@@ -507,6 +509,8 @@ void starpu_shutdown(void)
 	_starpu_stop_fxt_profiling();
 #endif
 
+	_starpu_data_interface_shutdown();
+
 	_starpu_close_debug_logfile();
 
 	PTHREAD_MUTEX_LOCK(&init_mutex);
@@ -521,6 +525,27 @@ unsigned starpu_worker_get_count(void)
 	return config.topology.nworkers;
 }
 
+int starpu_worker_get_count_by_type(enum starpu_archtype type)
+{ /* Return the per-architecture worker count held in config.topology for `type`. */
+	switch (type)
+	{
+		case STARPU_CPU_WORKER:
+			return config.topology.ncpus;
+
+		case STARPU_CUDA_WORKER:
+			return config.topology.ncudagpus;
+
+		case STARPU_OPENCL_WORKER:
+			return config.topology.nopenclgpus;
+
+		case STARPU_GORDON_WORKER:
+			return config.topology.ngordon_spus;
+
+		default: /* not one of the four worker types handled above */
+			return -EINVAL; /* negative errno code reported instead of a count */
+	}
+}
+
 unsigned starpu_combined_worker_get_count(void)
 {
 	return config.topology.ncombinedworkers;
@@ -644,6 +669,28 @@ enum starpu_archtype starpu_worker_get_type(int id)
 	return config.workers[id].arch;
 }
 
+int starpu_worker_get_ids_by_type(enum starpu_archtype type, int *workerids, int maxsize)
+{	/* Store in workerids[] the id of every worker whose type equals `type`
+	 * and return how many ids were stored. At most maxsize entries are
+	 * written; -ERANGE is returned as soon as one more would be needed. */
+	unsigned nworkers = starpu_worker_get_count();
+
+	int cnt = 0; /* number of ids written to workerids[] so far */
+
+	unsigned id;
+	for (id = 0; id < nworkers; id++)
+	{
+		if (starpu_worker_get_type(id) == type)
+		{
+			/* The caller's array cannot hold one more id: give up.
+			 * Entries already written are left in place. */
+			if (cnt >= maxsize)
+				return -ERANGE;
+
+			workerids[cnt++] = id;
+		}
+	}
+
+	return cnt;
+}
+
 void starpu_worker_get_name(int id, char *dst, size_t maxlen)
 {
 	char *name = config.workers[id].name;

+ 2 - 2
src/core/workers.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -76,7 +76,7 @@ struct starpu_worker_s {
 	unsigned worker_is_running;
 	unsigned worker_is_initialized;
 	starpu_worker_status status; /* what is the worker doing now ? (eg. CALLBACK) */
-	char name[32];
+	char name[48];
 
 	struct starpu_sched_ctx **sched_ctx;
 	unsigned nctxs; /* the no of contexts a worker belongs to*/

+ 256 - 142
src/datawizard/coherency.c

@@ -22,29 +22,6 @@
 #include <core/dependencies/data_concurrency.h>
 #include <profiling/profiling.h>
 
-uint32_t _starpu_select_node_to_handle_request(uint32_t src_node, uint32_t dst_node) 
-{
-	/* in case one of the node is a GPU, it needs to perform the transfer,
-	 * if both of them are GPU, it's a bit more complicated */
-
-	unsigned src_is_a_gpu = (_starpu_get_node_kind(src_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(src_node) == STARPU_OPENCL_RAM);
-	unsigned dst_is_a_gpu = (_starpu_get_node_kind(dst_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(dst_node) == STARPU_OPENCL_RAM);
-
-	/* we do not handle GPU->GPU transfers yet ! */
-	STARPU_ASSERT( !(src_is_a_gpu && dst_is_a_gpu) );
-
-	if (src_is_a_gpu)
-		return src_node;
-
-	if (dst_is_a_gpu)
-		return dst_node;
-
-	/* otherwise perform it locally, since we should be on a "sane" arch
-	 * where anyone can do the transfer. NB: in StarPU this should actually never
-	 * happen */
-	return _starpu_get_local_memory_node();
-}
-
 uint32_t _starpu_select_src_node(starpu_data_handle handle)
 {
 	unsigned src_node = 0;
@@ -77,11 +54,10 @@ uint32_t _starpu_select_src_node(starpu_data_handle handle)
 
 			/* however GPU are expensive sources, really !
 			 * 	other should be ok */
-			if (_starpu_get_node_kind(i) != STARPU_CUDA_RAM)
-				break;
-			if (_starpu_get_node_kind(i) != STARPU_OPENCL_RAM)
-				break;
-
+		 
+			if (_starpu_get_node_kind(i) != STARPU_CUDA_RAM && _starpu_get_node_kind(i) != STARPU_OPENCL_RAM)	
+				break ;
+		 
 			/* XXX do a better algorithm to distribute the memory copies */
 			/* TODO : use the "requesting_node" as an argument to do so */
 		}
@@ -103,7 +79,8 @@ void _starpu_update_data_state(starpu_data_handle handle,
 	unsigned nnodes = _starpu_get_memory_nodes_count();
 
 	/* the data is present now */
-	requesting_replicate->requested = 0;
+	unsigned requesting_node = requesting_replicate->memory_node;
+	requesting_replicate->requested[requesting_node] = 0;
 
 	if (mode & STARPU_W) {
 		/* the requesting node now has the only valid copy */
@@ -129,6 +106,149 @@ void _starpu_update_data_state(starpu_data_handle handle,
 	}
 }
 
+static int worker_supports_direct_access(unsigned node, unsigned handling_node)
+{	/* Return 1 when handling_node is considered able to access memory node
+	 * `node` directly, 0 otherwise. The answer depends only on the kinds of
+	 * the two nodes and on whether HAVE_CUDA_MEMCPY_PEER is defined. */
+	if (node == handling_node) /* same memory node: always allowed */
+		return 1;
+
+	int type = _starpu_get_node_kind(node);
+	switch (type)
+	{
+		case STARPU_CUDA_RAM:
+#ifdef HAVE_CUDA_MEMCPY_PEER
+			/* GPUs do not always allow direct remote access: if CUDA4
+			 * is enabled, we allow two CUDA devices to communicate. */
+			return (_starpu_get_node_kind(handling_node) != STARPU_OPENCL_RAM); /* only an OpenCL handling node is refused */
+#else
+			/* Direct GPU-GPU transfers are not allowed in general */
+			return 0;
+#endif
+		case STARPU_OPENCL_RAM: /* OpenCL memory is never accessed from a different node */
+			return 0;
+		default: /* every other node kind is treated as directly accessible */
+			return 1;
+	}
+}
+
+static int link_supports_direct_transfers(starpu_data_handle handle, unsigned src_node, unsigned dst_node, unsigned *handling_node)
+{	/* Return 1 if a single src_node -> dst_node transfer is possible, and
+	 * store in *handling_node the node selected to perform it. Return 0
+	 * otherwise; *handling_node is then left unmodified. */
+
+	/* XXX That's a hack until we get cudaMemcpy3DPeerAsync to work !
+	 * Perhaps not all data interface provide a direct GPU-GPU transfer
+	 * method ! */
+#ifdef STARPU_USE_CUDA
+	if (src_node != dst_node && _starpu_get_node_kind(src_node) == STARPU_CUDA_RAM && _starpu_get_node_kind(dst_node) == STARPU_CUDA_RAM)
+	{
+		const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+		if (!copy_methods->cuda_to_cuda_async)
+			return 0; /* this data interface has no asynchronous CUDA -> CUDA copy method */
+	}
+#endif
+
+	if (worker_supports_direct_access(src_node, dst_node)) /* the destination is tried first as handler */
+	{
+		*handling_node = dst_node;
+		return 1;
+	}
+
+	if (worker_supports_direct_access(dst_node, src_node)) /* otherwise let the source drive the transfer */
+	{
+		*handling_node = src_node;
+		return 1;
+	}
+
+	return 0;
+}
+
+/* Determines the path of a request : each hop is defined by (src,dst) and the
+ * node that handles the hop. The returned value indicates the number of hops,
+ * and the max_len is the maximum number of hops (ie. the size of the
+ * src_nodes, dst_nodes and handling_nodes arrays).
+ *
+ * Three outcomes are possible:
+ *  - mode lacks STARPU_R: one allocation-only hop handled by dst_node;
+ *  - a direct link exists: one hop handled by the node picked by
+ *    link_supports_direct_transfers;
+ *  - otherwise: two hops staged through node 0 (src -> 0, then 0 -> dst).
+ * max_len is only checked through assertions. */
+static int determine_request_path(starpu_data_handle handle,
+				unsigned src_node, unsigned dst_node,
+				starpu_access_mode mode, int max_len,
+				unsigned *src_nodes, unsigned *dst_nodes,
+				unsigned *handling_nodes)
+{
+	if (!(mode & STARPU_R))
+	{
+		/* The destination node should only allocate the data, no transfer is required */
+		STARPU_ASSERT(max_len >= 1);
+		src_nodes[0] = 0; // ignored
+		dst_nodes[0] = dst_node;
+		handling_nodes[0] = dst_node;
+		return 1;
+	}
+
+	unsigned handling_node; /* only assigned when link_is_valid ends up non-zero */
+	int link_is_valid = link_supports_direct_transfers(handle, src_node, dst_node, &handling_node);
+
+	if (!link_is_valid) {
+		/* We need an intermediate hop to implement data staging
+		 * through main memory. */
+		STARPU_ASSERT(max_len >= 2);
+
+		/* XXX we hardcode 0 as the RAM node ... */
+
+		/* GPU -> RAM */
+		src_nodes[0] = src_node;
+		dst_nodes[0] = 0;
+		handling_nodes[0] = src_node;
+
+		/* RAM -> GPU */
+		src_nodes[1] = 0;
+		dst_nodes[1] = dst_node;
+		handling_nodes[1] = dst_node;
+
+		return 2; /* two hops: src_node -> node 0 -> dst_node */
+	}
+	else {
+		STARPU_ASSERT(max_len >= 1);
+		
+		src_nodes[0] = src_node;
+		dst_nodes[0] = dst_node;
+		handling_nodes[0] = handling_node;
+
+#ifndef HAVE_CUDA_MEMCPY_PEER
+		STARPU_ASSERT(!(mode & STARPU_R) || _starpu_get_node_kind(src_node) != STARPU_CUDA_RAM || _starpu_get_node_kind(dst_node) != STARPU_CUDA_RAM); /* a direct CUDA -> CUDA read hop must not be selected without peer copies */
+#endif
+
+		return 1;
+	}
+}
+
+/* handle->lock should be taken. r is returned locked. The node parameter
+ * indicates either the source of the request, or the destination for a
+ * write-only request.
+ *
+ * Looks up replicate->request[node]. When a request is found, its lock is
+ * taken and its mode is widened with the STARPU_R / STARPU_W bits of `mode`
+ * before it is returned. When none is found, NULL is returned and no lock is
+ * taken. */
+static starpu_data_request_t _starpu_search_existing_data_request(struct starpu_data_replicate_s *replicate, unsigned node, starpu_access_mode mode)
+{
+	starpu_data_request_t r;
+
+	r = replicate->request[node];
+
+	if (r)
+	{
+		_starpu_spin_lock(&r->lock); /* the caller is responsible for unlocking r */
+
+		/* perhaps we need to "upgrade" the request */
+		if (mode & STARPU_R)
+		{
+			/* in case the existing request did not imply a memory
+			 * transfer yet, we have to increment the refcnt now
+			 * (so that the source remains valid) */
+			if (!(r->mode & STARPU_R))
+				replicate->refcnt++;
+
+			r->mode |= STARPU_R;
+		}
+
+		if (mode & STARPU_W)
+			r->mode |= STARPU_W;
+	}
+
+	return r;
+}
+
+
 
 /*
  * This function is called when the data is needed on the local node, this
@@ -151,14 +271,33 @@ void _starpu_update_data_state(starpu_data_handle handle,
  */
 
 /* This function is called with handle's header lock taken */
-static starpu_data_request_t create_new_request_to_fetch_data(starpu_data_handle handle,
+starpu_data_request_t create_request_to_fetch_data(starpu_data_handle handle,
 				struct starpu_data_replicate_s *dst_replicate,
                                 starpu_access_mode mode, unsigned is_prefetch,
                                 void (*callback_func)(void *), void *callback_arg)
 {
-	starpu_data_request_t r;
 	unsigned requesting_node = dst_replicate->memory_node;
 
+	if (dst_replicate->state != STARPU_INVALID)
+	{
+		/* the data is already available so we can stop */
+		_starpu_update_data_state(handle, dst_replicate, mode);
+		_starpu_msi_cache_hit(requesting_node);
+
+		_starpu_spin_unlock(&handle->header_lock);
+
+		if (callback_func)
+			callback_func(callback_arg);
+
+                _STARPU_LOG_OUT_TAG("data available");
+		return NULL;
+	}
+
+	_starpu_msi_cache_miss(requesting_node);
+
+	/* the only remaining situation is that the local copy was invalid */
+	STARPU_ASSERT(dst_replicate->state == STARPU_INVALID);
+
 	/* find someone who already has the data */
 	uint32_t src_node = 0;
 
@@ -169,74 +308,82 @@ static starpu_data_request_t create_new_request_to_fetch_data(starpu_data_handle
 		STARPU_ASSERT(src_node != requesting_node);
 	}
 
-	unsigned src_is_a_gpu = (_starpu_get_node_kind(src_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(src_node) == STARPU_OPENCL_RAM);
-	unsigned dst_is_a_gpu = (_starpu_get_node_kind(requesting_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(requesting_node) == STARPU_OPENCL_RAM);
-
-	struct starpu_data_replicate_s *src_replicate = &handle->per_node[src_node];
+	/* We can safely assume that there won't be more than 2 hops in the
+	 * current implementation */
+	unsigned src_nodes[4], dst_nodes[4], handling_nodes[4];
+	int nhops = determine_request_path(handle, src_node, requesting_node, mode, 4,
+					src_nodes, dst_nodes, handling_nodes);
+	STARPU_ASSERT(nhops <= 4);
 
-	/* we have to perform 2 successive requests for GPU->GPU transfers */
-	if ((mode & STARPU_R) && (src_is_a_gpu && dst_is_a_gpu)) {
-		unsigned reuse_r_src_to_ram;
-		starpu_data_request_t r_src_to_ram;
-		starpu_data_request_t r_ram_to_dst;
+	starpu_data_request_t requests[nhops];
 
-		struct starpu_data_replicate_s *ram_replicate = &handle->per_node[0];
+	/* Did we reuse a request for that hop ? */
+	int reused_requests[nhops];
 
-		/* XXX we hardcore 0 as the RAM node ... */
-		/* We put a 1 in the number of dependencies because this
-		 * depends on the r_src_to_ram request. */
-		r_ram_to_dst = _starpu_create_data_request(handle, ram_replicate,
-					dst_replicate, requesting_node, mode, 1, is_prefetch);
+	/* Construct an array with a list of requests, possibly reusing existing requests */
+	int hop;
+	for (hop = 0; hop < nhops; hop++)
+	{
+		starpu_data_request_t r;
 
-		if (!is_prefetch)
-			r_ram_to_dst->refcnt++;
+		unsigned hop_src_node = src_nodes[hop];
+		unsigned hop_dst_node = dst_nodes[hop];
+		unsigned hop_handling_node = handling_nodes[hop];
 
-		r_src_to_ram = _starpu_search_existing_data_request(ram_replicate, mode);
+		struct starpu_data_replicate_s *hop_src_replicate;
+		struct starpu_data_replicate_s *hop_dst_replicate;
 
-		reuse_r_src_to_ram = r_src_to_ram?1:0;
+		/* Only the first request is independant */
+		unsigned ndeps = (hop == 0)?0:1;
 
-		if (!r_src_to_ram)
-		{
-			r_src_to_ram = _starpu_create_data_request(handle, src_replicate,
-						ram_replicate, src_node, mode, 0, is_prefetch);
-		}
+		hop_src_replicate = &handle->per_node[hop_src_node];
+		hop_dst_replicate = (hop != nhops - 1)?&handle->per_node[hop_dst_node]:dst_replicate;
 
-		/* we chain both requests */
-		r_src_to_ram->next_req[r_src_to_ram->next_req_count++]= r_ram_to_dst;
+		/* Try to reuse a request if possible */
+		r = _starpu_search_existing_data_request(hop_dst_replicate,
+				(mode & STARPU_R)?hop_src_node:hop_dst_node, mode);
 
-		_starpu_data_request_append_callback(r_ram_to_dst, callback_func, callback_arg);
+		reused_requests[hop] = !!r;
 
-		if (reuse_r_src_to_ram)
-			_starpu_spin_unlock(&r_src_to_ram->lock);
+		if (!r) {
+			/* Create a new request if there was no request to reuse */
+			r = _starpu_create_data_request(handle, hop_src_replicate,
+					hop_dst_replicate, hop_handling_node,
+					mode, ndeps);
+		}
 
-		_starpu_spin_unlock(&handle->header_lock);
+		requests[hop] = r; 
+	}
 
-		/* we only submit the first request, the remaining will be automatically submitted afterward */
-		if (!reuse_r_src_to_ram)
-			_starpu_post_data_request(r_src_to_ram, src_node);
+	/* Chain these requests */
+	for (hop = 0; hop < nhops; hop++)
+	{
+		starpu_data_request_t r;
+		r = requests[hop];
 
-		/* the application only waits for the termination of the last request */
-		r = r_ram_to_dst;
-	}
-	else {
-		/* who will perform that request ? */
-		uint32_t handling_node =
-			_starpu_select_node_to_handle_request(src_node, requesting_node);
+		if (hop != nhops - 1)
+		{
+			if (!reused_requests[hop + 1])
+				r->next_req[r->next_req_count++] = requests[hop + 1];
+		}
+		else
+			_starpu_data_request_append_callback(r, callback_func, callback_arg);
 
-		r = _starpu_create_data_request(handle, src_replicate,
-				dst_replicate, handling_node, mode, 0, is_prefetch);
 
-		_starpu_data_request_append_callback(r, callback_func, callback_arg);
+		if (reused_requests[hop])
+			_starpu_spin_unlock(&r->lock);
+	}
 
-		if (!is_prefetch)
-			r->refcnt++;
+	if (!is_prefetch)
+		requests[nhops - 1]->refcnt++;
 
-		_starpu_spin_unlock(&handle->header_lock);
 
-		_starpu_post_data_request(r, handling_node);
-	}
+	/* we only submit the first request, the remaining will be
+	 * automatically submitted afterward */
+	if (!reused_requests[0])
+		_starpu_post_data_request(requests[0], handling_nodes[0]);
 
-	return r;
+	return requests[nhops - 1];
 }
 
 int _starpu_fetch_data_on_node(starpu_data_handle handle, struct starpu_data_replicate_s *dst_replicate,
@@ -246,71 +393,23 @@ int _starpu_fetch_data_on_node(starpu_data_handle handle, struct starpu_data_rep
 	uint32_t local_node = _starpu_get_local_memory_node();
         _STARPU_LOG_IN();
 
-	unsigned requesting_node = dst_replicate->memory_node;
-
 	while (_starpu_spin_trylock(&handle->header_lock))
 		_starpu_datawizard_progress(local_node, 1);
 
 	if (!is_prefetch)
 		dst_replicate->refcnt++;
 
-	if (dst_replicate->state != STARPU_INVALID)
-	{
-		/* the data is already available so we can stop */
-		_starpu_update_data_state(handle, dst_replicate, mode);
-		_starpu_msi_cache_hit(requesting_node);
-		_starpu_spin_unlock(&handle->header_lock);
-
-		if (callback_func)
-			callback_func(callback_arg);
-
-                _STARPU_LOG_OUT_TAG("data available");
-		return 0;
-	}
-
-	/* the only remaining situation is that the local copy was invalid */
-	STARPU_ASSERT(dst_replicate->state == STARPU_INVALID);
-
-	_starpu_msi_cache_miss(requesting_node);
-
 	starpu_data_request_t r;
+	r = create_request_to_fetch_data(handle, dst_replicate, mode,
+					is_prefetch, callback_func, callback_arg);
 
-	/* is there already a pending request ? */
-	r = _starpu_search_existing_data_request(dst_replicate, mode);
-	/* at the exit of _starpu_search_existing_data_request the lock is taken if the request existed ! */
-
-	if (!r) {
-		r = create_new_request_to_fetch_data(handle, dst_replicate, mode, is_prefetch, callback_func, callback_arg);
-	}
-	else {
-		/* the lock was taken by _starpu_search_existing_data_request */
-		_starpu_data_request_append_callback(r, callback_func, callback_arg);
-
-		/* there is already a similar request */
-		if (is_prefetch)
-		{
-			_starpu_spin_unlock(&r->lock);
-			_starpu_spin_unlock(&handle->header_lock);
-
-                        _STARPU_LOG_OUT_TAG("similar request");
-                        return 0;
-		}
-
-		r->refcnt++;
-
-		//_starpu_spin_lock(&r->lock);
-		if (r->is_a_prefetch_request)
-		{
-			/* transform that prefetch request into a "normal" request */
-			r->is_a_prefetch_request = 0;
-
-			/* transform that request into the proper access mode (prefetch could be read only) */
-			r->mode |= mode;
-		}
-
-		_starpu_spin_unlock(&r->lock);
-		_starpu_spin_unlock(&handle->header_lock);
-	}
+	/* If no request was created, the handle was already up-to-date on the
+	 * node. In this case, create_request_to_fetch_data has already
+	 * unlocked the header. */
+	if (!r)
+		return 0;
+	
+	_starpu_spin_unlock(&handle->header_lock);
 
 	int ret = is_prefetch?0:_starpu_wait_data_request_completion(r, 1);
         _STARPU_LOG_OUT();
@@ -384,7 +483,10 @@ static void _starpu_set_data_requested_flag_if_needed(struct starpu_data_replica
 //	pthread_spin_lock(&handle->header_lock);
 
 	if (replicate->state == STARPU_INVALID) 
-		replicate->requested = 1;
+	{
+		unsigned dst_node = replicate->memory_node;
+		replicate->requested[dst_node] = 1;
+	}
 
 //	pthread_spin_unlock(&handle->header_lock);
 }
@@ -449,7 +551,7 @@ int _starpu_fetch_task_input(struct starpu_task *task, uint32_t mask)
 		if (STARPU_UNLIKELY(ret))
 			goto enomem;
 
-		task->interface[index] = local_replicate->data_interface;
+		task->interfaces[index] = local_replicate->data_interface;
 
 		if (mode & STARPU_REDUX)
 		{
@@ -531,9 +633,21 @@ unsigned _starpu_is_data_present_or_requested(starpu_data_handle handle, uint32_
 // XXX : this is just a hint, so we don't take the lock ...
 //	pthread_spin_lock(&handle->header_lock);
 
-	if (handle->per_node[node].state != STARPU_INVALID 
-		|| handle->per_node[node].requested || handle->per_node[node].request)
-		ret = 1;
+	if (handle->per_node[node].state != STARPU_INVALID)
+	{
+		ret  = 1;
+	}
+	else {
+		unsigned i;
+		unsigned nnodes = _starpu_get_memory_nodes_count();
+
+		for (i = 0; i < nnodes; i++)
+		{
+			if (handle->per_node[node].requested[i] || handle->per_node[node].request[i])
+				ret = 1;
+		}
+
+	}
 
 //	pthread_spin_unlock(&handle->header_lock);
 

+ 8 - 3
src/datawizard/coherency.h

@@ -74,8 +74,8 @@ LIST_TYPE(starpu_data_replicate,
 	   flag when it assigns a task to a queue, policies which do not
 	   use this hint can simply ignore it.
 	 */
-	uint8_t requested;
-	struct starpu_data_request_s *request;
+	uint8_t requested[STARPU_MAXNODES];
+	struct starpu_data_request_s *request[STARPU_MAXNODES];
 );
 
 struct starpu_data_requester_list_s;
@@ -189,6 +189,7 @@ struct starpu_data_state_t {
 
         /* Used for MPI */
         int rank;
+	int tag;
 };
 
 void _starpu_display_msi_stats(void);
@@ -218,9 +219,13 @@ unsigned _starpu_is_data_present_or_requested(struct starpu_data_state_t *state,
 unsigned starpu_data_test_if_allocated_on_node(starpu_data_handle handle, uint32_t memory_node);
 
 
-uint32_t _starpu_select_node_to_handle_request(uint32_t src_node, uint32_t dst_node);
 uint32_t _starpu_select_src_node(struct starpu_data_state_t *state);
 
+starpu_data_request_t create_request_to_fetch_data(starpu_data_handle handle,
+				struct starpu_data_replicate_s *dst_replicate,
+                                starpu_access_mode mode, unsigned is_prefetch,
+                                void (*callback_func)(void *), void *callback_arg);
+
 void _starpu_redux_init_data_replicate(starpu_data_handle handle, struct starpu_data_replicate_s *replicate, int workerid);
 void starpu_data_start_reduction_mode(starpu_data_handle handle);
 void starpu_data_end_reduction_mode(starpu_data_handle handle);

+ 68 - 41
src/datawizard/copy_driver.c

@@ -82,7 +82,7 @@ void starpu_wake_all_blocked_workers(void)
 static unsigned communication_cnt = 0;
 #endif
 
-static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_data_replicate_s *src_replicate, struct starpu_data_replicate_s *dst_replicate, struct starpu_data_request_s *req __attribute__((unused)))
+static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_data_replicate_s *src_replicate, struct starpu_data_replicate_s *dst_replicate, struct starpu_data_request_s *req STARPU_ATTRIBUTE_UNUSED)
 {
 	int ret = 0;
 
@@ -108,6 +108,15 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 	void *src_interface = src_replicate->data_interface;
 	void *dst_interface = dst_replicate->data_interface;
 
+#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER)
+	if ((src_kind == STARPU_CUDA_RAM) || (dst_kind == STARPU_CUDA_RAM))
+	{
+		int node = (dst_kind == STARPU_CUDA_RAM)?dst_node:src_node;
+		cures = cudaSetDevice(starpu_memory_node_to_devid(node));
+		STARPU_ASSERT(cures == cudaSuccess);
+	}
+#endif
+
 	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind)) {
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
 		/* STARPU_CPU_RAM -> STARPU_CPU_RAM */
@@ -116,29 +125,22 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 		break;
 #ifdef STARPU_USE_CUDA
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
-		/* CUBLAS_RAM -> STARPU_CPU_RAM */
-		/* only the proper CUBLAS thread can initiate this ! */
-		if (_starpu_get_local_memory_node() == src_node) {
-			/* only the proper CUBLAS thread can initiate this directly ! */
-			STARPU_ASSERT(copy_methods->cuda_to_ram);
-			if (!req || !copy_methods->cuda_to_ram_async) {
-				/* this is not associated to a request so it's synchronous */
-				copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
-			}
-			else {
-				cures = cudaEventCreate(&req->async_channel.cuda_event);
-				if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
-
-				stream = starpu_cuda_get_local_transfer_stream();
-				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
-
-				cures = cudaEventRecord(req->async_channel.cuda_event, stream);
-				if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
-			}
+		/* only the proper CUBLAS thread can initiate this directly ! */
+		STARPU_ASSERT(copy_methods->cuda_to_ram);
+		if (!req || !copy_methods->cuda_to_ram_async) {
+			/* this is not associated to a request so it's synchronous */
+			copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
 		}
 		else {
-			/* we should not have a blocking call ! */
-			STARPU_ABORT();
+			req->async_channel.type = STARPU_CUDA_RAM;
+			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
+			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
+
+			stream = starpu_cuda_get_local_transfer_stream();
+			ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
+
+			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
+			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 		}
 		break;
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
@@ -151,13 +153,35 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 			copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
 		}
 		else {
-			cures = cudaEventCreate(&req->async_channel.cuda_event);
-			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
+			req->async_channel.type = STARPU_CUDA_RAM;
+			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
+			if (STARPU_UNLIKELY(cures != cudaSuccess))
+				STARPU_CUDA_REPORT_ERROR(cures);
 
 			stream = starpu_cuda_get_local_stream();
 			ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
 
-			cures = cudaEventRecord(req->async_channel.cuda_event, stream);
+			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
+			if (STARPU_UNLIKELY(cures != cudaSuccess))
+				STARPU_CUDA_REPORT_ERROR(cures);
+		}
+		break;
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
+		/* CUDA - CUDA transfer */
+		STARPU_ASSERT(copy_methods->cuda_to_cuda || copy_methods->cuda_to_cuda_async);
+		if (!req || !copy_methods->cuda_to_cuda_async) {
+			/* this is not associated to a request so it's synchronous */
+			copy_methods->cuda_to_cuda(src_interface, src_node, dst_interface, dst_node);
+		}
+		else {
+			req->async_channel.type = STARPU_CUDA_RAM;
+			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
+			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
+
+			stream = starpu_cuda_get_local_stream();
+			ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
+
+			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 		}
 		break;
@@ -172,7 +196,8 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 				copy_methods->opencl_to_ram(src_interface, src_node, dst_interface, dst_node);
 			}
 			else {
-				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.opencl_event));
+				req->async_channel.type = STARPU_OPENCL_RAM;
+				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
 			}
 		}
 		else {
@@ -189,7 +214,8 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 			copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
 		}
 		else {
-			ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.opencl_event));
+			req->async_channel.type = STARPU_OPENCL_RAM;
+			ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
 		}
 		break;
 #endif
@@ -215,7 +241,7 @@ int __attribute__((warn_unused_result)) _starpu_driver_copy_data_1_to_1(starpu_d
 	}
 
 	int ret_alloc, ret_copy;
-	unsigned __attribute__((unused)) com_id = 0;
+	unsigned STARPU_ATTRIBUTE_UNUSED com_id = 0;
 
 	unsigned src_node = src_replicate->memory_node;
 	unsigned dst_node = dst_replicate->memory_node;
@@ -263,10 +289,9 @@ int __attribute__((warn_unused_result)) _starpu_driver_copy_data_1_to_1(starpu_d
 	return 0;
 }
 
-void _starpu_driver_wait_request_completion(starpu_async_channel *async_channel __attribute__ ((unused)),
-					unsigned handling_node)
+void _starpu_driver_wait_request_completion(struct starpu_async_channel *async_channel)
 {
-	starpu_node_kind kind = _starpu_get_node_kind(handling_node);
+	starpu_node_kind kind = async_channel->type;
 #ifdef STARPU_USE_CUDA
 	cudaEvent_t event;
 	cudaError_t cures;
@@ -275,7 +300,7 @@ void _starpu_driver_wait_request_completion(starpu_async_channel *async_channel
 	switch (kind) {
 #ifdef STARPU_USE_CUDA
 		case STARPU_CUDA_RAM:
-			event = (*async_channel).cuda_event;
+			event = (*async_channel).event.cuda_event;
 
 			cures = cudaEventSynchronize(event);
 			if (STARPU_UNLIKELY(cures))
@@ -290,10 +315,10 @@ void _starpu_driver_wait_request_completion(starpu_async_channel *async_channel
 #ifdef STARPU_USE_OPENCL
       case STARPU_OPENCL_RAM:
          {
-                 if ((*async_channel).opencl_event == NULL) STARPU_ABORT();
-                 cl_int err = clWaitForEvents(1, &((*async_channel).opencl_event));
+                 if ((*async_channel).event.opencl_event == NULL) STARPU_ABORT();
+                 cl_int err = clWaitForEvents(1, &((*async_channel).event.opencl_event));
                  if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-                 clReleaseEvent((*async_channel).opencl_event);
+                 clReleaseEvent((*async_channel).event.opencl_event);
          }
          break;
 #endif
@@ -303,10 +328,9 @@ void _starpu_driver_wait_request_completion(starpu_async_channel *async_channel
 	}
 }
 
-unsigned _starpu_driver_test_request_completion(starpu_async_channel *async_channel __attribute__ ((unused)),
-					unsigned handling_node)
+unsigned _starpu_driver_test_request_completion(struct starpu_async_channel *async_channel)
 {
-	starpu_node_kind kind = _starpu_get_node_kind(handling_node);
+	starpu_node_kind kind = async_channel->type;
 	unsigned success;
 #ifdef STARPU_USE_CUDA
 	cudaEvent_t event;
@@ -315,11 +339,14 @@ unsigned _starpu_driver_test_request_completion(starpu_async_channel *async_chan
 	switch (kind) {
 #ifdef STARPU_USE_CUDA
 		case STARPU_CUDA_RAM:
-			event = (*async_channel).cuda_event;
+			event = (*async_channel).event.cuda_event;
+			CUresult cures = cudaEventQuery(event);
 
-			success = (cudaEventQuery(event) == cudaSuccess);
+			success = (cures == cudaSuccess);
 			if (success)
 				cudaEventDestroy(event);
+			else if (cures != cudaErrorNotReady)
+				STARPU_CUDA_REPORT_ERROR(cures);
 
 			break;
 #endif
@@ -327,7 +354,7 @@ unsigned _starpu_driver_test_request_completion(starpu_async_channel *async_chan
       case STARPU_OPENCL_RAM:
          {
             cl_int event_status;
-            cl_event opencl_event = (*async_channel).opencl_event;
+            cl_event opencl_event = (*async_channel).event.opencl_event;
             if (opencl_event == NULL) STARPU_ABORT();
             cl_int err = clGetEventInfo(opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
             if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

+ 8 - 3
src/datawizard/copy_driver.h

@@ -46,7 +46,12 @@ typedef union {
 #ifdef STARPU_USE_OPENCL
         cl_event opencl_event;
 #endif
-} starpu_async_channel;
+} starpu_async_channel_event;
+
+struct starpu_async_channel { /* completion handle of an asynchronous copy, tagged with the kind of event it holds */
+	starpu_async_channel_event event; /* driver-specific event (union declared just above) */
+	starpu_node_kind type; /* node kind used by the wait/test completion routines to pick the member of `event` */
+};
 
 void _starpu_wake_all_blocked_workers_on_node(unsigned nodeid);
 
@@ -57,6 +62,6 @@ int _starpu_driver_copy_data_1_to_1(starpu_data_handle handle,
 					struct starpu_data_request_s *req,
 					unsigned may_alloc);
 
-unsigned _starpu_driver_test_request_completion(starpu_async_channel *async_channel, unsigned handling_node);
-void _starpu_driver_wait_request_completion(starpu_async_channel *async_channel, unsigned handling_node);
+unsigned _starpu_driver_test_request_completion(struct starpu_async_channel *async_channel);
+void _starpu_driver_wait_request_completion(struct starpu_async_channel *async_channel);
 #endif // __COPY_DRIVER_H__

+ 27 - 36
src/datawizard/data_request.c

@@ -62,8 +62,22 @@ void _starpu_deinit_data_request_lists(void)
 /* this should be called with the lock r->handle->header_lock taken */
 static void starpu_data_request_destroy(starpu_data_request_t r)
 {
-	STARPU_ASSERT(r->dst_replicate->request == r);
-	r->dst_replicate->request = NULL;
+	unsigned node;
+
+	/* If this is a write only request, then there is no source and we use
+	 * the destination node to cache the request. Otherwise we store the
+	 * pending requests between src and dst. */
+	if (r->mode & STARPU_R)
+	{
+		node = r->src_replicate->memory_node;
+	}
+	else {
+		node = r->dst_replicate->memory_node;
+	}
+
+	STARPU_ASSERT(r->dst_replicate->request[node] == r);
+	r->dst_replicate->request[node] = NULL;
+	//fprintf(stderr, "DESTROY REQ %p (%d) refcnt %d\n", r, node, r->refcnt);
 	starpu_data_request_delete(r);
 }
 
@@ -73,8 +87,7 @@ starpu_data_request_t _starpu_create_data_request(starpu_data_handle handle,
 				struct starpu_data_replicate_s *dst_replicate,
 				uint32_t handling_node,
 				starpu_access_mode mode,
-				unsigned ndeps,
-				unsigned is_prefetch)
+				unsigned ndeps)
 {
 	starpu_data_request_t r = starpu_data_request_new();
 
@@ -90,15 +103,21 @@ starpu_data_request_t _starpu_create_data_request(starpu_data_handle handle,
 	r->ndeps = ndeps;
 	r->next_req_count = 0;
 	r->callbacks = NULL;
-	r->is_a_prefetch_request = is_prefetch;
 
 	_starpu_spin_lock(&r->lock);
 
-	dst_replicate->request = r;
 	dst_replicate->refcnt++;
 
 	if (mode & STARPU_R)
+	{
+		unsigned src_node = src_replicate->memory_node;
+		dst_replicate->request[src_node] = r;
 		src_replicate->refcnt++;
+	}
+	else {
+		unsigned dst_node = dst_replicate->memory_node;
+		dst_replicate->request[dst_node] = r;
+	}
 
 	r->refcnt = 1;
 
@@ -107,34 +126,6 @@ starpu_data_request_t _starpu_create_data_request(starpu_data_handle handle,
 	return r;
 }
 
-/* handle->lock should be taken. r is returned locked */
-starpu_data_request_t _starpu_search_existing_data_request(struct starpu_data_replicate_s *replicate, starpu_access_mode mode)
-{
-	starpu_data_request_t r = replicate->request;
-
-	if (r)
-	{
-		_starpu_spin_lock(&r->lock);
-
-		/* perhaps we need to "upgrade" the request */
-		if (mode & STARPU_R)
-		{
-			/* in case the exisiting request did not imply a memory
-			 * transfer yet, we have to increment the refcnt now
-			 * (so that the source remains valid) */
-			if (!(r->mode & STARPU_R))
-				replicate->refcnt++;
-
-			r->mode |= STARPU_R;
-		}
-
-		if (mode & STARPU_W)
-			r->mode |= STARPU_W;
-	}
-
-	return r;
-}
-
 int _starpu_wait_data_request_completion(starpu_data_request_t r, unsigned may_alloc)
 {
 	int retval;
@@ -421,11 +412,11 @@ static void _handle_pending_node_data_requests(uint32_t src_node, unsigned force
 		/* wait until the transfer is terminated */
 		if (force)
 		{
-			_starpu_driver_wait_request_completion(&r->async_channel, src_node);
+			_starpu_driver_wait_request_completion(&r->async_channel);
 			starpu_handle_data_request_completion(r);
 		}
 		else {
-			if (_starpu_driver_test_request_completion(&r->async_channel, src_node))
+			if (_starpu_driver_test_request_completion(&r->async_channel))
 			{
 				/* The request was completed */
 				starpu_handle_data_request_completion(r);

+ 2 - 6
src/datawizard/data_request.h

@@ -44,7 +44,7 @@ LIST_TYPE(starpu_data_request,
 
 	starpu_access_mode mode;
 
-	starpu_async_channel async_channel;
+	struct starpu_async_channel async_channel;
 
 	unsigned completed;
 	int retval;
@@ -60,8 +60,6 @@ LIST_TYPE(starpu_data_request,
 
 	struct callback_list *callbacks;
 
-	unsigned is_a_prefetch_request;
-
 #ifdef STARPU_USE_FXT
 	unsigned com_id;
 #endif
@@ -102,10 +100,8 @@ starpu_data_request_t _starpu_create_data_request(starpu_data_handle handle,
 				struct starpu_data_replicate_s *dst_replicate,
 				uint32_t handling_node,
 				starpu_access_mode mode,
-				unsigned ndeps,
-				unsigned is_prefetch);
+				unsigned ndeps);
 
-starpu_data_request_t _starpu_search_existing_data_request(struct starpu_data_replicate_s *replicate, starpu_access_mode mode);
 int _starpu_wait_data_request_completion(starpu_data_request_t r, unsigned may_alloc);
 
 void _starpu_data_request_append_callback(starpu_data_request_t r,

+ 40 - 12
src/datawizard/filters.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
@@ -41,11 +41,9 @@ static void map_filter(starpu_data_handle root_handle, struct starpu_data_filter
 		}
 	}
 }
-void starpu_data_map_filters(starpu_data_handle root_handle, unsigned nfilters, ...)
+void starpu_data_vmap_filters(starpu_data_handle root_handle, unsigned nfilters, va_list pa)
 {
 	unsigned i;
-	va_list pa;
-	va_start(pa, nfilters);
 	for (i = 0; i < nfilters; i++)
 	{
 		struct starpu_data_filter *next_filter;
@@ -55,6 +53,13 @@ void starpu_data_map_filters(starpu_data_handle root_handle, unsigned nfilters,
 
 		map_filter(root_handle, next_filter);
 	}
+}
+
+void starpu_data_map_filters(starpu_data_handle root_handle, unsigned nfilters, ...)
+{
+	va_list pa;
+	va_start(pa, nfilters);
+	starpu_data_vmap_filters(root_handle, nfilters, pa);
 	va_end(pa);
 }
 
@@ -75,22 +80,30 @@ starpu_data_handle starpu_data_get_child(starpu_data_handle handle, unsigned i)
  */
 starpu_data_handle starpu_data_get_sub_data(starpu_data_handle root_handle, unsigned depth, ... )
 {
+	va_list pa;
+	va_start(pa, depth);
+	starpu_data_handle handle = starpu_data_vget_sub_data(root_handle, depth, pa);
+	va_end(pa);
+
+	return handle;
+}
+
+starpu_data_handle starpu_data_vget_sub_data(starpu_data_handle root_handle, unsigned depth, va_list pa )
+{
 	STARPU_ASSERT(root_handle);
 	starpu_data_handle current_handle = root_handle;
 
 	/* the variable number of argument must correlate the depth in the tree */
 	unsigned i; 
-	va_list pa;
-	va_start(pa, depth);
 	for (i = 0; i < depth; i++)
 	{
 		unsigned next_child;
 		next_child = va_arg(pa, unsigned);
+
 		STARPU_ASSERT(next_child < current_handle->nchildren);
 
 		current_handle = &current_handle->children[next_child];
 	}
-	va_end(pa);
 
 	return current_handle;
 }
@@ -142,6 +155,7 @@ void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data
 		child->req_list = starpu_data_requester_list_new();
 		child->reduction_req_list = starpu_data_requester_list_new();
 		child->refcnt = 0;
+		child->reduction_refcnt = 0;
 		_starpu_spin_init(&child->header_lock);
 
 		child->sequential_consistency = initial_handle->sequential_consistency;
@@ -158,9 +172,6 @@ void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data
 		child->redux_cl = initial_handle->redux_cl;
 		child->init_cl = initial_handle->init_cl;
 
-		child->reduction_refcnt = 0;
-		child->reduction_req_list = starpu_data_requester_list_new();
-
 #ifdef STARPU_USE_FXT
 		child->last_submitted_ghost_writer_id_is_valid = 0;
 		child->last_submitted_ghost_writer_id = 0;
@@ -201,8 +212,13 @@ void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data
 			child_replicate->automatically_allocated = 0;
 			child_replicate->refcnt = 0;
 			child_replicate->memory_node = starpu_worker_get_memory_node(worker);
-			child_replicate->requested = 0;
-			child_replicate->request = NULL;
+
+			for (node = 0; node < STARPU_MAXNODES; node++)
+			{
+				child_replicate->requested[node] = 0;
+				child_replicate->request[node] = NULL;
+			}
+
 			child_replicate->relaxed_coherency = 1;
 			child_replicate->initialized = 0;
 
@@ -214,6 +230,13 @@ void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data
 		 * store it in the handle */
 		child->data_size = child->ops->get_size(child);
 		child->footprint = _starpu_compute_data_footprint(child);
+
+		void *ptr;
+		ptr = starpu_handle_to_pointer(child, 0);
+		if (ptr != NULL)
+		{
+			_starpu_data_register_ram_pointer(child, ptr);
+		}
 	}
 	/* now let the header */
 	_starpu_spin_unlock(&initial_handle->header_lock);
@@ -244,6 +267,8 @@ void starpu_data_unpartition(starpu_data_handle root_handle, uint32_t gathering_
 		STARPU_ASSERT(ret == 0); 
 
 		_starpu_data_free_interfaces(&root_handle->children[child]);
+		starpu_data_requester_list_delete(child_handle->req_list);
+		starpu_data_requester_list_delete(child_handle->reduction_req_list);
 	}
 
 	/* the gathering_node should now have a valid copy of all the children.
@@ -279,7 +304,9 @@ void starpu_data_unpartition(starpu_data_handle root_handle, uint32_t gathering_
 				_starpu_request_mem_chunk_removal(root_handle, node);
 				isvalid = 0; 
 			}
+#ifdef STARPU_DEVEL
 #warning free the data replicate if needed
+#endif
 
 		}
 
@@ -300,6 +327,7 @@ void starpu_data_unpartition(starpu_data_handle root_handle, uint32_t gathering_
 	}
 
 	/* there is no child anymore */
+	//free(root_handle->children);
 	root_handle->nchildren = 0;
 
 	/* now the parent may be used again so we release the lock */

+ 6 - 1
src/datawizard/footprint.c

@@ -18,8 +18,11 @@
 #include <datawizard/footprint.h>
 #include <common/hash.h>
 
-void _starpu_compute_buffers_footprint(starpu_job_t j)
+uint32_t _starpu_compute_buffers_footprint(starpu_job_t j)
 {
+	if (j->footprint_is_computed)
+		return j->footprint;
+
 	uint32_t footprint = 0;
 	unsigned buffer;
 
@@ -36,6 +39,8 @@ void _starpu_compute_buffers_footprint(starpu_job_t j)
 
 	j->footprint = footprint;
 	j->footprint_is_computed = 1;
+
+	return footprint;
 }
 
 inline uint32_t _starpu_compute_data_footprint(starpu_data_handle handle)

+ 1 - 1
src/datawizard/footprint.h

@@ -24,7 +24,7 @@
 
 /* Compute the footprint that characterizes the job and cache it into the job
  * structure. */
-void _starpu_compute_buffers_footprint(struct starpu_job_s *j);
+uint32_t _starpu_compute_buffers_footprint(struct starpu_job_s *j);
 
 /* Compute the footprint that characterizes the layout of the data handle. */
 uint32_t _starpu_compute_data_footprint(starpu_data_handle handle);

+ 2 - 2
src/datawizard/interfaces/bcsr_filters.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,7 +20,7 @@
 #include <common/config.h>
 #include <datawizard/filters.h>
 
-void starpu_canonical_block_filter_bcsr(void *father_interface, void *child_interface, __attribute__((unused)) struct starpu_data_filter *f, unsigned id, __attribute__((unused)) unsigned nparts)
+void starpu_canonical_block_filter_bcsr(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, STARPU_ATTRIBUTE_UNUSED unsigned nparts)
 {
 	struct starpu_bcsr_interface_s *bcsr_father = father_interface;
 	/* each chunk becomes a small dense matrix */

+ 55 - 55
src/datawizard/interfaces/bcsr_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -31,15 +31,15 @@
  * BCSR : blocked CSR, we use blocks of size (r x c)
  */
 
-static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
+static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 #ifdef STARPU_USE_CUDA
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 #endif
 #ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
+static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
+static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 #endif
 
 static const struct starpu_data_copy_methods bcsr_copy_data_methods_s = {
@@ -60,11 +60,11 @@ static const struct starpu_data_copy_methods bcsr_copy_data_methods_s = {
 	.spu_to_spu = NULL
 };
 
-static void register_bcsr_handle(starpu_data_handle handle, uint32_t home_node, void *interface);
-static ssize_t allocate_bcsr_buffer_on_node(void *interface, uint32_t dst_node);
-static void free_bcsr_buffer_on_node(void *interface, uint32_t node);
+static void register_bcsr_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
+static ssize_t allocate_bcsr_buffer_on_node(void *data_interface, uint32_t dst_node);
+static void free_bcsr_buffer_on_node(void *data_interface, uint32_t node);
 static size_t bcsr_interface_get_size(starpu_data_handle handle);
-static int bcsr_compare(void *interface_a, void *interface_b);
+static int bcsr_compare(void *data_interface_a, void *data_interface_b);
 static uint32_t footprint_bcsr_interface_crc32(starpu_data_handle handle);
 
 
@@ -80,9 +80,9 @@ static struct starpu_data_interface_ops_t interface_bcsr_ops = {
 	.compare = bcsr_compare
 };
 
-static void register_bcsr_handle(starpu_data_handle handle, uint32_t home_node, void *interface)
+static void register_bcsr_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
 {
-	starpu_bcsr_interface_t *bcsr_interface = interface;
+	starpu_bcsr_interface_t *bcsr_interface = data_interface;
 
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
@@ -115,7 +115,7 @@ void starpu_bcsr_data_register(starpu_data_handle *handleptr, uint32_t home_node
 		uint32_t *rowptr, uint32_t firstentry,
 		uint32_t r, uint32_t c, size_t elemsize)
 {
-	starpu_bcsr_interface_t interface = {
+	starpu_bcsr_interface_t bcsr_interface = {
 		.nzval = nzval,
 		.colind = colind,
 		.rowptr = rowptr,
@@ -127,7 +127,7 @@ void starpu_bcsr_data_register(starpu_data_handle *handleptr, uint32_t home_node
 		.elemsize = elemsize
 	};
 
-	starpu_data_register(handleptr, home_node, &interface, &interface_bcsr_ops);
+	starpu_data_register(handleptr, home_node, &bcsr_interface, &interface_bcsr_ops);
 }
 
 static uint32_t footprint_bcsr_interface_crc32(starpu_data_handle handle)
@@ -141,10 +141,10 @@ static uint32_t footprint_bcsr_interface_crc32(starpu_data_handle handle)
 	return hash;
 }
 
-static int bcsr_compare(void *interface_a, void *interface_b)
+static int bcsr_compare(void *data_interface_a, void *data_interface_b)
 {
-	starpu_bcsr_interface_t *bcsr_a = interface_a;
-	starpu_bcsr_interface_t *bcsr_b = interface_b;
+	starpu_bcsr_interface_t *bcsr_a = data_interface_a;
+	starpu_bcsr_interface_t *bcsr_b = data_interface_b;
 
 	/* Two matricess are considered compatible if they have the same size */
 	return ((bcsr_a->nnz == bcsr_b->nnz)
@@ -157,50 +157,50 @@ static int bcsr_compare(void *interface_a, void *interface_b)
 /* offer an access to the data parameters */
 uint32_t starpu_bcsr_get_nnz(starpu_data_handle handle)
 {
-	starpu_bcsr_interface_t *interface =
+	starpu_bcsr_interface_t *data_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
-	return interface->nnz;
+	return data_interface->nnz;
 }
 
 uint32_t starpu_bcsr_get_nrow(starpu_data_handle handle)
 {
-	starpu_bcsr_interface_t *interface =
+	starpu_bcsr_interface_t *data_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
-	return interface->nrow;
+	return data_interface->nrow;
 }
 
 uint32_t starpu_bcsr_get_firstentry(starpu_data_handle handle)
 {
-	starpu_bcsr_interface_t *interface =
+	starpu_bcsr_interface_t *data_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
-	return interface->firstentry;
+	return data_interface->firstentry;
 }
 
 uint32_t starpu_bcsr_get_r(starpu_data_handle handle)
 {
-	starpu_bcsr_interface_t *interface =
+	starpu_bcsr_interface_t *data_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
-	return interface->r;
+	return data_interface->r;
 }
 
 uint32_t starpu_bcsr_get_c(starpu_data_handle handle)
 {
-	starpu_bcsr_interface_t *interface =
+	starpu_bcsr_interface_t *data_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
-	return interface->c;
+	return data_interface->c;
 }
 
 size_t starpu_bcsr_get_elemsize(starpu_data_handle handle)
 {
-	starpu_bcsr_interface_t *interface =
+	starpu_bcsr_interface_t *data_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
-	return interface->elemsize;
+	return data_interface->elemsize;
 }
 
 uintptr_t starpu_bcsr_get_local_nzval(starpu_data_handle handle)
@@ -210,28 +210,28 @@ uintptr_t starpu_bcsr_get_local_nzval(starpu_data_handle handle)
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_bcsr_interface_t *interface =
+	starpu_bcsr_interface_t *data_interface =
 		starpu_data_get_interface_on_node(handle, node);
 	
-	return interface->nzval;
+	return data_interface->nzval;
 }
 
 uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle handle)
 {
 	/* XXX 0 */
-	starpu_bcsr_interface_t *interface =
+	starpu_bcsr_interface_t *data_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
-	return interface->colind;
+	return data_interface->colind;
 }
 
 uint32_t *starpu_bcsr_get_local_rowptr(starpu_data_handle handle)
 {
 	/* XXX 0 */
-	starpu_bcsr_interface_t *interface =
+	starpu_bcsr_interface_t *data_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
-	return interface->rowptr;
+	return data_interface->rowptr;
 }
 
 
@@ -254,21 +254,21 @@ static size_t bcsr_interface_get_size(starpu_data_handle handle)
 /* memory allocation/deallocation primitives for the BLAS interface */
 
 /* returns the size of the allocated area */
-static ssize_t allocate_bcsr_buffer_on_node(void *interface_, uint32_t dst_node)
+static ssize_t allocate_bcsr_buffer_on_node(void *data_interface_, uint32_t dst_node)
 {
 	uintptr_t addr_nzval;
 	uint32_t *addr_colind, *addr_rowptr;
 	ssize_t allocated_memory;
 
 	/* we need the 3 arrays to be allocated */
-	starpu_bcsr_interface_t *interface = interface_;
+	starpu_bcsr_interface_t *bcsr_interface = data_interface_;
 
-	uint32_t nnz = interface->nnz;
-	uint32_t nrow = interface->nrow;
-	size_t elemsize = interface->elemsize;
+	uint32_t nnz = bcsr_interface->nnz;
+	uint32_t nrow = bcsr_interface->nrow;
+	size_t elemsize = bcsr_interface->elemsize;
 
-	uint32_t r = interface->r;
-	uint32_t c = interface->c;
+	uint32_t r = bcsr_interface->r;
+	uint32_t c = bcsr_interface->c;
 
 	starpu_node_kind kind = _starpu_get_node_kind(dst_node);
 
@@ -333,9 +333,9 @@ static ssize_t allocate_bcsr_buffer_on_node(void *interface_, uint32_t dst_node)
 		nnz*r*c*elemsize + nnz*sizeof(uint32_t) + (nrow+1)*sizeof(uint32_t);
 
 	/* update the data properly in consequence */
-	interface->nzval = addr_nzval;
-	interface->colind = addr_colind;
-	interface->rowptr = addr_rowptr;
+	bcsr_interface->nzval = addr_nzval;
+	bcsr_interface->colind = addr_colind;
+	bcsr_interface->rowptr = addr_rowptr;
 	
 	return allocated_memory;
 
@@ -381,9 +381,9 @@ fail_nzval:
 	return -ENOMEM;
 }
 
-static void free_bcsr_buffer_on_node(void *interface, uint32_t node)
+static void free_bcsr_buffer_on_node(void *data_interface, uint32_t node)
 {
-	starpu_bcsr_interface_t *bcsr_interface = interface;	
+	starpu_bcsr_interface_t *bcsr_interface = data_interface;
 
 	starpu_node_kind kind = _starpu_get_node_kind(node);
 	switch(kind) {
@@ -412,7 +412,7 @@ static void free_bcsr_buffer_on_node(void *interface, uint32_t node)
 }
 
 #ifdef STARPU_USE_CUDA
-static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind)
+static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
 {
 	starpu_bcsr_interface_t *src_bcsr = src_interface;
 	starpu_bcsr_interface_t *dst_bcsr = dst_interface;
@@ -443,24 +443,24 @@ static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__
 	return 0;
 }
 
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
 }
 
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
 }
 
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
 }
 #endif // STARPU_USE_CUDA
 
 #ifdef STARPU_USE_OPENCL
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 	starpu_bcsr_interface_t *src_bcsr = src_interface;
 	starpu_bcsr_interface_t *dst_bcsr = dst_interface;
@@ -491,7 +491,7 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute
 	return 0;
 }
 
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 	starpu_bcsr_interface_t *src_bcsr = src_interface;
 	starpu_bcsr_interface_t *dst_bcsr = dst_interface;
@@ -524,7 +524,7 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute
 #endif // STARPU_USE_OPENCL
 
 /* as not all platform easily have a BLAS lib installed ... */
-static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 	starpu_bcsr_interface_t *src_bcsr = src_interface;
 	starpu_bcsr_interface_t *dst_bcsr = dst_interface;

+ 2 - 2
src/datawizard/interfaces/block_filters.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -18,7 +18,7 @@
 #include <common/config.h>
 #include <datawizard/filters.h>
 
-void starpu_block_filter_func_block(void *father_interface, void *child_interface, __attribute__((unused)) struct starpu_data_filter *f,
+void starpu_block_filter_func_block(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f,
                                     unsigned id, unsigned nparts)
 {
         starpu_block_interface_t *block_father = father_interface;

+ 72 - 60
src/datawizard/interfaces/block_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -27,19 +27,19 @@
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 
-static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
+static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 #ifdef STARPU_USE_CUDA
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 #endif
 #ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event);
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event);
+static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
+static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event);
+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event);
 #endif
 
 static const struct starpu_data_copy_methods block_copy_data_methods_s = {
@@ -65,20 +65,22 @@ static const struct starpu_data_copy_methods block_copy_data_methods_s = {
 };
 
 
-static void register_block_handle(starpu_data_handle handle, uint32_t home_node, void *interface);
-static ssize_t allocate_block_buffer_on_node(void *interface_, uint32_t dst_node);
-static void free_block_buffer_on_node(void *interface, uint32_t node);
+static void register_block_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
+static void *block_handle_to_pointer(starpu_data_handle data_handle, uint32_t node);
+static ssize_t allocate_block_buffer_on_node(void *data_interface_, uint32_t dst_node);
+static void free_block_buffer_on_node(void *data_interface, uint32_t node);
 static size_t block_interface_get_size(starpu_data_handle handle);
 static uint32_t footprint_block_interface_crc32(starpu_data_handle handle);
-static int block_compare(void *interface_a, void *interface_b);
+static int block_compare(void *data_interface_a, void *data_interface_b);
 static void display_block_interface(starpu_data_handle handle, FILE *f);
 #ifdef STARPU_USE_GORDON
-static int convert_block_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss);
+static int convert_block_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss);
 #endif
 
 static struct starpu_data_interface_ops_t interface_block_ops = {
 	.register_data_handle = register_block_handle,
 	.allocate_data_on_node = allocate_block_buffer_on_node,
+	.handle_to_pointer = block_handle_to_pointer,
 	.free_data_on_node = free_block_buffer_on_node,
 	.copy_methods = &block_copy_data_methods_s,
 	.get_size = block_interface_get_size,
@@ -93,7 +95,7 @@ static struct starpu_data_interface_ops_t interface_block_ops = {
 };
 
 #ifdef STARPU_USE_GORDON
-int convert_block_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss) 
+int convert_block_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss) 
 {
 	/* TODO */
 	STARPU_ABORT();
@@ -102,9 +104,19 @@ int convert_block_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t
 }
 #endif
 
-static void register_block_handle(starpu_data_handle handle, uint32_t home_node, void *interface)
+static void *block_handle_to_pointer(starpu_data_handle handle, uint32_t node)
 {
-	starpu_block_interface_t *block_interface = interface;
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	starpu_block_interface_t *block_interface =
+		starpu_data_get_interface_on_node(handle, node);
+
+	return (void*) block_interface->ptr;
+}
+
+static void register_block_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
+{
+	starpu_block_interface_t *block_interface = data_interface;
 
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
@@ -139,7 +151,7 @@ void starpu_block_data_register(starpu_data_handle *handleptr, uint32_t home_nod
 			uintptr_t ptr, uint32_t ldy, uint32_t ldz, uint32_t nx,
 			uint32_t ny, uint32_t nz, size_t elemsize)
 {
-	starpu_block_interface_t interface = {
+	starpu_block_interface_t block_interface = {
 		.ptr = ptr,
                 .dev_handle = ptr,
                 .offset = 0,
@@ -151,7 +163,7 @@ void starpu_block_data_register(starpu_data_handle *handleptr, uint32_t home_nod
 		.elemsize = elemsize
 	};
 
-	starpu_data_register(handleptr, home_node, &interface, &interface_block_ops);
+	starpu_data_register(handleptr, home_node, &block_interface, &interface_block_ops);
 }
 
 static uint32_t footprint_block_interface_crc32(starpu_data_handle handle)
@@ -165,10 +177,10 @@ static uint32_t footprint_block_interface_crc32(starpu_data_handle handle)
 	return hash;
 }
 
-static int block_compare(void *interface_a, void *interface_b)
+static int block_compare(void *data_interface_a, void *data_interface_b)
 {
-	starpu_block_interface_t *block_a = interface_a;
-	starpu_block_interface_t *block_b = interface_b;
+	starpu_block_interface_t *block_a = data_interface_a;
+	starpu_block_interface_t *block_b = data_interface_b;
 
 	/* Two matrices are considered compatible if they have the same size */
 	return ((block_a->nx == block_b->nx)
@@ -179,21 +191,21 @@ static int block_compare(void *interface_a, void *interface_b)
 
 static void display_block_interface(starpu_data_handle handle, FILE *f)
 {
-	starpu_block_interface_t *interface;
+	starpu_block_interface_t *block_interface;
 
-	interface = starpu_data_get_interface_on_node(handle, 0);
+	block_interface = starpu_data_get_interface_on_node(handle, 0);
 
-	fprintf(f, "%u\t%u\t%u\t", interface->nx, interface->ny, interface->nz);
+	fprintf(f, "%u\t%u\t%u\t", block_interface->nx, block_interface->ny, block_interface->nz);
 }
 
 static size_t block_interface_get_size(starpu_data_handle handle)
 {
 	size_t size;
-	starpu_block_interface_t *interface;
+	starpu_block_interface_t *block_interface;
 
-	interface = starpu_data_get_interface_on_node(handle, 0);
+	block_interface = starpu_data_get_interface_on_node(handle, 0);
 
-	size = interface->nx*interface->ny*interface->nz*interface->elemsize; 
+	size = block_interface->nx*block_interface->ny*block_interface->nz*block_interface->elemsize; 
 
 	return size;
 }
@@ -201,26 +213,26 @@ static size_t block_interface_get_size(starpu_data_handle handle)
 /* offer an access to the data parameters */
 uint32_t starpu_block_get_nx(starpu_data_handle handle)
 {
-	starpu_block_interface_t *interface =
+	starpu_block_interface_t *block_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
-	return interface->nx;
+	return block_interface->nx;
 }
 
 uint32_t starpu_block_get_ny(starpu_data_handle handle)
 {
-	starpu_block_interface_t *interface =
+	starpu_block_interface_t *block_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
-	return interface->ny;
+	return block_interface->ny;
 }
 
 uint32_t starpu_block_get_nz(starpu_data_handle handle)
 {
-	starpu_block_interface_t *interface =
+	starpu_block_interface_t *block_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
-	return interface->nz;
+	return block_interface->nz;
 }
 
 uint32_t starpu_block_get_local_ldy(starpu_data_handle handle)
@@ -230,10 +242,10 @@ uint32_t starpu_block_get_local_ldy(starpu_data_handle handle)
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 	
-	starpu_block_interface_t *interface =
+	starpu_block_interface_t *block_interface =
 		starpu_data_get_interface_on_node(handle, node);
 
-	return interface->ldy;
+	return block_interface->ldy;
 }
 
 uint32_t starpu_block_get_local_ldz(starpu_data_handle handle)
@@ -243,10 +255,10 @@ uint32_t starpu_block_get_local_ldz(starpu_data_handle handle)
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_block_interface_t *interface =
+	starpu_block_interface_t *block_interface =
 		starpu_data_get_interface_on_node(handle, node);
 
-	return interface->ldz;
+	return block_interface->ldz;
 }
 
 uintptr_t starpu_block_get_local_ptr(starpu_data_handle handle)
@@ -256,25 +268,25 @@ uintptr_t starpu_block_get_local_ptr(starpu_data_handle handle)
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_block_interface_t *interface =
+	starpu_block_interface_t *block_interface =
 		starpu_data_get_interface_on_node(handle, node);
 
-	return interface->ptr;
+	return block_interface->ptr;
 }
 
 size_t starpu_block_get_elemsize(starpu_data_handle handle)
 {
-	starpu_block_interface_t *interface =
+	starpu_block_interface_t *block_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
-	return interface->elemsize;
+	return block_interface->elemsize;
 }
 
 
 /* memory allocation/deallocation primitives for the BLOCK interface */
 
 /* returns the size of the allocated area */
-static ssize_t allocate_block_buffer_on_node(void *interface_, uint32_t dst_node)
+static ssize_t allocate_block_buffer_on_node(void *data_interface_, uint32_t dst_node)
 {
 	uintptr_t addr = 0;
 	unsigned fail = 0;
@@ -283,7 +295,7 @@ static ssize_t allocate_block_buffer_on_node(void *interface_, uint32_t dst_node
 #ifdef STARPU_USE_CUDA
 	cudaError_t status;
 #endif
-	starpu_block_interface_t *dst_block = interface_;
+	starpu_block_interface_t *dst_block = data_interface_;
 
 	uint32_t nx = dst_block->nx;
 	uint32_t ny = dst_block->ny;
@@ -350,9 +362,9 @@ static ssize_t allocate_block_buffer_on_node(void *interface_, uint32_t dst_node
 	return allocated_memory;
 }
 
-static void free_block_buffer_on_node(void *interface, uint32_t node)
+static void free_block_buffer_on_node(void *data_interface, uint32_t node)
 {
-	starpu_block_interface_t *block_interface = interface;
+	starpu_block_interface_t *block_interface = data_interface;
 
 #ifdef STARPU_USE_CUDA
 	cudaError_t status;
@@ -382,7 +394,7 @@ static void free_block_buffer_on_node(void *interface, uint32_t node)
 }
 
 #ifdef STARPU_USE_CUDA
-static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind)
+static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
 {
 	starpu_block_interface_t *src_block = src_interface;
 	starpu_block_interface_t *dst_block = dst_interface;
@@ -435,7 +447,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__
 	return 0;
 }
 
-static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream, enum cudaMemcpyKind kind)
+static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream, enum cudaMemcpyKind kind)
 {
 	starpu_block_interface_t *src_block = src_interface;
 	starpu_block_interface_t *dst_block = dst_interface;
@@ -547,29 +559,29 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_in
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
 }
 
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
 }
 
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
 }
 
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
 {
 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
 }
 
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
 {
 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
 }
 #endif // STARPU_USE_CUDA
 
 #ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event)
+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
 {
 	starpu_block_interface_t *src_block = src_interface;
 	starpu_block_interface_t *dst_block = dst_interface;
@@ -636,7 +648,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node __att
 	return ret;
 }
 
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event)
+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
 {
 	starpu_block_interface_t *src_block = src_interface;
 	starpu_block_interface_t *dst_block = dst_interface;
@@ -695,12 +707,12 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node __att
 	return ret;
 }
 
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
         return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
 }
 
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
         return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
 }
@@ -708,7 +720,7 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute
 #endif
 
 /* as not all platforms easily have a BLAS lib installed ... */
-static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 	starpu_block_interface_t *src_block = src_interface;
 	starpu_block_interface_t *dst_block = dst_interface;

+ 2 - 2
src/datawizard/interfaces/csr_filters.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,7 +20,7 @@
 #include <common/config.h>
 #include <datawizard/filters.h>
 
-void starpu_vertical_block_filter_func_csr(void *father_interface, void *child_interface, __attribute__((unused)) struct starpu_data_filter *f, unsigned id, unsigned nchunks)
+void starpu_vertical_block_filter_func_csr(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, unsigned nchunks)
 {
 	starpu_csr_interface_t *csr_father = father_interface;
 	starpu_csr_interface_t *csr_child = child_interface;

+ 240 - 50
src/datawizard/interfaces/csr_interface.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -28,15 +28,18 @@
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 
-static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
+static int copy_ram_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 #ifdef STARPU_USE_CUDA
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
 #endif
 #ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
+static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 #endif
 
 static const struct starpu_data_copy_methods csr_copy_data_methods_s = {
@@ -46,6 +49,9 @@ static const struct starpu_data_copy_methods csr_copy_data_methods_s = {
 	.ram_to_cuda = copy_ram_to_cuda,
 	.cuda_to_ram = copy_cuda_to_ram,
 	.cuda_to_cuda = copy_cuda_to_cuda,
+	.ram_to_cuda_async = copy_ram_to_cuda_async,
+	.cuda_to_ram_async = copy_cuda_to_ram_async,
+	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
@@ -57,11 +63,11 @@ static const struct starpu_data_copy_methods csr_copy_data_methods_s = {
 	.spu_to_spu = NULL
 };
 
-static void register_csr_handle(starpu_data_handle handle, uint32_t home_node, void *interface);
-static ssize_t allocate_csr_buffer_on_node(void *interface_, uint32_t dst_node);
-static void free_csr_buffer_on_node(void *interface, uint32_t node);
+static void register_csr_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
+static ssize_t allocate_csr_buffer_on_node(void *data_interface_, uint32_t dst_node);
+static void free_csr_buffer_on_node(void *data_interface, uint32_t node);
 static size_t csr_interface_get_size(starpu_data_handle handle);
-static int csr_compare(void *interface_a, void *interface_b);
+static int csr_compare(void *data_interface_a, void *data_interface_b);
 static uint32_t footprint_csr_interface_crc32(starpu_data_handle handle);
 
 static struct starpu_data_interface_ops_t interface_csr_ops = {
@@ -76,9 +82,9 @@ static struct starpu_data_interface_ops_t interface_csr_ops = {
 	.compare = csr_compare
 };
 
-static void register_csr_handle(starpu_data_handle handle, uint32_t home_node, void *interface)
+static void register_csr_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
 {
-	starpu_csr_interface_t *csr_interface = interface;
+	starpu_csr_interface_t *csr_interface = data_interface;
 
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
@@ -108,7 +114,7 @@ static void register_csr_handle(starpu_data_handle handle, uint32_t home_node, v
 void starpu_csr_data_register(starpu_data_handle *handleptr, uint32_t home_node,
 		uint32_t nnz, uint32_t nrow, uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, size_t elemsize)
 {
-	starpu_csr_interface_t interface = {
+	starpu_csr_interface_t csr_interface = {
 		.nnz = nnz,
 		.nrow = nrow,
 		.nzval = nzval,
@@ -118,7 +124,7 @@ void starpu_csr_data_register(starpu_data_handle *handleptr, uint32_t home_node,
 		.elemsize = elemsize
 	};
 
-	starpu_data_register(handleptr, home_node, &interface, &interface_csr_ops);
+	starpu_data_register(handleptr, home_node, &csr_interface, &interface_csr_ops);
 }
 
 static uint32_t footprint_csr_interface_crc32(starpu_data_handle handle)
@@ -126,10 +132,10 @@ static uint32_t footprint_csr_interface_crc32(starpu_data_handle handle)
 	return _starpu_crc32_be(starpu_csr_get_nnz(handle), 0);
 }
 
-static int csr_compare(void *interface_a, void *interface_b)
+static int csr_compare(void *data_interface_a, void *data_interface_b)
 {
-	starpu_csr_interface_t *csr_a = interface_a;
-	starpu_csr_interface_t *csr_b = interface_b;
+	starpu_csr_interface_t *csr_a = data_interface_a;
+	starpu_csr_interface_t *csr_b = data_interface_b;
 
 	/* Two matrices are considered compatible if they have the same size */
 	return ((csr_a->nnz == csr_b->nnz)
@@ -140,34 +146,34 @@ static int csr_compare(void *interface_a, void *interface_b)
 /* offer an access to the data parameters */
 uint32_t starpu_csr_get_nnz(starpu_data_handle handle)
 {
-	starpu_csr_interface_t *interface =
+	starpu_csr_interface_t *csr_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
-	return interface->nnz;
+	return csr_interface->nnz;
 }
 
 uint32_t starpu_csr_get_nrow(starpu_data_handle handle)
 {
-	starpu_csr_interface_t *interface =
+	starpu_csr_interface_t *csr_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
-	return interface->nrow;
+	return csr_interface->nrow;
 }
 
 uint32_t starpu_csr_get_firstentry(starpu_data_handle handle)
 {
-	starpu_csr_interface_t *interface =
+	starpu_csr_interface_t *csr_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
-	return interface->firstentry;
+	return csr_interface->firstentry;
 }
 
 size_t starpu_csr_get_elemsize(starpu_data_handle handle)
 {
-	starpu_csr_interface_t *interface =
+	starpu_csr_interface_t *csr_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
-	return interface->elemsize;
+	return csr_interface->elemsize;
 }
 
 uintptr_t starpu_csr_get_local_nzval(starpu_data_handle handle)
@@ -177,10 +183,10 @@ uintptr_t starpu_csr_get_local_nzval(starpu_data_handle handle)
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_csr_interface_t *interface =
+	starpu_csr_interface_t *csr_interface =
 		starpu_data_get_interface_on_node(handle, node);
 
-	return interface->nzval;
+	return csr_interface->nzval;
 }
 
 uint32_t *starpu_csr_get_local_colind(starpu_data_handle handle)
@@ -190,10 +196,10 @@ uint32_t *starpu_csr_get_local_colind(starpu_data_handle handle)
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_csr_interface_t *interface =
+	starpu_csr_interface_t *csr_interface =
 		starpu_data_get_interface_on_node(handle, node);
 
-	return interface->colind;
+	return csr_interface->colind;
 }
 
 uint32_t *starpu_csr_get_local_rowptr(starpu_data_handle handle)
@@ -203,10 +209,10 @@ uint32_t *starpu_csr_get_local_rowptr(starpu_data_handle handle)
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_csr_interface_t *interface =
+	starpu_csr_interface_t *csr_interface =
 		starpu_data_get_interface_on_node(handle, node);
 
-	return interface->rowptr;
+	return csr_interface->rowptr;
 }
 
 static size_t csr_interface_get_size(starpu_data_handle handle)
@@ -225,18 +231,18 @@ static size_t csr_interface_get_size(starpu_data_handle handle)
 /* memory allocation/deallocation primitives for the CSR interface */
 
 /* returns the size of the allocated area */
-static ssize_t allocate_csr_buffer_on_node(void *interface_, uint32_t dst_node)
+static ssize_t allocate_csr_buffer_on_node(void *data_interface_, uint32_t dst_node)
 {
 	uintptr_t addr_nzval;
 	uint32_t *addr_colind, *addr_rowptr;
 	ssize_t allocated_memory;
 
 	/* we need the 3 arrays to be allocated */
-	starpu_csr_interface_t *interface = interface_;
+	starpu_csr_interface_t *csr_interface = data_interface_;
 
-	uint32_t nnz = interface->nnz;
-	uint32_t nrow = interface->nrow;
-	size_t elemsize = interface->elemsize;
+	uint32_t nnz = csr_interface->nnz;
+	uint32_t nrow = csr_interface->nrow;
+	size_t elemsize = csr_interface->elemsize;
 
 	starpu_node_kind kind = _starpu_get_node_kind(dst_node);
 
@@ -301,9 +307,9 @@ static ssize_t allocate_csr_buffer_on_node(void *interface_, uint32_t dst_node)
 		nnz*elemsize + nnz*sizeof(uint32_t) + (nrow+1)*sizeof(uint32_t);
 
 	/* update the data properly in consequence */
-	interface->nzval = addr_nzval;
-	interface->colind = addr_colind;
-	interface->rowptr = addr_rowptr;
+	csr_interface->nzval = addr_nzval;
+	csr_interface->colind = addr_colind;
+	csr_interface->rowptr = addr_rowptr;
 	
 	return allocated_memory;
 
@@ -349,9 +355,9 @@ fail_nzval:
 	return -ENOMEM;
 }
 
-static void free_csr_buffer_on_node(void *interface, uint32_t node)
+static void free_csr_buffer_on_node(void *data_interface, uint32_t node)
 {
-	starpu_csr_interface_t *csr_interface = interface;	
+	starpu_csr_interface_t *csr_interface = data_interface;
 
 	starpu_node_kind kind = _starpu_get_node_kind(node);
 	switch(kind) {
@@ -380,7 +386,7 @@ static void free_csr_buffer_on_node(void *interface, uint32_t node)
 }
 
 #ifdef STARPU_USE_CUDA
-static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind)
+static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
 {
 	starpu_csr_interface_t *src_csr = src_interface;
 	starpu_csr_interface_t *dst_csr = dst_interface;
@@ -408,24 +414,208 @@ static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__
 	return 0;
 }
 
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_cuda_common_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind, cudaStream_t stream)
+{
+	starpu_csr_interface_t *src_csr = src_interface;
+	starpu_csr_interface_t *dst_csr = dst_interface;
+
+	uint32_t nnz = src_csr->nnz;
+	uint32_t nrow = src_csr->nrow;
+	size_t elemsize = src_csr->elemsize;
+
+	cudaError_t cures;
+
+	int synchronous_fallback = 0;
+
+	cures = cudaMemcpyAsync((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, kind, stream);
+	if (cures)
+	{
+		synchronous_fallback = 1;
+		cures = cudaMemcpy((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, kind);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+	}
+
+	if (!synchronous_fallback)
+	{
+		cures = cudaMemcpyAsync((char *)dst_csr->colind, (char *)src_csr->colind, nnz*sizeof(uint32_t), kind, stream);
+	}
+
+	if (synchronous_fallback || cures != cudaSuccess)
+	{
+		synchronous_fallback = 1;
+		cures = cudaMemcpy((char *)dst_csr->colind, (char *)src_csr->colind, nnz*sizeof(uint32_t), kind);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+	}
+
+	if (!synchronous_fallback)
+	{
+		cures = cudaMemcpyAsync((char *)dst_csr->rowptr, (char *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t), kind, stream);
+	}
+
+	if (synchronous_fallback || cures != cudaSuccess)
+	{
+		synchronous_fallback = 1;
+		cures = cudaMemcpy((char *)dst_csr->rowptr, (char *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t), kind);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+	}
+	
+	if (synchronous_fallback)
+	{
+		STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+		return 0;
+	}
+	else {
+		return -EAGAIN;
+	}
+}
+
+static int copy_cuda_peer(void *src_interface STARPU_ATTRIBUTE_UNUSED, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface STARPU_ATTRIBUTE_UNUSED, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
+{
+#ifdef HAVE_CUDA_MEMCPY_PEER
+	starpu_csr_interface_t *src_csr = src_interface;
+	starpu_csr_interface_t *dst_csr = dst_interface;
+
+	uint32_t nnz = src_csr->nnz;
+	uint32_t nrow = src_csr->nrow;
+	size_t elemsize = src_csr->elemsize;
+
+	int src_dev = starpu_memory_node_to_devid(src_node);
+	int dst_dev = starpu_memory_node_to_devid(dst_node);
+
+	cudaError_t cures;
+
+	cures = cudaMemcpyPeer((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	cures = cudaMemcpyPeer((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t));
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	cures = cudaMemcpyPeer((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t));
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+
+	return 0;
+#else
+	STARPU_ABORT();
+	return 0;
+#endif
+}
+
+static int copy_cuda_peer_async(void *src_interface STARPU_ATTRIBUTE_UNUSED, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+				void *dst_interface STARPU_ATTRIBUTE_UNUSED, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream STARPU_ATTRIBUTE_UNUSED)
+{
+#ifdef HAVE_CUDA_MEMCPY_PEER
+	starpu_csr_interface_t *src_csr = src_interface;
+	starpu_csr_interface_t *dst_csr = dst_interface;
+
+	uint32_t nnz = src_csr->nnz;
+	uint32_t nrow = src_csr->nrow;
+	size_t elemsize = src_csr->elemsize;
+
+	cudaError_t cures;
+
+	int src_dev = starpu_memory_node_to_devid(src_node);
+	int dst_dev = starpu_memory_node_to_devid(dst_node);
+
+	int synchronous_fallback = 0;
+
+	cures = cudaMemcpyPeerAsync((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize, stream);
+	if (cures)
+	{
+		synchronous_fallback = 1;
+		cures = cudaMemcpyPeer((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+	}
+
+	if (!synchronous_fallback)
+	{
+		cures = cudaMemcpyPeerAsync((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t), stream);
+	}
+
+	if (synchronous_fallback || cures != cudaSuccess)
+	{
+		synchronous_fallback = 1;
+		cures = cudaMemcpyPeer((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t));
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+	}
+
+	if (!synchronous_fallback)
+	{
+		cures = cudaMemcpyPeerAsync((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t), stream);
+	}
+
+	if (synchronous_fallback || cures != cudaSuccess)
+	{
+		synchronous_fallback = 1;
+		cures = cudaMemcpyPeer((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t));
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+	}
+	
+	if (synchronous_fallback)
+	{
+		STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+		return 0;
+	}
+	else {
+		return -EAGAIN;
+	}
+#else
+	/* Illegal without Peer transfers */
+	STARPU_ABORT();
+	return 0;
+#endif
+}
+
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
 {
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
 }
 
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
 {
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
 }
 
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	if (src_node == dst_node)
+		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
+	else
+		return copy_cuda_peer(src_interface, src_node, dst_interface, dst_node);
+}
+
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
+{
+	return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, stream);
+}
+
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
+{
+	return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, stream);
+}
+
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
 {
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
+	if (src_node == dst_node)
+		return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, stream);
+	else
+		return copy_cuda_peer_async(src_interface, src_node, dst_interface, dst_node, stream);
 }
+
 #endif // STARPU_USE_CUDA
 
 #ifdef STARPU_USE_OPENCL
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 	starpu_csr_interface_t *src_csr = src_interface;
 	starpu_csr_interface_t *dst_csr = dst_interface;
@@ -453,7 +643,7 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute
 	return 0;
 }
 
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 	starpu_csr_interface_t *src_csr = src_interface;
 	starpu_csr_interface_t *dst_csr = dst_interface;
@@ -483,7 +673,7 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute
 #endif // STARPU_USE_OPENCL
 
 /* as not all platforms easily have a BLAS lib installed ... */
-static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 	starpu_csr_interface_t *src_csr = src_interface;
 	starpu_csr_interface_t *dst_csr = dst_interface;

+ 140 - 5
src/datawizard/interfaces/data_interface.c

@@ -15,8 +15,79 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <stdint.h>
+
 #include <datawizard/datawizard.h>
 #include <core/dependencies/data_concurrency.h>
+#include <common/uthash.h>
+#include <common/starpu_spinlock.h>
+
+/* Entry in the `registered_handles' hash table.  Each entry records one
+ * POINTER -> HANDLE association; entries are heap-allocated by
+ * _starpu_data_register_ram_pointer() and keyed on the `pointer' field.  */
+struct handle_entry
+{
+	UT_hash_handle hh;	/* uthash bookkeeping for this entry */
+	void *pointer;	/* key: the registered host address */
+	starpu_data_handle handle;	/* value: the handle registered for `pointer' */
+};
+
+/* Hash table mapping host pointers to data handles.  The table head has
+ * static storage and therefore starts out as NULL.  The register, lookup
+ * and free paths in this file access it while holding
+ * `registered_handles_lock'.  */
+static struct handle_entry *registered_handles;
+static starpu_spinlock_t    registered_handles_lock;	/* initialised by _starpu_data_interface_init() */
+
+/* Initialise the spinlock that protects the `registered_handles' table.
+ * Takes no argument; the definition now spells this out with `(void)' so
+ * that it is a real prototype, matching the declaration in
+ * data_interface.h.  */
+void _starpu_data_interface_init(void)
+{
+	_starpu_spin_init(&registered_handles_lock);
+}
+
+/* Tear down the pointer -> handle table: destroy its lock, then unlink and
+ * free every entry still present.  Only the table entries are released
+ * here, not the handles they refer to.  Takes no argument; the definition
+ * uses `(void)' so that it is a real prototype, matching the declaration
+ * in data_interface.h.  */
+void _starpu_data_interface_shutdown(void)
+{
+	struct handle_entry *entry, *tmp;
+
+	/* The lock is destroyed first, so the walk below runs unlocked.
+	 * NOTE(review): presumably no other thread can still reach the table
+	 * at this point -- confirm against the caller.  */
+	_starpu_spin_destroy(&registered_handles_lock);
+
+	/* TMP holds the next entry, which allows deleting ENTRY while
+	 * iterating.  */
+	HASH_ITER(hh, registered_handles, entry, tmp) {
+		HASH_DEL(registered_handles, entry);
+		free(entry);
+	}
+
+	registered_handles = NULL;
+}
+
+/* Record that host address PTR belongs to HANDLE.  An earlier mapping for
+ * the same PTR is not removed; the new mapping shadows it.  */
+void _starpu_data_register_ram_pointer(starpu_data_handle handle, void *ptr)
+{
+	struct handle_entry *mapping = malloc(sizeof(*mapping));
+
+	/* Allocation failure is only caught by this assertion.  */
+	STARPU_ASSERT(mapping != NULL);
+
+	mapping->handle = handle;
+	mapping->pointer = ptr;
+
+	/* The entry is completely filled in before it is linked into the
+	 * shared table under the lock.  */
+	_starpu_spin_lock(&registered_handles_lock);
+	HASH_ADD_PTR(registered_handles, pointer, mapping);
+	_starpu_spin_unlock(&registered_handles_lock);
+}
+
+/* Return the handle registered for host address PTR, or NULL when the
+ * table holds no mapping for PTR.  The table is searched while holding
+ * `registered_handles_lock'.  */
+starpu_data_handle starpu_data_lookup(const void *ptr)
+{
+	struct handle_entry *found;
+	starpu_data_handle handle;
+
+	_starpu_spin_lock(&registered_handles_lock);
+
+	/* The key is the pointer value itself, hence the address of PTR.  */
+	HASH_FIND_PTR(registered_handles, &ptr, found);
+	handle = STARPU_UNLIKELY(found == NULL) ? NULL : found->handle;
+
+	_starpu_spin_unlock(&registered_handles_lock);
+
+	return handle;
+}
 
 /* 
  * Start monitoring a piece of data
@@ -25,6 +96,8 @@
 static void _starpu_register_new_data(starpu_data_handle handle,
 					uint32_t home_node, uint32_t wt_mask)
 {
+	void *ptr;
+
 	STARPU_ASSERT(handle);
 
 	/* initialize the new lock */
@@ -42,6 +115,7 @@ static void _starpu_register_new_data(starpu_data_handle handle,
 	handle->sibling_index = 0; /* could be anything for the root */
 	handle->depth = 1; /* the tree is just a node yet */
         handle->rank = -1; /* invalid until set */
+	handle->tag = -1; /* invalid until set */
 
 	handle->is_not_important = 0;
 
@@ -113,18 +187,29 @@ static void _starpu_register_new_data(starpu_data_handle handle,
 		replicate->state = STARPU_INVALID;
 		replicate->refcnt = 0;
 		replicate->handle = handle;
-		replicate->requested = 0;
-		replicate->request = NULL;
+
+		for (node = 0; node < STARPU_MAXNODES; node++)
+		{
+			replicate->requested[node] = 0;
+			replicate->request[node] = NULL;
+		}
+
 		replicate->relaxed_coherency = 1;
 		replicate->initialized = 0;
 		replicate->memory_node = starpu_worker_get_memory_node(worker);
 
 		/* duplicate  the content of the interface on node 0 */
 		memcpy(replicate->data_interface, handle->per_node[0].data_interface, handle->ops->interface_size);
-	} 
+	}
 
 	/* now the data is available ! */
 	_starpu_spin_unlock(&handle->header_lock);
+
+	ptr = starpu_handle_to_pointer(handle, 0);
+	if (ptr != NULL)
+	{
+		_starpu_data_register_ram_pointer(handle, ptr);
+	}
 }
 
 static starpu_data_handle _starpu_data_handle_allocate(struct starpu_data_interface_ops_t *interface_ops)
@@ -169,7 +254,7 @@ static starpu_data_handle _starpu_data_handle_allocate(struct starpu_data_interf
 }
 
 void starpu_data_register(starpu_data_handle *handleptr, uint32_t home_node,
-				void *interface,
+				void *data_interface,
 				struct starpu_data_interface_ops_t *ops)
 {
 	starpu_data_handle handle =
@@ -180,11 +265,30 @@ void starpu_data_register(starpu_data_handle *handleptr, uint32_t home_node,
 
 
 	/* fill the interface fields with the appropriate method */
-	ops->register_data_handle(handle, home_node, interface);
+	ops->register_data_handle(handle, home_node, data_interface);
 
 	_starpu_register_new_data(handle, home_node, 0);
 }
 
+/* Return the address of HANDLE's data on memory node NODE as reported by
+ * the interface's `handle_to_pointer' method.  Returns NULL when the
+ * interface provides no such method or when the data is not allocated on
+ * NODE.  */
+void *starpu_handle_to_pointer(starpu_data_handle handle, uint32_t node)
+{
+	/* The operation is optional for a data interface.  */
+	if (!handle->ops->handle_to_pointer)
+		return NULL;
+
+	/* Only query the interface for a node that has been allocated.  */
+	if (!starpu_data_test_if_allocated_on_node(handle, node))
+		return NULL;
+
+	return handle->ops->handle_to_pointer(handle, node);
+}
+
+/* Same as starpu_handle_to_pointer(), applied to the memory node returned
+ * by _starpu_get_local_memory_node().  */
+void *starpu_handle_get_local_ptr(starpu_data_handle handle)
+{
+	uint32_t local_node = _starpu_get_local_memory_node();
+
+	return starpu_handle_to_pointer(handle, local_node);
+}
+
 int starpu_data_get_rank(starpu_data_handle handle)
 {
 	return handle->rank;
@@ -196,21 +300,52 @@ int starpu_data_set_rank(starpu_data_handle handle, int rank)
         return 0;
 }
 
+int starpu_data_get_tag(starpu_data_handle handle)
+{
+	return handle->tag;	/* plain read of the handle's tag field; no lock is taken */
+}
+
+int starpu_data_set_tag(starpu_data_handle handle, int tag)
+{
+        handle->tag = tag;	/* stored as given; the value is not validated */
+        return 0;	/* 0 is the only value this function returns */
+}
+
 /* 
  * Stop monitoring a piece of data
  */
 
 void _starpu_data_free_interfaces(starpu_data_handle handle)
 {
+	const void *ram_ptr;
 	unsigned node;
 	unsigned worker;
 	unsigned nworkers = starpu_worker_get_count();
 
+	ram_ptr = starpu_handle_to_pointer(handle, 0);	/* fetch the node-0 address now, before the interface structures are freed below */
+
 	for (node = 0; node < STARPU_MAXNODES; node++)
 		free(handle->per_node[node].data_interface);	/* per-node interface structure */
 
 	for (worker = 0; worker < nworkers; worker++)
 		free(handle->per_worker[worker].data_interface);	/* per-worker interface structure */
+
+	if (ram_ptr != NULL)
+	{
+		/* Remove the PTR -> HANDLE mapping.  If a mapping from PTR
+		 * to another handle existed before (e.g., when using
+		 * filters), it becomes visible again.
+		 *
+		 * NOTE(review): the lookup is by address only, and the entry
+		 * it returns is deleted without checking that it refers to
+		 * HANDLE.  Presumably shadowed mappings are always removed in
+		 * reverse order of registration -- confirm.  */
+		struct handle_entry *entry;
+
+		_starpu_spin_lock(&registered_handles_lock);
+		HASH_FIND_PTR(registered_handles, &ram_ptr, entry);
+		STARPU_ASSERT(entry != NULL);	/* a non-NULL node-0 address is expected to have been registered */
+
+		HASH_DEL(registered_handles, entry);
+		free(entry);	/* releases the table entry only */
+
+		_starpu_spin_unlock(&registered_handles_lock);
+	}
 }
 
 struct unregister_callback_arg {

+ 10 - 2
src/datawizard/interfaces/data_interface.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -23,6 +23,14 @@
 
 /* Some data interfaces or filters use this interface internally */
 extern struct starpu_data_interface_ops_t _starpu_interface_matrix_ops;
-void _starpu_data_free_interfaces(starpu_data_handle handle);
+void _starpu_data_free_interfaces(starpu_data_handle handle)
+	STARPU_ATTRIBUTE_INTERNAL;
+
+extern void _starpu_data_interface_init(void) STARPU_ATTRIBUTE_INTERNAL;
+extern void _starpu_data_interface_shutdown(void) STARPU_ATTRIBUTE_INTERNAL;
+
+extern void _starpu_data_register_ram_pointer(starpu_data_handle handle,
+						void *ptr)
+	STARPU_ATTRIBUTE_INTERNAL;
 
 #endif // __DATA_INTERFACE_H__

+ 3 - 3
src/datawizard/interfaces/matrix_filters.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,7 +23,7 @@
 /*
  * an example of a dummy partition function : blocks ...
  */
-void starpu_block_filter_func(void *father_interface, void *child_interface, __attribute__((unused)) struct starpu_data_filter *f, unsigned id, unsigned nchunks)
+void starpu_block_filter_func(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, unsigned nchunks)
 {
        starpu_matrix_interface_t *matrix_father = father_interface;
        starpu_matrix_interface_t *matrix_child = child_interface;
@@ -54,7 +54,7 @@ void starpu_block_filter_func(void *father_interface, void *child_interface, __a
 	}
 }
 
-void starpu_vertical_block_filter_func(void *father_interface, void *child_interface, __attribute__((unused)) struct starpu_data_filter *f, unsigned id, unsigned nchunks)
+void starpu_vertical_block_filter_func(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, unsigned nchunks)
 {
         starpu_matrix_interface_t *matrix_father = father_interface;
         starpu_matrix_interface_t *matrix_child = child_interface;

+ 201 - 110
src/datawizard/interfaces/matrix_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,19 +25,20 @@
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 
-static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
+static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 #ifdef STARPU_USE_CUDA
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
+//static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
 #endif
 #ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event);
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event);
+static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
+static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event);
+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event);
 #endif
 
 static const struct starpu_data_copy_methods matrix_copy_data_methods_s = {
@@ -49,6 +50,7 @@ static const struct starpu_data_copy_methods matrix_copy_data_methods_s = {
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
 	.cuda_to_cuda = copy_cuda_to_cuda,
+//	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
@@ -62,20 +64,22 @@ static const struct starpu_data_copy_methods matrix_copy_data_methods_s = {
 	.spu_to_spu = NULL
 };
 
-static void register_matrix_handle(starpu_data_handle handle, uint32_t home_node, void *interface);
-static ssize_t allocate_matrix_buffer_on_node(void *interface_, uint32_t dst_node);
-static void free_matrix_buffer_on_node(void *interface, uint32_t node);
+static void register_matrix_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
+static void *matrix_handle_to_pointer(starpu_data_handle data_handle, uint32_t node);
+static ssize_t allocate_matrix_buffer_on_node(void *data_interface_, uint32_t dst_node);
+static void free_matrix_buffer_on_node(void *data_interface, uint32_t node);
 static size_t matrix_interface_get_size(starpu_data_handle handle);
 static uint32_t footprint_matrix_interface_crc32(starpu_data_handle handle);
-static int matrix_compare(void *interface_a, void *interface_b);
+static int matrix_compare(void *data_interface_a, void *data_interface_b);
 static void display_matrix_interface(starpu_data_handle handle, FILE *f);
 #ifdef STARPU_USE_GORDON
-static int convert_matrix_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss); 
+static int convert_matrix_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss); 
 #endif
 
 struct starpu_data_interface_ops_t _starpu_interface_matrix_ops = {
 	.register_data_handle = register_matrix_handle,
 	.allocate_data_on_node = allocate_matrix_buffer_on_node,
+	.handle_to_pointer = matrix_handle_to_pointer,
 	.free_data_on_node = free_matrix_buffer_on_node,
 	.copy_methods = &matrix_copy_data_methods_s,
 	.get_size = matrix_interface_get_size,
@@ -90,7 +94,7 @@ struct starpu_data_interface_ops_t _starpu_interface_matrix_ops = {
 };
 
 #ifdef STARPU_USE_GORDON
-static int convert_matrix_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss) 
+static int convert_matrix_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss) 
 {
 	size_t elemsize = GET_MATRIX_ELEMSIZE(interface);
 	uint32_t nx = STARPU_MATRIX_GET_NX(interface);
@@ -107,9 +111,9 @@ static int convert_matrix_to_gordon(void *interface, uint64_t *ptr, gordon_strid
 }
 #endif
 
-static void register_matrix_handle(starpu_data_handle handle, uint32_t home_node, void *interface)
+static void register_matrix_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
 {
-	starpu_matrix_interface_t *matrix_interface = interface;
+	starpu_matrix_interface_t *matrix_interface = data_interface;
 
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
@@ -136,12 +140,23 @@ static void register_matrix_handle(starpu_data_handle handle, uint32_t home_node
 	}
 }
 
+/* `handle_to_pointer' method of the matrix interface: return the `ptr'
+ * field of HANDLE's matrix interface on memory node NODE.  The data must
+ * already be allocated on NODE (asserted).  */
+static void *matrix_handle_to_pointer(starpu_data_handle handle, uint32_t node)
+{
+	starpu_matrix_interface_t *node_interface;
+
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	node_interface = starpu_data_get_interface_on_node(handle, node);
+
+	/* `ptr' is stored as an integer; hand it back as an address.  */
+	return (void*) node_interface->ptr;
+}
+
+
 /* declare a new data with the matrix interface */
 void starpu_matrix_data_register(starpu_data_handle *handleptr, uint32_t home_node,
 			uintptr_t ptr, uint32_t ld, uint32_t nx,
 			uint32_t ny, size_t elemsize)
 {
-	starpu_matrix_interface_t interface = {
+	starpu_matrix_interface_t matrix_interface = {
 		.ptr = ptr,
 		.ld = ld,
 		.nx = nx,
@@ -151,7 +166,7 @@ void starpu_matrix_data_register(starpu_data_handle *handleptr, uint32_t home_no
                 .offset = 0
 	};
 
-	starpu_data_register(handleptr, home_node, &interface, &_starpu_interface_matrix_ops);
+	starpu_data_register(handleptr, home_node, &matrix_interface, &_starpu_interface_matrix_ops);
 }
 
 static uint32_t footprint_matrix_interface_crc32(starpu_data_handle handle)
@@ -159,10 +174,10 @@ static uint32_t footprint_matrix_interface_crc32(starpu_data_handle handle)
 	return _starpu_crc32_be(starpu_matrix_get_nx(handle), starpu_matrix_get_ny(handle));
 }
 
-static int matrix_compare(void *interface_a, void *interface_b)
+static int matrix_compare(void *data_interface_a, void *data_interface_b)
 {
-	starpu_matrix_interface_t *matrix_a = interface_a;
-	starpu_matrix_interface_t *matrix_b = interface_b;
+	starpu_matrix_interface_t *matrix_a = data_interface_a;
+	starpu_matrix_interface_t *matrix_b = data_interface_b;
 
 	/* Two matrices are considered compatible if they have the same size */
 	return ((matrix_a->nx == matrix_b->nx)
@@ -172,19 +187,19 @@ static int matrix_compare(void *interface_a, void *interface_b)
 
 static void display_matrix_interface(starpu_data_handle handle, FILE *f)
 {
-	starpu_matrix_interface_t *interface =
+	starpu_matrix_interface_t *matrix_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
-	fprintf(f, "%u\t%u\t", interface->nx, interface->ny);
+	fprintf(f, "%u\t%u\t", matrix_interface->nx, matrix_interface->ny);
 }
 
 static size_t matrix_interface_get_size(starpu_data_handle handle)
 {
-	starpu_matrix_interface_t *interface =
+	starpu_matrix_interface_t *matrix_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
 	size_t size;
-	size = (size_t)interface->nx*interface->ny*interface->elemsize; 
+	size = (size_t)matrix_interface->nx*matrix_interface->ny*matrix_interface->elemsize; 
 
 	return size;
 }
@@ -192,18 +207,18 @@ static size_t matrix_interface_get_size(starpu_data_handle handle)
 /* offer an access to the data parameters */
 uint32_t starpu_matrix_get_nx(starpu_data_handle handle)
 {
-	starpu_matrix_interface_t *interface =
+	starpu_matrix_interface_t *matrix_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
-	return interface->nx;
+	return matrix_interface->nx;
 }
 
 uint32_t starpu_matrix_get_ny(starpu_data_handle handle)
 {
-	starpu_matrix_interface_t *interface =
+	starpu_matrix_interface_t *matrix_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
-	return interface->ny;
+	return matrix_interface->ny;
 }
 
 uint32_t starpu_matrix_get_local_ld(starpu_data_handle handle)
@@ -213,10 +228,10 @@ uint32_t starpu_matrix_get_local_ld(starpu_data_handle handle)
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_matrix_interface_t *interface =
+	starpu_matrix_interface_t *matrix_interface =
 		starpu_data_get_interface_on_node(handle, node);
 
-	return interface->ld;
+	return matrix_interface->ld;
 }
 
 uintptr_t starpu_matrix_get_local_ptr(starpu_data_handle handle)
@@ -226,24 +241,24 @@ uintptr_t starpu_matrix_get_local_ptr(starpu_data_handle handle)
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_matrix_interface_t *interface =
+	starpu_matrix_interface_t *matrix_interface =
 		starpu_data_get_interface_on_node(handle, node);
 
-	return interface->ptr;
+	return matrix_interface->ptr;
 }
 
 size_t starpu_matrix_get_elemsize(starpu_data_handle handle)
 {
-	starpu_matrix_interface_t *interface =
+	starpu_matrix_interface_t *matrix_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
-	return interface->elemsize;
+	return matrix_interface->elemsize;
 }
 
 /* memory allocation/deallocation primitives for the matrix interface */
 
 /* returns the size of the allocated area */
-static ssize_t allocate_matrix_buffer_on_node(void *interface_, uint32_t dst_node)
+static ssize_t allocate_matrix_buffer_on_node(void *data_interface_, uint32_t dst_node)
 {
 	uintptr_t addr = 0;
 	unsigned fail = 0;
@@ -253,12 +268,12 @@ static ssize_t allocate_matrix_buffer_on_node(void *interface_, uint32_t dst_nod
 	cudaError_t status;
 #endif
 
-	starpu_matrix_interface_t *interface = interface_;
+	starpu_matrix_interface_t *matrix_interface = data_interface_;
 
-	uint32_t nx = interface->nx;
-	uint32_t ny = interface->ny;
+	uint32_t nx = matrix_interface->nx;
+	uint32_t ny = matrix_interface->ny;
 	uint32_t ld = nx; // by default
-	size_t elemsize = interface->elemsize;
+	size_t elemsize = matrix_interface->elemsize;
 
 	starpu_node_kind kind = _starpu_get_node_kind(dst_node);
 
@@ -306,10 +321,10 @@ static ssize_t allocate_matrix_buffer_on_node(void *interface_, uint32_t dst_nod
 		allocated_memory = (size_t)nx*ny*elemsize;
 
 		/* update the data properly in consequence */
-		interface->ptr = addr;
-                interface->dev_handle = addr;
-                interface->offset = 0;
-		interface->ld = ld;
+		matrix_interface->ptr = addr;
+                matrix_interface->dev_handle = addr;
+                matrix_interface->offset = 0;
+		matrix_interface->ld = ld;
 	} else {
 		/* allocation failed */
 		allocated_memory = -ENOMEM;
@@ -318,9 +333,9 @@ static ssize_t allocate_matrix_buffer_on_node(void *interface_, uint32_t dst_nod
 	return allocated_memory;
 }
 
-static void free_matrix_buffer_on_node(void *interface, uint32_t node)
+static void free_matrix_buffer_on_node(void *data_interface, uint32_t node)
 {
-	starpu_matrix_interface_t *matrix_interface = interface;
+	starpu_matrix_interface_t *matrix_interface = data_interface;
 
 #ifdef STARPU_USE_CUDA
 	cudaError_t status;
@@ -350,17 +365,48 @@ static void free_matrix_buffer_on_node(void *interface, uint32_t node)
 }
 
 #ifdef STARPU_USE_CUDA
-static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind)
+static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind, int is_async, cudaStream_t stream)
 {
 	starpu_matrix_interface_t *src_matrix = src_interface;
 	starpu_matrix_interface_t *dst_matrix = dst_interface;
 
 	size_t elemsize = src_matrix->elemsize;
-
 	cudaError_t cures;
-	cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
+
+#if 0
+
+	struct cudaMemcpy3DParms p;
+	memset(&p, 0, sizeof(p));
+
+	p.srcPtr = make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->ld * src_matrix->ny *elemsize, src_matrix->ny);
+	p.dstPtr = make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, dst_matrix->ld * src_matrix->ny *elemsize, dst_matrix->ny);
+	p.extent = make_cudaExtent(src_matrix->nx, src_matrix->ny, 1);
+	p.kind = kind;
+
+	if (is_async)
+	{
+		cures = cudaMemcpy3DAsync(&p, stream);
+		if (!cures)
+			return -EAGAIN;
+	}
+
+	cures = cudaMemcpy3D(&p);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+#endif
+
+	if (is_async)
+	{
+		cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
 			(char *)src_matrix->ptr, src_matrix->ld*elemsize,
-			src_matrix->nx*elemsize, src_matrix->ny, kind);
+			src_matrix->nx*elemsize, src_matrix->ny, kind, stream);
+		if (!cures)
+			return -EAGAIN;
+	}
+
+	cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
+		(char *)src_matrix->ptr, src_matrix->ld*elemsize,
+		src_matrix->nx*elemsize, src_matrix->ny, kind);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
@@ -369,85 +415,130 @@ static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__
 	return 0;
 }
 
-
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
-}
-
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
-}
-
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
-}
-
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
+/* XXX this is broken : we need to find a way to fix that ! */
+#if 0
+static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, int is_async, cudaStream_t stream)
 {
 	starpu_matrix_interface_t *src_matrix = src_interface;
 	starpu_matrix_interface_t *dst_matrix = dst_interface;
 
 	size_t elemsize = src_matrix->elemsize;
+	cudaError_t cures;
 
-	cudaError_t cures;	
-	cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
-			(char *)src_matrix->ptr, (size_t)src_matrix->ld*elemsize,
-			(size_t)src_matrix->nx*elemsize, src_matrix->ny,
-			cudaMemcpyDeviceToHost, stream);
-	if (cures)
-	{
-		cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
-			(char *)src_matrix->ptr, (size_t)src_matrix->ld*elemsize,
-			(size_t)src_matrix->nx*elemsize, (size_t)src_matrix->ny,
-			cudaMemcpyDeviceToHost);
+#if 1
+	int src_dev = starpu_memory_node_to_devid(src_node);
+	int dst_dev = starpu_memory_node_to_devid(dst_node);
 
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
+	struct cudaExtent extent = make_cudaExtent(128, 128, 128);
 
-		return 0;
+	cures = cudaSetDevice(src_dev);
+	STARPU_ASSERT(cures == cudaSuccess);
+
+	struct cudaPitchedPtr mem_device1;
+	cures = cudaMalloc3D(&mem_device1, extent);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	cures = cudaSetDevice(dst_dev);
+	STARPU_ASSERT(cures == cudaSuccess);
+
+	struct cudaPitchedPtr mem_device2;
+	cures = cudaMalloc3D(&mem_device2, extent);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	struct cudaMemcpy3DPeerParms p;
+	memset(&p, 0, sizeof(p));
+	p.srcDevice = src_dev;
+	p.dstDevice = dst_dev;
+	p.srcPtr = mem_device1;
+	p.dstPtr = mem_device2;
+	p.extent = extent;
+
+	cures = cudaMemcpy3DPeer(&p);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+
+//make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->nx, src_matrix->ny);
+//make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, src_matrix->nx, dst_matrix->ny);
+//make_cudaExtent(src_matrix->nx, src_matrix->ny, 1);
+
+//	if (is_async)
+//	{
+//		cures = cudaMemcpy3DPeerAsync(&p, stream);
+//		if (!cures)
+//			return -EAGAIN;
+//	}
+
+#else
+	/* XXX FIXME !!*/
+	STARPU_ASSERT(src_matrix->nx == src_matrix->ld);
+	STARPU_ASSERT(dst_matrix->nx == dst_matrix->ld);
+
+	if (is_async)
+	{
+		cures = cudaMemcpyPeerAsync((char *)dst_matrix->ptr, dst_dev, (char *)src_matrix->ptr, src_dev, dst_matrix->nx*dst_matrix->ny*elemsize, stream);
+		if (!cures)
+			return -EAGAIN;
 	}
 
+	cures = cudaMemcpyPeer((char *)dst_matrix->ptr, dst_dev, (char *)src_matrix->ptr, src_dev, dst_matrix->nx*dst_matrix->ny*elemsize);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+#endif
+
 	STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
 
-	return -EAGAIN;
+	return 0;
 }
+#endif
 
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
-	starpu_matrix_interface_t *src_matrix = src_interface;
-	starpu_matrix_interface_t *dst_matrix = dst_interface;
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, 0, 0);
+}
 
-	size_t elemsize = src_matrix->elemsize;
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, 0, 0);
+}
 
-	cudaError_t cures;
-	cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
-				(char *)src_matrix->ptr, src_matrix->ld*elemsize,
-				src_matrix->nx*elemsize, src_matrix->ny,
-				cudaMemcpyHostToDevice, stream);
-	if (cures)
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
+{
+	if (src_node == dst_node)
+		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, 0, 0);
+	else
 	{
-		cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
-				(char *)src_matrix->ptr, src_matrix->ld*elemsize,
-				src_matrix->nx*elemsize, src_matrix->ny, cudaMemcpyHostToDevice);
-
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-
+		/* XXX not implemented */
+		STARPU_ABORT();
 		return 0;
 	}
+}
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, 1, stream);
+}
 
-	return -EAGAIN;
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, 1, stream);
 }
 
+#if 0
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)	/* compiled out by the surrounding #if 0 */
+{
+	if (src_node == dst_node)	/* source and destination are the same memory node */
+		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, 1, stream);	/* 1 = is_async */
+	else	/* distinct memory nodes: delegate to copy_cuda_peer(), itself disabled with #if 0 */
+		return copy_cuda_peer(src_interface, src_node, dst_interface, dst_node, 1, stream);
+}
+#endif
 #endif // STARPU_USE_CUDA
 
 #ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event)
+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
 {
 	starpu_matrix_interface_t *src_matrix = src_interface;
 	starpu_matrix_interface_t *dst_matrix = dst_interface;
@@ -467,7 +558,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node __att
 	return ret;
 }
 
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event)
+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
 {
 	starpu_matrix_interface_t *src_matrix = src_interface;
 	starpu_matrix_interface_t *dst_matrix = dst_interface;
@@ -488,12 +579,12 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node __att
 	return ret;
 }
 
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
         return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
 }
 
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
         return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
 }
@@ -501,7 +592,7 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute
 #endif
 
 /* as not all platform easily have a  lib installed ... */
-static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 	starpu_matrix_interface_t *src_matrix = src_interface;
 	starpu_matrix_interface_t *dst_matrix = dst_interface;

+ 136 - 60
src/datawizard/interfaces/variable_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,20 +25,21 @@
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 
-static int copy_ram_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
+static int copy_ram_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 #ifdef STARPU_USE_CUDA
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 #endif
 #ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event);
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event);
+static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
+static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
+static int copy_opencl_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event);
+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event);
 #endif
 
 static const struct starpu_data_copy_methods variable_copy_data_methods_s = {
@@ -47,9 +48,10 @@ static const struct starpu_data_copy_methods variable_copy_data_methods_s = {
 #ifdef STARPU_USE_CUDA
 	.ram_to_cuda = copy_ram_to_cuda,
 	.cuda_to_ram = copy_cuda_to_ram,
+	.cuda_to_cuda = copy_cuda_to_cuda,
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
-	.cuda_to_cuda = copy_cuda_to_cuda,
+	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
@@ -64,20 +66,22 @@ static const struct starpu_data_copy_methods variable_copy_data_methods_s = {
 	.spu_to_spu = NULL
 };
 
-static void register_variable_handle(starpu_data_handle handle, uint32_t home_node, void *interface);
-static ssize_t allocate_variable_buffer_on_node(void *interface_, uint32_t dst_node);
-static void free_variable_buffer_on_node(void *interface, uint32_t node);
+static void register_variable_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
+static ssize_t allocate_variable_buffer_on_node(void *data_interface_, uint32_t dst_node);
+static void *variable_handle_to_pointer(starpu_data_handle data_handle, uint32_t node);
+static void free_variable_buffer_on_node(void *data_interface, uint32_t node);
 static size_t variable_interface_get_size(starpu_data_handle handle);
 static uint32_t footprint_variable_interface_crc32(starpu_data_handle handle);
-static int variable_compare(void *interface_a, void *interface_b);
+static int variable_compare(void *data_interface_a, void *data_interface_b);
 static void display_variable_interface(starpu_data_handle handle, FILE *f);
 #ifdef STARPU_USE_GORDON
-static int convert_variable_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss); 
+static int convert_variable_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss); 
 #endif
 
 static struct starpu_data_interface_ops_t interface_variable_ops = {
 	.register_data_handle = register_variable_handle,
 	.allocate_data_on_node = allocate_variable_buffer_on_node,
+	.handle_to_pointer = variable_handle_to_pointer,
 	.free_data_on_node = free_variable_buffer_on_node,
 	.copy_methods = &variable_copy_data_methods_s,
 	.get_size = variable_interface_get_size,
@@ -91,7 +95,14 @@ static struct starpu_data_interface_ops_t interface_variable_ops = {
 	.display = display_variable_interface
 };
 
-static void register_variable_handle(starpu_data_handle handle, uint32_t home_node, void *interface)
+static void *variable_handle_to_pointer(starpu_data_handle handle, uint32_t node)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	return (void*) STARPU_VARIABLE_GET_PTR(starpu_data_get_interface_on_node(handle, node));
+}
+
+static void register_variable_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
 {
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
@@ -100,18 +111,18 @@ static void register_variable_handle(starpu_data_handle handle, uint32_t home_no
 			starpu_data_get_interface_on_node(handle, node);
 
 		if (node == home_node) {
-			local_interface->ptr = STARPU_VARIABLE_GET_PTR(interface);
+			local_interface->ptr = STARPU_VARIABLE_GET_PTR(data_interface);
 		}
 		else {
 			local_interface->ptr = 0;
 		}
 
-		local_interface->elemsize = STARPU_VARIABLE_GET_ELEMSIZE(interface);
+		local_interface->elemsize = STARPU_VARIABLE_GET_ELEMSIZE(data_interface);
 	}
 }
 
 #ifdef STARPU_USE_GORDON
-int convert_variable_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss) 
+int convert_variable_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss) 
 {
 	*ptr = STARPU_VARIABLE_GET_PTR(interface);
 	(*ss).size = STARPU_VARIABLE_GET_ELEMSIZE(interface);
@@ -138,10 +149,10 @@ static uint32_t footprint_variable_interface_crc32(starpu_data_handle handle)
 	return _starpu_crc32_be(starpu_variable_get_elemsize(handle), 0);
 }
 
-static int variable_compare(void *interface_a, void *interface_b)
+static int variable_compare(void *data_interface_a, void *data_interface_b)
 {
-	starpu_variable_interface_t *variable_a = interface_a;
-	starpu_variable_interface_t *variable_b = interface_b;
+	starpu_variable_interface_t *variable_a = data_interface_a;
+	starpu_variable_interface_t *variable_b = data_interface_b;
 
 	/* Two variables are considered compatible if they have the same size */
 	return (variable_a->elemsize == variable_b->elemsize);
@@ -149,18 +160,18 @@ static int variable_compare(void *interface_a, void *interface_b)
 
 static void display_variable_interface(starpu_data_handle handle, FILE *f)
 {
-	starpu_variable_interface_t *interface =
+	starpu_variable_interface_t *variable_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
-	fprintf(f, "%ld\t", (long)interface->elemsize);
+	fprintf(f, "%ld\t", (long)variable_interface->elemsize);
 }
 
 static size_t variable_interface_get_size(starpu_data_handle handle)
 {
-	starpu_variable_interface_t *interface =
+	starpu_variable_interface_t *variable_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
-	return interface->elemsize;
+	return variable_interface->elemsize;
 }
 
 uintptr_t starpu_variable_get_local_ptr(starpu_data_handle handle)
@@ -181,15 +192,15 @@ size_t starpu_variable_get_elemsize(starpu_data_handle handle)
 /* memory allocation/deallocation primitives for the variable interface */
 
 /* returns the size of the allocated area */
-static ssize_t allocate_variable_buffer_on_node(void *interface_, uint32_t dst_node)
+static ssize_t allocate_variable_buffer_on_node(void *data_interface_, uint32_t dst_node)
 {
-	starpu_variable_interface_t *interface = interface_;
+	starpu_variable_interface_t *variable_interface = data_interface_;
 
 	unsigned fail = 0;
 	uintptr_t addr = 0;
 	ssize_t allocated_memory;
 
-	size_t elemsize = interface->elemsize;
+	size_t elemsize = variable_interface->elemsize;
 
 	starpu_node_kind kind = _starpu_get_node_kind(dst_node);
 
@@ -239,26 +250,26 @@ static ssize_t allocate_variable_buffer_on_node(void *interface_, uint32_t dst_n
 	allocated_memory = elemsize;
 
 	/* update the data properly in consequence */
-	interface->ptr = addr;
+	variable_interface->ptr = addr;
 	
 	return allocated_memory;
 }
 
-static void free_variable_buffer_on_node(void *interface, uint32_t node)
+static void free_variable_buffer_on_node(void *data_interface, uint32_t node)
 {
 	starpu_node_kind kind = _starpu_get_node_kind(node);
 	switch(kind) {
 		case STARPU_CPU_RAM:
-			free((void*)STARPU_VARIABLE_GET_PTR(interface));
+			free((void*)STARPU_VARIABLE_GET_PTR(data_interface));
 			break;
 #ifdef STARPU_USE_CUDA
 		case STARPU_CUDA_RAM:
-			cudaFree((void*)STARPU_VARIABLE_GET_PTR(interface));
+			cudaFree((void*)STARPU_VARIABLE_GET_PTR(data_interface));
 			break;
 #endif
 #ifdef STARPU_USE_OPENCL
                 case STARPU_OPENCL_RAM:
-                        clReleaseMemObject((void*)STARPU_VARIABLE_GET_PTR(interface));
+                        clReleaseMemObject((void*)STARPU_VARIABLE_GET_PTR(data_interface));
                         break;
 #endif
 		default:
@@ -267,8 +278,8 @@ static void free_variable_buffer_on_node(void *interface, uint32_t node)
 }
 
 #ifdef STARPU_USE_CUDA
-static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__((unused)),
-				void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind)
+static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
 {
 	starpu_variable_interface_t *src_variable = src_interface;
 	starpu_variable_interface_t *dst_variable = dst_interface;
@@ -285,26 +296,50 @@ static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__
 }
 
 
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)),
-				void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
 }
 
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)),
-				void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
 }
 
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)),
-				void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
+	if (src_node == dst_node)
+	{
+		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
+	}
+	else {
+#ifdef HAVE_CUDA_MEMCPY_PEER
+		int src_dev = starpu_memory_node_to_devid(src_node);
+		int dst_dev = starpu_memory_node_to_devid(dst_node);
+
+		starpu_variable_interface_t *src_variable = src_interface;
+		starpu_variable_interface_t *dst_variable = dst_interface;
+
+		cudaError_t cures;
+		cures = cudaMemcpyPeer((char *)dst_variable->ptr, dst_dev, (char *)src_variable->ptr, src_dev, src_variable->elemsize);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+
+		STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
+
+#else
+		/* This is illegal without support for cudaMemcpyPeer */
+		STARPU_ABORT();
+#endif
+		return 0;
+	}
 }
 
-static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)),
-					void *dst_interface, unsigned dst_node __attribute__((unused)),
+static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED,
 					cudaStream_t stream, enum cudaMemcpyKind kind)
 {
 	starpu_variable_interface_t *src_variable = src_interface;
@@ -329,22 +364,63 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 }
 
 
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)),
-					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
 {
 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
 }
 
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)),
-					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
 {
 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
 }
+
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,					void *dst_interface, unsigned dst_node, cudaStream_t stream)
+{
+	if (src_node == dst_node)
+	{
+		return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToDevice);
+	}
+	else {
+#ifdef HAVE_CUDA_MEMCPY_PEER
+		int src_dev = starpu_memory_node_to_devid(src_node);
+		int dst_dev = starpu_memory_node_to_devid(dst_node);
+
+		starpu_variable_interface_t *src_variable = src_interface;
+		starpu_variable_interface_t *dst_variable = dst_interface;
+
+		size_t length = src_variable->elemsize;
+
+		cudaError_t cures;
+		cures = cudaMemcpyPeerAsync((char *)dst_variable->ptr, dst_dev, (char *)src_variable->ptr, src_dev, length, stream);
+		if (cures)
+		{
+			/* synchronous fallback */
+			cures = cudaMemcpyPeer((char *)dst_variable->ptr, dst_dev, (char *)src_variable->ptr, src_dev, length);
+			if (STARPU_UNLIKELY(cures))
+				STARPU_CUDA_REPORT_ERROR(cures);
+
+			return 0;
+		}
+
+		STARPU_TRACE_DATA_COPY(src_node, dst_node, length);
+
+		return -EAGAIN;
+#else
+		/* This is illegal without cudaMemcpyPeer */
+		STARPU_ABORT();
+		return 0;
+#endif
+	}
+}
+
+
 #endif // STARPU_USE_CUDA
 
 #ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface,
-                                    unsigned dst_node __attribute__((unused)), void *_event)
+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface,
+                                    unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
 {
 	starpu_variable_interface_t *src_variable = src_interface;
 	starpu_variable_interface_t *dst_variable = dst_interface;
@@ -360,7 +436,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node __att
 	return ret;
 }
 
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event)
+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
 {
 	starpu_variable_interface_t *src_variable = src_interface;
 	starpu_variable_interface_t *dst_variable = dst_interface;
@@ -377,17 +453,17 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node __att
 	return ret;
 }
 
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
         return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
 }
 
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
         return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
 }
 
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 	cl_int err;
 
@@ -412,7 +488,7 @@ static int copy_opencl_to_opencl(void *src_interface, unsigned src_node __attrib
 
 #endif
 
-static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 	starpu_variable_interface_t *src_variable = src_interface;
 	starpu_variable_interface_t *dst_variable = dst_interface;

+ 5 - 5
src/datawizard/interfaces/vector_filters.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,7 +20,7 @@
 #include <common/config.h>
 #include <datawizard/filters.h>
 
-void starpu_block_filter_func_vector(void *father_interface, void *child_interface, __attribute__((unused)) struct starpu_data_filter *f, unsigned id, unsigned nchunks)
+void starpu_block_filter_func_vector(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, unsigned nchunks)
 {
         starpu_vector_interface_t *vector_father = father_interface;
         starpu_vector_interface_t *vector_child = child_interface;
@@ -47,7 +47,7 @@ void starpu_block_filter_func_vector(void *father_interface, void *child_interfa
 }
 
 
-void starpu_vector_divide_in_2_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, __attribute__((unused)) unsigned nchunks)
+void starpu_vector_divide_in_2_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, STARPU_ATTRIBUTE_UNUSED unsigned nchunks)
 {
         /* there cannot be more than 2 chunks */
         STARPU_ASSERT(id < 2);
@@ -88,7 +88,7 @@ void starpu_vector_divide_in_2_filter_func(void *father_interface, void *child_i
 }
 
 
-void starpu_vector_list_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, __attribute__((unused)) unsigned nchunks)
+void starpu_vector_list_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, STARPU_ATTRIBUTE_UNUSED unsigned nchunks)
 {
         starpu_vector_interface_t *vector_father = father_interface;
         starpu_vector_interface_t *vector_child = child_interface;
@@ -107,7 +107,7 @@ void starpu_vector_list_filter_func(void *father_interface, void *child_interfac
 	if (vector_father->ptr) {
 	  /* compute the current position */
 	  unsigned i;
-	  for (i = 0; i <= id; i++) 
+	  for (i = 0; i < id; i++) 
 	    current_pos += length_tab[i];
 	  
 	  vector_child->ptr = vector_father->ptr + current_pos*elemsize;

+ 154 - 71
src/datawizard/interfaces/vector_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,20 +25,21 @@
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 
-static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node);
+static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
 #ifdef STARPU_USE_CUDA
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node);
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node);
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, cudaStream_t stream);
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, cudaStream_t stream);
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cudaStream_t stream);
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cudaStream_t stream);
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,					void *dst_interface, unsigned dst_node, cudaStream_t stream);
 #endif
 #ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node);
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node);
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node);
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, void *_event);
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, void *_event);
+static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
+static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
+static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, void *_event);
+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, void *_event);
 #endif
 
 static const struct starpu_data_copy_methods vector_copy_data_methods_s = {
@@ -50,6 +51,7 @@ static const struct starpu_data_copy_methods vector_copy_data_methods_s = {
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
 	.cuda_to_cuda = copy_cuda_to_cuda,
+	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
@@ -64,20 +66,22 @@ static const struct starpu_data_copy_methods vector_copy_data_methods_s = {
 	.spu_to_spu = NULL
 };
 
-static void register_vector_handle(starpu_data_handle handle, uint32_t home_node, void *interface);
-static ssize_t allocate_vector_buffer_on_node(void *interface_, uint32_t dst_node);
-static void free_vector_buffer_on_node(void *interface, uint32_t node);
+static void register_vector_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
+static ssize_t allocate_vector_buffer_on_node(void *data_interface_, uint32_t dst_node);
+static void *vector_handle_to_pointer(starpu_data_handle data_handle, uint32_t node);
+static void free_vector_buffer_on_node(void *data_interface, uint32_t node);
 static size_t vector_interface_get_size(starpu_data_handle handle);
 static uint32_t footprint_vector_interface_crc32(starpu_data_handle handle);
-static int vector_compare(void *interface_a, void *interface_b);
+static int vector_compare(void *data_interface_a, void *data_interface_b);
 static void display_vector_interface(starpu_data_handle handle, FILE *f);
 #ifdef STARPU_USE_GORDON
-static int convert_vector_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss); 
+static int convert_vector_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss); 
 #endif
 
 static struct starpu_data_interface_ops_t interface_vector_ops = {
 	.register_data_handle = register_vector_handle,
 	.allocate_data_on_node = allocate_vector_buffer_on_node,
+	.handle_to_pointer = vector_handle_to_pointer,
 	.free_data_on_node = free_vector_buffer_on_node,
 	.copy_methods = &vector_copy_data_methods_s,
 	.get_size = vector_interface_get_size,
@@ -91,9 +95,19 @@ static struct starpu_data_interface_ops_t interface_vector_ops = {
 	.display = display_vector_interface
 };
 
-static void register_vector_handle(starpu_data_handle handle, uint32_t home_node, void *interface)
+static void *vector_handle_to_pointer(starpu_data_handle handle, uint32_t node)
 {
-	starpu_vector_interface_t *vector_interface = interface;
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	starpu_vector_interface_t *vector_interface =
+		starpu_data_get_interface_on_node(handle, node);
+
+	return (void*) vector_interface->ptr;
+}
+
+static void register_vector_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
+{
+	starpu_vector_interface_t *vector_interface = data_interface;
 
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
@@ -118,7 +132,7 @@ static void register_vector_handle(starpu_data_handle handle, uint32_t home_node
 }
 
 #ifdef STARPU_USE_GORDON
-int convert_vector_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss) 
+int convert_vector_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss) 
 {
 	starpu_vector_interface_t *vector_interface = interface;
 	
@@ -150,10 +164,10 @@ static uint32_t footprint_vector_interface_crc32(starpu_data_handle handle)
 	return _starpu_crc32_be(starpu_vector_get_nx(handle), 0);
 }
 
-static int vector_compare(void *interface_a, void *interface_b)
+static int vector_compare(void *data_interface_a, void *data_interface_b)
 {
-	starpu_vector_interface_t *vector_a = interface_a;
-	starpu_vector_interface_t *vector_b = interface_b;
+	starpu_vector_interface_t *vector_a = data_interface_a;
+	starpu_vector_interface_t *vector_b = data_interface_b;
 
 	/* Two vectors are considered compatible if they have the same size */
 	return ((vector_a->nx == vector_b->nx)
@@ -162,19 +176,19 @@ static int vector_compare(void *interface_a, void *interface_b)
 
 static void display_vector_interface(starpu_data_handle handle, FILE *f)
 {
-	starpu_vector_interface_t *interface =
+	starpu_vector_interface_t *vector_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
-	fprintf(f, "%u\t", interface->nx);
+	fprintf(f, "%u\t", vector_interface->nx);
 }
 
 static size_t vector_interface_get_size(starpu_data_handle handle)
 {
 	size_t size;
-	starpu_vector_interface_t *interface =
+	starpu_vector_interface_t *vector_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
-	size = interface->nx*interface->elemsize;
+	size = vector_interface->nx*vector_interface->elemsize;
 
 	return size;
 }
@@ -182,10 +196,10 @@ static size_t vector_interface_get_size(starpu_data_handle handle)
 /* offer an access to the data parameters */
 uint32_t starpu_vector_get_nx(starpu_data_handle handle)
 {
-	starpu_vector_interface_t *interface =
+	starpu_vector_interface_t *vector_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
-	return interface->nx;
+	return vector_interface->nx;
 }
 
 uintptr_t starpu_vector_get_local_ptr(starpu_data_handle handle)
@@ -195,33 +209,33 @@ uintptr_t starpu_vector_get_local_ptr(starpu_data_handle handle)
 
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-	starpu_vector_interface_t *interface =
+	starpu_vector_interface_t *vector_interface =
 		starpu_data_get_interface_on_node(handle, node);
 
-	return interface->ptr;
+	return vector_interface->ptr;
 }
 
 size_t starpu_vector_get_elemsize(starpu_data_handle handle)
 {
-	starpu_vector_interface_t *interface =
+	starpu_vector_interface_t *vector_interface =
 		starpu_data_get_interface_on_node(handle, 0);
 
-	return interface->elemsize;
+	return vector_interface->elemsize;
 }
 
 /* memory allocation/deallocation primitives for the vector interface */
 
 /* returns the size of the allocated area */
-static ssize_t allocate_vector_buffer_on_node(void *interface_, uint32_t dst_node)
+static ssize_t allocate_vector_buffer_on_node(void *data_interface_, uint32_t dst_node)
 {
-	starpu_vector_interface_t *interface = interface_;
+	starpu_vector_interface_t *vector_interface = data_interface_;
 
 	unsigned fail = 0;
 	uintptr_t addr = 0;
 	ssize_t allocated_memory;
 
-	uint32_t nx = interface->nx;
-	size_t elemsize = interface->elemsize;
+	uint32_t nx = vector_interface->nx;
+	size_t elemsize = vector_interface->elemsize;
 
 	starpu_node_kind kind = _starpu_get_node_kind(dst_node);
 
@@ -271,16 +285,20 @@ static ssize_t allocate_vector_buffer_on_node(void *interface_, uint32_t dst_nod
 	allocated_memory = nx*elemsize;
 
 	/* update the data properly in consequence */
-	interface->ptr = addr;
-        interface->dev_handle = addr;
-        interface->offset = 0;
+	vector_interface->ptr = addr;
+        vector_interface->dev_handle = addr;
+        vector_interface->offset = 0;
 	
 	return allocated_memory;
 }
 
-static void free_vector_buffer_on_node(void *interface, uint32_t node)
+static void free_vector_buffer_on_node(void *data_interface, uint32_t node)
 {
-	starpu_vector_interface_t *vector_interface = interface;
+	starpu_vector_interface_t *vector_interface = data_interface;
+
+#ifdef STARPU_USE_CUDA
+	cudaError_t cures;
+#endif
 
 	starpu_node_kind kind = _starpu_get_node_kind(node);
 	switch(kind) {
@@ -289,7 +307,8 @@ static void free_vector_buffer_on_node(void *interface, uint32_t node)
 			break;
 #ifdef STARPU_USE_CUDA
 		case STARPU_CUDA_RAM:
-			cudaFree((void*)vector_interface->ptr);
+			cures = cudaFree((void*)vector_interface->ptr);
+			STARPU_ASSERT(cures == cudaSuccess);
 			break;
 #endif
 #ifdef STARPU_USE_OPENCL
@@ -303,13 +322,14 @@ static void free_vector_buffer_on_node(void *interface, uint32_t node)
 }
 
 #ifdef STARPU_USE_CUDA
-static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__((unused)),
-				void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind)
+static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
 {
 	starpu_vector_interface_t *src_vector = src_interface;
 	starpu_vector_interface_t *dst_vector = dst_interface;
 
 	cudaError_t cures;
+
 	cures = cudaMemcpy((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
@@ -319,33 +339,80 @@ static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__
 	return 0;
 }
 
+#ifdef HAVE_CUDA_MEMCPY_PEER
+static int copy_cuda_peer_common(void *src_interface, unsigned src_node,
+				void *dst_interface, unsigned dst_node,
+				int is_async, cudaStream_t stream)
+{
+	cudaError_t cures;
 
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)),
-				void *dst_interface, unsigned dst_node __attribute__((unused)))
+	starpu_vector_interface_t *src_vector = src_interface;
+	starpu_vector_interface_t *dst_vector = dst_interface;
+
+	size_t length = src_vector->nx*src_vector->elemsize;
+
+	int src_dev = starpu_memory_node_to_devid(src_node);
+	int dst_dev = starpu_memory_node_to_devid(dst_node);
+
+	if (is_async)
+	{
+		cures = cudaMemcpyPeerAsync((char *)dst_vector->ptr, dst_dev,
+						(char *)src_vector->ptr, src_dev,
+						length, stream);
+		if (!cures)
+			return -EAGAIN;
+	}
+
+	cures = cudaMemcpyPeer((char *)dst_vector->ptr, dst_dev,
+				(char *)src_vector->ptr, src_dev, length);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, length);
+
+	return 0;
+}
+#endif
+
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
 }
 
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)),
-				void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
 }
 
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)),
-				void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
+	if (src_node == dst_node)
+	{
+		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
+	}
+	else {
+#ifdef HAVE_CUDA_MEMCPY_PEER
+		return copy_cuda_peer_common(src_interface, src_node, dst_interface, dst_node, 0, 0);
+#else
+		/* This is illegal without cudaMemcpyPeer */
+		STARPU_ABORT();
+		return 0;
+#endif
+	}
 }
 
-static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)),
-					void *dst_interface, unsigned dst_node __attribute__((unused)),
+static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED,
 					cudaStream_t stream, enum cudaMemcpyKind kind)
 {
 	starpu_vector_interface_t *src_vector = src_interface;
 	starpu_vector_interface_t *dst_vector = dst_interface;
 
 	cudaError_t cures;
+
 	cures = cudaMemcpyAsync((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind, stream);
 	if (cures)
 	{
@@ -362,15 +429,31 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 	return -EAGAIN;
 }
 
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,					void *dst_interface, unsigned dst_node, cudaStream_t stream)
+{
+	if (src_node == dst_node)
+	{
+		return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToDevice);
+	}
+	else {
+#ifdef HAVE_CUDA_MEMCPY_PEER
+		return copy_cuda_peer_common(src_interface, src_node, dst_interface, dst_node, 1, stream);
+#else
+		/* This is illegal without cudaMemcpyPeer */
+		STARPU_ABORT();
+		return 0;
+#endif
+	}
+}
 
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)),
-					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
 {
 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
 }
 
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)),
-					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
 {
 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
 }
@@ -378,8 +461,8 @@ static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attri
 #endif // STARPU_USE_CUDA
 
 #ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node __attribute__((unused)),
-                                    void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event)
+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+                                    void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
 {
 	starpu_vector_interface_t *src_vector = src_interface;
 	starpu_vector_interface_t *dst_vector = dst_interface;
@@ -396,8 +479,8 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node __att
 	return ret;
 }
 
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)),
-                                    void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event)
+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+                                    void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
 {
 	starpu_vector_interface_t *src_vector = src_interface;
 	starpu_vector_interface_t *dst_vector = dst_interface;
@@ -413,20 +496,20 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node __att
 	return ret;
 }
 
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)),
-                              void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+                              void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
         return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
 }
 
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)),
-				void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
         return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
 }
 
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)),
-                              void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+                              void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
         int err;
 
@@ -450,8 +533,8 @@ static int copy_opencl_to_opencl(void *src_interface, unsigned src_node __attrib
 
 #endif // STARPU_USE_OPENCL
 
-static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)),
-					void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 	starpu_vector_interface_t *src_vector = src_interface;
 	starpu_vector_interface_t *dst_vector = dst_interface;

+ 32 - 30
src/datawizard/interfaces/void_interface.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -38,8 +39,10 @@ static const struct starpu_data_copy_methods void_copy_data_methods_s = {
 #ifdef STARPU_USE_CUDA
 	.ram_to_cuda = dummy_copy,
 	.cuda_to_ram = dummy_copy,
+	.cuda_to_cuda = dummy_copy,
 	.ram_to_cuda_async = dummy_cuda_copy_async,
 	.cuda_to_ram_async = dummy_cuda_copy_async,
+	.cuda_to_cuda_async = dummy_cuda_copy_async,
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = dummy_copy,
@@ -47,19 +50,18 @@ static const struct starpu_data_copy_methods void_copy_data_methods_s = {
         .ram_to_opencl_async = dummy_opencl_copy_async,
 	.opencl_to_ram_async = dummy_opencl_copy_async,
 #endif
-	.cuda_to_cuda = dummy_copy,
 	.cuda_to_spu = dummy_copy,
 	.spu_to_ram = dummy_copy,
 	.spu_to_cuda = dummy_copy,
 	.spu_to_spu = dummy_copy
 };
 
-static void register_void_handle(starpu_data_handle handle, uint32_t home_node, void *interface);
-static ssize_t allocate_void_buffer_on_node(void *interface_, uint32_t dst_node);
-static void free_void_buffer_on_node(void *interface, uint32_t node);
+static void register_void_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
+static ssize_t allocate_void_buffer_on_node(void *data_interface_, uint32_t dst_node);
+static void free_void_buffer_on_node(void *data_interface, uint32_t node);
 static size_t void_interface_get_size(starpu_data_handle handle);
 static uint32_t footprint_void_interface_crc32(starpu_data_handle handle);
-static int void_compare(void *interface_a, void *interface_b);
+static int void_compare(void *data_interface_a, void *data_interface_b);
 static void display_void_interface(starpu_data_handle handle, FILE *f);
 
 static struct starpu_data_interface_ops_t interface_void_ops = {
@@ -75,9 +77,9 @@ static struct starpu_data_interface_ops_t interface_void_ops = {
 	.display = display_void_interface
 };
 
-static void register_void_handle(starpu_data_handle handle __attribute__((unused)),
-				uint32_t home_node __attribute__((unused)),
-				void *interface __attribute__((unused)))
+static void register_void_handle(starpu_data_handle handle STARPU_ATTRIBUTE_UNUSED,
+				uint32_t home_node STARPU_ATTRIBUTE_UNUSED,
+				void *data_interface STARPU_ATTRIBUTE_UNUSED)
 {
 	/* Since there is no real data to register, we don't do anything */
 }
@@ -89,25 +91,25 @@ void starpu_void_data_register(starpu_data_handle *handleptr)
 }
 
 
-static uint32_t footprint_void_interface_crc32(starpu_data_handle handle __attribute__((unused)))
+static uint32_t footprint_void_interface_crc32(starpu_data_handle handle STARPU_ATTRIBUTE_UNUSED)
 {
 	return 0;
 }
 
-static int void_compare(void *interface_a __attribute__((unused)),
-			void *interface_b __attribute__((unused)))
+static int void_compare(void *data_interface_a STARPU_ATTRIBUTE_UNUSED,
+			void *data_interface_b STARPU_ATTRIBUTE_UNUSED)
 {
 	/* There is no allocation required, and therefore nothing to cache
 	 * anyway. */
 	return 1;
 }
 
-static void display_void_interface(starpu_data_handle handle __attribute__((unused)), FILE *f)
+static void display_void_interface(starpu_data_handle handle STARPU_ATTRIBUTE_UNUSED, FILE *f)
 {
 	fprintf(f, "void\t");
 }
 
-static size_t void_interface_get_size(starpu_data_handle handle __attribute__((unused)))
+static size_t void_interface_get_size(starpu_data_handle handle STARPU_ATTRIBUTE_UNUSED)
 {
 	return 0;
 }
@@ -115,32 +117,32 @@ static size_t void_interface_get_size(starpu_data_handle handle __attribute__((u
 /* memory allocation/deallocation primitives for the void interface */
 
 /* returns the size of the allocated area */
-static ssize_t allocate_void_buffer_on_node(void *interface __attribute__((unused)),
-					uint32_t dst_node __attribute__((unused)))
+static ssize_t allocate_void_buffer_on_node(void *data_interface STARPU_ATTRIBUTE_UNUSED,
+					uint32_t dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 	/* Successfully allocated 0 bytes */
 	return 0;
 }
 
-static void free_void_buffer_on_node(void *interface __attribute__((unused)) ,
-					uint32_t node __attribute__((unused)))
+static void free_void_buffer_on_node(void *data_interface STARPU_ATTRIBUTE_UNUSED ,
+					uint32_t node STARPU_ATTRIBUTE_UNUSED)
 {
 	/* There is no buffer actually */
 }
 
-static int dummy_copy(void *src_interface __attribute__((unused)),
-			unsigned src_node __attribute__((unused)),
-			void *dst_interface __attribute__((unused)),
-			unsigned dst_node __attribute__((unused)))
+static int dummy_copy(void *src_interface STARPU_ATTRIBUTE_UNUSED,
+			unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+			void *dst_interface STARPU_ATTRIBUTE_UNUSED,
+			unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 	return 0;
 }
 
 #ifdef STARPU_USE_CUDA
-static int dummy_cuda_copy_async(void *src_interface __attribute__((unused)),
-				unsigned src_node __attribute__((unused)),
-				void *dst_interface __attribute__((unused)),
-				unsigned dst_node __attribute__((unused)),
+static int dummy_cuda_copy_async(void *src_interface STARPU_ATTRIBUTE_UNUSED,
+				unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+				void *dst_interface STARPU_ATTRIBUTE_UNUSED,
+				unsigned dst_node STARPU_ATTRIBUTE_UNUSED,
 				cudaStream_t stream __attribute__ ((unused)))
 {
 	return 0;
@@ -148,11 +150,11 @@ static int dummy_cuda_copy_async(void *src_interface __attribute__((unused)),
 #endif // STARPU_USE_CUDA
 
 #ifdef STARPU_USE_OPENCL
-static int dummy_opencl_copy_async(void *src_interface __attribute__((unused)),
-					unsigned src_node __attribute__((unused)),
-					void *dst_interface __attribute__((unused)),
-					unsigned dst_node __attribute__((unused)),
-					void *_event __attribute__((unused)))
+static int dummy_opencl_copy_async(void *src_interface STARPU_ATTRIBUTE_UNUSED,
+					unsigned src_node STARPU_ATTRIBUTE_UNUSED,
+					void *dst_interface STARPU_ATTRIBUTE_UNUSED,
+					unsigned dst_node STARPU_ATTRIBUTE_UNUSED,
+					void *_event STARPU_ATTRIBUTE_UNUSED)
 {
 	return 0;
 }

+ 41 - 3
src/datawizard/memalloc.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -134,7 +134,9 @@ static void transfer_subtree_to_node(starpu_data_handle handle, unsigned src_nod
 			src_replicate->state = STARPU_INVALID;
 			dst_replicate->state = STARPU_OWNER;
 
+#ifdef STARPU_DEVEL
 #warning we should use requests during memory reclaim
+#endif
 			/* TODO use request !! */
 			src_replicate->refcnt++;
 			dst_replicate->refcnt++;
@@ -201,7 +203,9 @@ static size_t free_memory_on_node(starpu_mem_chunk_t mc, uint32_t node)
 //	while (_starpu_spin_trylock(&handle->header_lock))
 //		_starpu_datawizard_progress(_starpu_get_local_memory_node());
 
+#ifdef STARPU_DEVEL
 #warning can we block here ?
+#endif
 //	_starpu_spin_lock(&handle->header_lock);
 
 	if (mc->automatically_allocated && 
@@ -210,6 +214,18 @@ static size_t free_memory_on_node(starpu_mem_chunk_t mc, uint32_t node)
 		if (handle && !data_was_deleted)
 			STARPU_ASSERT(replicate->allocated);
 
+#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER)
+		if (_starpu_get_node_kind(node) == STARPU_CUDA_RAM)
+		{
+			/* To simplify the design of data interfaces, we select
+			 * the proper CUDA device here in case it is needed. This
+			 * avoids having to set it again in the free method of
+			 * each interface. */
+			cudaError_t err = cudaSetDevice(starpu_memory_node_to_devid(node));
+			STARPU_ASSERT(err == cudaSuccess);
+		}
+#endif
+
 		mc->ops->free_data_on_node(mc->chunk_interface, node);
 
 		if (handle && !data_was_deleted)
@@ -379,8 +395,8 @@ static unsigned try_to_reuse_mem_chunk(starpu_mem_chunk_t mc, unsigned node, sta
 	return success;
 }
 
-static int _starpu_data_interface_compare(void *interface_a, struct starpu_data_interface_ops_t *ops_a,
-						void *interface_b, struct starpu_data_interface_ops_t *ops_b)
+static int _starpu_data_interface_compare(void *data_interface_a, struct starpu_data_interface_ops_t *ops_a,
+                                          void *data_interface_b, struct starpu_data_interface_ops_t *ops_b)
 {
 	if (ops_a->interfaceid != ops_b->interfaceid)
 		return -1;
@@ -674,6 +690,19 @@ static ssize_t _starpu_allocate_interface(starpu_data_handle handle, struct star
 
 		STARPU_TRACE_START_ALLOC(dst_node);
 		STARPU_ASSERT(replicate->data_interface);
+
+#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER)
+		if (_starpu_get_node_kind(dst_node) == STARPU_CUDA_RAM)
+		{
+			/* To simplify the design of data interfaces, we select
+			 * the proper CUDA device here in case it is needed. This
+			 * avoids having to set it again in the allocation method
+			 * of each interface. */
+			cudaError_t err = cudaSetDevice(starpu_memory_node_to_devid(dst_node));
+			STARPU_ASSERT(err == cudaSuccess);
+		}
+#endif
+
 		allocated_memory = handle->ops->allocate_data_on_node(replicate->data_interface, dst_node);
 		STARPU_TRACE_END_ALLOC(dst_node);
 
@@ -721,6 +750,15 @@ int _starpu_allocate_memory_on_node(starpu_data_handle handle, struct starpu_dat
 	replicate->allocated = 1;
 	replicate->automatically_allocated = 1;
 
+	if (dst_node == 0)
+	{
+		void *ptr = starpu_handle_to_pointer(handle, 0);
+		if (ptr != NULL)
+		{
+			_starpu_data_register_ram_pointer(handle, ptr);
+		}
+	}
+
 	return 0;
 }
 

+ 8 - 1
src/datawizard/memory_nodes.c

@@ -81,12 +81,17 @@ inline starpu_node_kind _starpu_get_node_kind(uint32_t node)
 	return descr.nodes[node];
 }
 
+int starpu_memory_node_to_devid(unsigned node)
+{
+	return descr.devid[node];
+}
+
 unsigned _starpu_get_memory_nodes_count(void)
 {
 	return descr.nnodes;
 }
 
-unsigned _starpu_register_memory_node(starpu_node_kind kind)
+unsigned _starpu_register_memory_node(starpu_node_kind kind, int devid)
 {
 	unsigned nnodes;
 	/* ATOMIC_ADD returns the new value ... */
@@ -95,6 +100,8 @@ unsigned _starpu_register_memory_node(starpu_node_kind kind)
 	descr.nodes[nnodes-1] = kind;
 	STARPU_TRACE_NEW_MEM_NODE(nnodes-1);
 
+	descr.devid[nnodes-1] = devid;
+
 	/* for now, there is no condition associated to that newly created node */
 	descr.condition_count[nnodes-1] = 0;
 

+ 5 - 1
src/datawizard/memory_nodes.h

@@ -46,6 +46,9 @@ typedef struct {
 	unsigned nnodes;
 	starpu_node_kind nodes[STARPU_MAXNODES];
 
+	/* Device id associated with this node, or -1 if not applicable */
+	int devid[STARPU_MAXNODES];
+
 	// TODO move these 2 lists outside starpu_mem_node_descr
 	/* Every worker is associated to a condition variable on which the
 	 * worker waits when there is task available. It is possible that
@@ -65,11 +68,12 @@ void _starpu_init_memory_nodes(void);
 void _starpu_deinit_memory_nodes(void);
 void _starpu_set_local_memory_node_key(unsigned *node);
 unsigned _starpu_get_local_memory_node(void);
-unsigned _starpu_register_memory_node(starpu_node_kind kind);
+unsigned _starpu_register_memory_node(starpu_node_kind kind, int devid);
 //void _starpu_memory_node_attach_queue(struct starpu_jobq_s *q, unsigned nodeid);
 void _starpu_memory_node_register_condition(pthread_cond_t *cond, pthread_mutex_t *mutex, unsigned memory_node);
 
 starpu_node_kind _starpu_get_node_kind(uint32_t node);
+int starpu_memory_node_to_devid(unsigned node);
 unsigned _starpu_get_memory_nodes_count(void);
 
 starpu_mem_node_descr *_starpu_get_memory_node_description(void);

+ 2 - 1
src/datawizard/sort_data_handles.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,6 +19,7 @@
 #include <common/config.h>
 
 #include <datawizard/filters.h>
+#include <datawizard/sort_data_handles.h>
 
 /* To avoid deadlocks in case we have multiple tasks accessing the same piece
  * of data  (eg. task T1 needs A and B, and T2 needs B and A), we need to lock

+ 23 - 4
src/datawizard/user_interactions.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -31,7 +31,7 @@ int starpu_data_request_allocation(starpu_data_handle handle, uint32_t node)
 
 	STARPU_ASSERT(handle);
 
-	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, 0, 0, 1);
+	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, 0, 0);
 
 	/* we do not increase the refcnt associated to the request since we are
 	 * not waiting for its termination */
@@ -125,7 +125,9 @@ int starpu_data_acquire_cb(starpu_data_handle handle,
 	PTHREAD_MUTEX_INIT(&wrapper->lock, NULL);
 	wrapper->finished = 0;
 
+#ifdef STARPU_DEVEL
 #warning TODO instead of having the is_prefetch argument, _starpu_fetch_data shoud consider two flags: async and detached
+#endif
 	_starpu_spin_lock(&handle->header_lock);
 	handle->per_node[0].refcnt++;
 	_starpu_spin_unlock(&handle->header_lock);
@@ -305,6 +307,7 @@ static void _prefetch_data_on_node(void *arg)
 
 }
 
+static
 int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle handle, unsigned node, unsigned async, starpu_access_mode mode)
 {
 	STARPU_ASSERT(handle);
@@ -413,7 +416,9 @@ void starpu_data_set_default_sequential_consistency_flag(unsigned flag)
 /* Query the status of the handle on the specified memory node. */
 void starpu_data_query_status(starpu_data_handle handle, int memory_node, int *is_allocated, int *is_valid, int *is_requested)
 {
+#ifdef STARPU_DEVEL
 #warning FIXME
+#endif
 //	_starpu_spin_lock(&handle->header_lock);
 
 	if (is_allocated)
@@ -423,7 +428,21 @@ void starpu_data_query_status(starpu_data_handle handle, int memory_node, int *i
 		*is_valid = (handle->per_node[memory_node].state != STARPU_INVALID);
 
 	if (is_requested)
-		*is_requested = handle->per_node[memory_node].requested;
+	{
+		int requested = 0;
+
+		unsigned node;
+		for (node = 0; node < STARPU_MAXNODES; node++)
+		{
+			if (handle->per_node[memory_node].requested[node])
+			{
+				requested = 1;
+				break;
+			}
+		}
+
+		*is_requested = requested;
+	}
 
 //	_starpu_spin_unlock(&handle->header_lock);
 }

+ 12 - 21
src/datawizard/write_back.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,6 +16,7 @@
  */
 
 #include <datawizard/datawizard.h>
+#include <datawizard/write_back.h>
 
 void _starpu_write_through_data(starpu_data_handle handle, uint32_t requesting_node, 
 					   uint32_t write_through_mask)
@@ -25,9 +26,6 @@ void _starpu_write_through_data(starpu_data_handle handle, uint32_t requesting_n
 		return;
 	}
 
-	while (_starpu_spin_trylock(&handle->header_lock))
-		_starpu_datawizard_progress(requesting_node, 1);
-
 	/* first commit all changes onto the nodes specified by the mask */
 	uint32_t node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
@@ -36,30 +34,23 @@ void _starpu_write_through_data(starpu_data_handle handle, uint32_t requesting_n
 			/* we need to commit the buffer on that node */
 			if (node != requesting_node) 
 			{
-				uint32_t handling_node =
-					_starpu_select_node_to_handle_request(requesting_node, node);
+				while (_starpu_spin_trylock(&handle->header_lock))
+					_starpu_datawizard_progress(requesting_node, 1);
 
 				starpu_data_request_t r;
+				r = create_request_to_fetch_data(handle, &handle->per_node[node],
+								STARPU_R, 0, NULL, NULL);
 
-				/* check that there is not already a similar
-				 * request that we should reuse */
-				r = _starpu_search_existing_data_request(&handle->per_node[node], STARPU_R);
-				if (!r) {
-					/* there was no existing request so we create one now */
-					r = _starpu_create_data_request(handle, &handle->per_node[requesting_node],
-							&handle->per_node[node], handling_node, STARPU_R, 0, 1);
-					_starpu_post_data_request(r, handling_node);
-				}
-				else {
-					/* if there is already a similar request, it is
-					 * useless to post another one */
-					_starpu_spin_unlock(&r->lock);
+			        /* If no request was created, the handle was already up-to-date on the
+			         * node */
+			        if (r)
+				{
+				        _starpu_spin_unlock(&handle->header_lock);
+        				_starpu_wait_data_request_completion(r, 1);
 				}
 			}
 		}
 	}
-
-	_starpu_spin_unlock(&handle->header_lock);
 }
 
 void starpu_data_set_wt_mask(starpu_data_handle handle, uint32_t wt_mask)

+ 7 - 6
src/debug/structures_size.c

@@ -19,19 +19,20 @@
 #include <core/workers.h>
 #include <datawizard/coherency.h>
 #include <profiling/bound.h>
+#include <debug/starpu_debug_helpers.h>
 
 void _starpu_debug_display_structures_size(void)
 {
-	fprintf(stderr, "struct starpu_task\t\t%d bytes\t(%x)\n",
+	fprintf(stderr, "struct starpu_task\t\t%u bytes\t(%x)\n",
 			(unsigned) sizeof(struct starpu_task), (unsigned) sizeof(struct starpu_task));
-	fprintf(stderr, "struct starpu_job_s\t\t%d bytes\t(%x)\n",
+	fprintf(stderr, "struct starpu_job_s\t\t%u bytes\t(%x)\n",
 			(unsigned) sizeof(struct starpu_job_s), (unsigned) sizeof(struct starpu_job_s));
-	fprintf(stderr, "struct starpu_data_state_t\t%d bytes\t(%x)\n",
+	fprintf(stderr, "struct starpu_data_state_t\t%u bytes\t(%x)\n",
 			(unsigned) sizeof(struct starpu_data_state_t), (unsigned) sizeof(struct starpu_data_state_t));
-	fprintf(stderr, "struct starpu_tag_s\t\t%d bytes\t(%x)\n",
+	fprintf(stderr, "struct starpu_tag_s\t\t%u bytes\t(%x)\n",
 			(unsigned) sizeof(struct starpu_tag_s), (unsigned) sizeof(struct starpu_tag_s));
-	fprintf(stderr, "struct starpu_cg_s\t\t%d bytes\t(%x)\n",
+	fprintf(stderr, "struct starpu_cg_s\t\t%u bytes\t(%x)\n",
 			(unsigned) sizeof(struct starpu_cg_s), (unsigned) sizeof(struct starpu_cg_s));
-	fprintf(stderr, "struct starpu_worker_s\t\t%d bytes\t(%x)\n",
+	fprintf(stderr, "struct starpu_worker_s\t\t%u bytes\t(%x)\n",
 			(unsigned) sizeof(struct starpu_worker_s), (unsigned) sizeof(struct starpu_worker_s));
 }

File diff suppressed because it is too large
+ 1252 - 0
src/debug/traces/starpu_fxt.c


+ 63 - 0
src/debug/traces/starpu_fxt.h

@@ -0,0 +1,63 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_FXT_H__
+#define __STARPU_FXT_H__
+
+#include <starpu.h>
+#include <starpu_config.h>
+#include <common/config.h>
+
+#ifdef STARPU_USE_FXT
+
+#include <search.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <common/fxt.h>
+#include <common/list.h>
+#include "../mpi/starpu_mpi_fxt.h"
+#include <starpu.h>
+
+#define FACTOR  100
+
+void starpu_fxt_dag_init(char *dag_filename);
+void starpu_fxt_dag_terminate(void);
+void starpu_fxt_dag_add_tag_deps(uint64_t child, uint64_t father);
+void starpu_fxt_dag_set_tag_done(uint64_t tag, const char *color);
+void starpu_fxt_dag_add_task_deps(unsigned long dep_prev, unsigned long dep_succ);
+void starpu_fxt_dag_set_task_done(unsigned long job_id, const char *label, const char *color);
+void starpu_fxt_dag_add_sync_point(void);
+
+/*
+ *	MPI
+ */
+
+int starpu_fxt_mpi_find_sync_point(char *filename_in, uint64_t *offset, int *key, int *rank);
+void starpu_fxt_mpi_add_send_transfer(int src, int dst, int mpi_tag, size_t size, float date);
+void starpu_fxt_mpi_add_recv_transfer(int src, int dst, int mpi_tag, float date);
+void starpu_fxt_display_mpi_transfers(struct starpu_fxt_options *options, int *ranks, FILE *out_paje_file);
+
+void starpu_fxt_write_paje_header(FILE *file);
+
+#endif // STARPU_USE_FXT
+
+#endif // __STARPU_FXT_H__

+ 107 - 0
src/debug/traces/starpu_fxt_dag.c

@@ -0,0 +1,107 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <common/config.h>
+
+#ifdef STARPU_USE_FXT
+
+#include "starpu_fxt.h"
+
+static FILE *out_file;
+static unsigned cluster_cnt;
+
+/* Start a new DOT graph in the file located at out_path. When out_path is
+ * NULL, out_file is set to NULL and nothing is written; the other
+ * starpu_fxt_dag_* functions test out_file before writing anything. */
+void starpu_fxt_dag_init(char *out_path)
+{
+	out_file = NULL;
+
+	if (out_path == NULL)
+		return;
+
+	/* create a new file */
+	out_file = fopen(out_path, "w+");
+	if (out_file == NULL)
+	{
+		fprintf(stderr,"error while opening %s\n", out_path);
+		perror("fopen");
+		exit(1);
+	}
+
+	cluster_cnt = 0;
+
+	/* Graph prologue */
+	fputs("digraph G {\n", out_file);
+	fputs("\tcolor=white\n", out_file);
+	fputs("\trankdir=LR;\n", out_file);
+
+	/* Create a new cluster */
+	fprintf(out_file, "subgraph cluster_%u {\n", cluster_cnt);
+	fputs("\tcolor=black;\n", out_file);
+}
+
+/* Close the last cluster and the graph, then close the output file. Does
+ * nothing when there is no output file (out_file is NULL). */
+void starpu_fxt_dag_terminate(void)
+{
+	if (!out_file)
+		return;
+
+	/* Close the last cluster */
+	fprintf(out_file, "}\n");
+	/* Close the graph */
+	fprintf(out_file, "}\n");
+	fclose(out_file);
+
+	/* Forget the closed stream, so that any later starpu_fxt_dag_* call
+	 * sees that there is no output file instead of writing to a dangling
+	 * FILE pointer */
+	out_file = NULL;
+}
+
+/* Write the edge going from tag father to tag child, if there is an output
+ * file. */
+void starpu_fxt_dag_add_tag_deps(uint64_t child, uint64_t father)
+{
+	if (!out_file)
+		return;
+
+	unsigned long long from = (unsigned long long)father;
+	unsigned long long to = (unsigned long long)child;
+
+	fprintf(out_file, "\t \"tag_%llx\"->\"tag_%llx\"\n", from, to);
+}
+
+/* Write the edge going from task dep_prev to task dep_succ, if there is an
+ * output file. */
+void starpu_fxt_dag_add_task_deps(unsigned long dep_prev, unsigned long dep_succ)
+{
+	if (!out_file)
+		return;
+
+	fprintf(out_file, "\t \"task_%lx\"->\"task_%lx\"\n", dep_prev, dep_succ);
+}
+
+/* Write the node of a tag as a filled node of the given color with an empty
+ * label, if there is an output file. */
+void starpu_fxt_dag_set_tag_done(uint64_t tag, const char *color)
+{
+	if (!out_file)
+		return;
+
+	unsigned long long tag_id = (unsigned long long)tag;
+
+	fprintf(out_file, "\t \"tag_%llx\" [ style=filled, label=\"\", color=\"%s\"]\n", tag_id, color);
+}
+
+/* Write the node of task job_id as a filled node with the given label and
+ * color, if there is an output file. */
+void starpu_fxt_dag_set_task_done(unsigned long job_id, const char *label, const char *color)
+{
+	if (!out_file)
+		return;
+
+	fprintf(out_file, "\t \"task_%lx\" [ style=filled, label=\"%s\", color=\"%s\"]\n", job_id, label, color);
+}
+
+/* Close the current cluster and open the next one, if there is an output
+ * file. */
+void starpu_fxt_dag_add_sync_point(void)
+{
+	if (out_file == NULL)
+		return;
+
+	/* Close the previous cluster */
+	fputs("}\n", out_file);
+
+	/* Create a new cluster, numbered right after the previous one */
+	fprintf(out_file, "subgraph cluster_%u {\n", ++cluster_cnt);
+	fputs("\tcolor=black;\n", out_file);
+}
+
+#endif /* STARPU_USE_FXT */

+ 239 - 0
src/debug/traces/starpu_fxt_mpi.c

@@ -0,0 +1,239 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <common/config.h>
+
+#ifdef STARPU_USE_FXT
+
+#include "starpu_fxt.h"
+
+/* Description of one MPI send or receive recorded from a trace. */
+struct mpi_transfer {
+	/* initialised to 0 when the transfer is recorded; set to 1 on a
+	 * receive once it has been paired with a send */
+	unsigned matched;
+	int other_rank; /* src for a recv, dest for a send */
+	int mpi_tag;
+	/* the functions recording transfers only take a size for sends */
+	size_t size;
+	float date;
+};
+
+/* Scan the trace file filename_in for the first FUT_MPI_BARRIER event.
+ * Returns 0 if a barrier is found, -1 otherwise. In case of success, offset is
+ * filled with the timestamp of the barrier, and rank and key with the first
+ * and third parameters of the event. */
+int starpu_fxt_mpi_find_sync_point(char *filename_in, uint64_t *offset, int *key, int *rank)
+{
+	STARPU_ASSERT(offset);
+
+	/* Open the trace file */
+	int fd_in = open(filename_in, O_RDONLY);
+	if (fd_in < 0)
+	{
+		perror("open failed :");
+		exit(-1);
+	}
+
+	static fxt_t fut;
+	fut = fxt_fdopen(fd_in);
+	if (!fut)
+	{
+		perror("fxt_fdopen :");
+		exit(-1);
+	}
+
+	fxt_blockev_t block = fxt_blockev_enter(fut);
+	struct fxt_ev_64 ev;
+	int func_ret = -1;
+
+	/* Walk through the events until the barrier shows up or no more event
+	 * can be read */
+	for (;;)
+	{
+		if (fxt_next_ev(block, FXT_EV_TYPE_64, (struct fxt_ev *)&ev) != FXT_EV_OK)
+		{
+			fprintf(stderr, "no more block ...\n");
+			break;
+		}
+
+		if (ev.code != FUT_MPI_BARRIER)
+			continue;
+
+		/* We found the sync point */
+		*offset = ev.time;
+		*rank = ev.param[0];
+		*key = ev.param[2];
+		func_ret = 0;
+		break;
+	}
+
+	/* Close the trace file */
+	if (close(fd_in) != 0)
+	{
+		perror("close failed :");
+		exit(-1);
+	}
+
+	return func_ret;
+}
+
+/*
+ *	Deal with the actual MPI transfers performed with the MPI lib
+ */
+
+/* the list of MPI transfers found in the different traces */
+static struct mpi_transfer *mpi_sends[64] = {NULL};
+static struct mpi_transfer *mpi_recvs[64] = {NULL};
+
+/* number of available slots in the lists  */
+unsigned mpi_sends_list_size[64] = {0};
+unsigned mpi_recvs_list_size[64] = {0};
+
+/* number of slots actually used in the list  */
+unsigned mpi_sends_used[64] = {0};
+unsigned mpi_recvs_used[64] = {0};
+
+/* number of slots already matched at the beginning of the list. This permits
+ * going through the lists from the beginning to match each and every
+ * transfer, thus avoiding a quadratic complexity. */
+unsigned mpi_recvs_matched[64] = {0};
+
+/* Record a send issued by rank src towards rank dst in the list of sends of
+ * src. The list starts with one slot and doubles each time it is full.
+ * src must be a valid index in the mpi_sends table. The program exits if the
+ * list cannot be enlarged. */
+void starpu_fxt_mpi_add_send_transfer(int src, int dst, int mpi_tag, size_t size, float date)
+{
+	/* The per-rank tables have a fixed size: check that the rank does not
+	 * index outside of them */
+	STARPU_ASSERT(src >= 0 && (size_t) src < sizeof(mpi_sends)/sizeof(mpi_sends[0]));
+
+	unsigned slot = mpi_sends_used[src]++;
+
+	if (mpi_sends_used[src] > mpi_sends_list_size[src])
+	{
+		unsigned new_size = (mpi_sends_list_size[src] > 0) ? 2*mpi_sends_list_size[src] : 1;
+
+		/* Keep the previous list reachable until realloc is known to
+		 * have succeeded */
+		struct mpi_transfer *new_list = realloc(mpi_sends[src], new_size*sizeof(struct mpi_transfer));
+		if (!new_list)
+		{
+			perror("realloc");
+			exit(-1);
+		}
+
+		mpi_sends[src] = new_list;
+		mpi_sends_list_size[src] = new_size;
+	}
+
+	mpi_sends[src][slot].matched = 0;
+	mpi_sends[src][slot].other_rank = dst;
+	mpi_sends[src][slot].mpi_tag = mpi_tag;
+	mpi_sends[src][slot].size = size;
+	mpi_sends[src][slot].date = date;
+}
+
+/* Record a receive posted by rank dst for a message coming from rank src in
+ * the list of receives of dst. The list starts with one slot and doubles each
+ * time it is full. dst must be a valid index in the mpi_recvs table. The
+ * program exits if the list cannot be enlarged. */
+void starpu_fxt_mpi_add_recv_transfer(int src, int dst, int mpi_tag, float date)
+{
+	/* The per-rank tables have a fixed size: check that the rank does not
+	 * index outside of them */
+	STARPU_ASSERT(dst >= 0 && (size_t) dst < sizeof(mpi_recvs)/sizeof(mpi_recvs[0]));
+
+	unsigned slot = mpi_recvs_used[dst]++;
+
+	if (mpi_recvs_used[dst] > mpi_recvs_list_size[dst])
+	{
+		unsigned new_size = (mpi_recvs_list_size[dst] > 0) ? 2*mpi_recvs_list_size[dst] : 1;
+
+		/* Keep the previous list reachable until realloc is known to
+		 * have succeeded */
+		struct mpi_transfer *new_list = realloc(mpi_recvs[dst], new_size*sizeof(struct mpi_transfer));
+		if (!new_list)
+		{
+			perror("realloc");
+			exit(-1);
+		}
+
+		mpi_recvs[dst] = new_list;
+		mpi_recvs_list_size[dst] = new_size;
+	}
+
+	mpi_recvs[dst][slot].matched = 0;
+	/* For a receive, the other rank is the sender */
+	mpi_recvs[dst][slot].other_rank = src;
+	mpi_recvs[dst][slot].mpi_tag = mpi_tag;
+	/* No size is given for receives: do not leave the field uninitialised */
+	mpi_recvs[dst][slot].size = 0;
+	mpi_recvs[dst][slot].date = date;
+}
+
+/* Look for a receive recorded on rank dst that carries mpi_tag and has not
+ * been paired with a send yet. The first such receive is marked as matched
+ * and returned; NULL is returned when there is none. */
+static
+struct mpi_transfer *try_to_match_send_transfer(int src STARPU_ATTRIBUTE_UNUSED, int dst, int mpi_tag)
+{
+	/* Skip the leading receives already known to be matched */
+	unsigned slot = mpi_recvs_matched[dst];
+	unsigned only_matched_so_far = 1;
+
+	for (; slot < mpi_recvs_used[dst]; slot++)
+	{
+		struct mpi_transfer *recv = &mpi_recvs[dst][slot];
+
+		if (recv->matched)
+		{
+			/* All previous transfers are already matched,
+			 * we need not consider them anymore */
+			if (only_matched_so_far)
+				mpi_recvs_matched[dst] = slot;
+			continue;
+		}
+
+		if (recv->mpi_tag == mpi_tag)
+		{
+			/* we found a match ! */
+			recv->matched = 1;
+			return recv;
+		}
+
+		only_matched_so_far = 0;
+	}
+
+	/* If we reached that point, we could not find a match */
+	return NULL;
+}
+
+static unsigned long mpi_com_id = 0;
+
+/* Go through all the sends recorded for rank src. For each send that can be
+ * paired with a receive on its destination, write a Paje link to
+ * out_paje_file (when it is non-NULL): event 18 at the date of the send and
+ * event 19 at the date of the receive. Sends that cannot be matched are
+ * reported on stderr. */
+static void display_all_transfers_from_trace(FILE *out_paje_file, int src)
+{
+	unsigned slot;
+	for (slot = 0; slot < mpi_sends_used[src]; slot++)
+	{
+		int dst = mpi_sends[src][slot].other_rank;
+		int mpi_tag = mpi_sends[src][slot].mpi_tag;
+		float start_date = mpi_sends[src][slot].date;
+		size_t size = mpi_sends[src][slot].size;
+
+		struct mpi_transfer *match;
+		match = try_to_match_send_transfer(src, dst, mpi_tag);
+
+		if (match)
+		{
+			float end_date = match->date;
+
+			unsigned long id = mpi_com_id++;
+			/* TODO replace 0 by a MPI program ? */
+			if (out_paje_file)
+			{
+				/* size is a size_t: convert it explicitly so that the
+				 * argument type matches the %lu conversion */
+				fprintf(out_paje_file, "18	%f	MPIL	MPIroot   %lu	mpi_%d_p	mpicom_%lu\n", start_date, (unsigned long) size, /* XXX */src, id);
+				fprintf(out_paje_file, "19	%f	MPIL	MPIroot	  %lu	mpi_%d_p	mpicom_%lu\n", end_date, (unsigned long) size, /* XXX */dst, id);
+			}
+		}
+		else
+		{
+			fprintf(stderr, "Warning, could not match MPI transfer from %d to %d (tag %x) starting at %f\n",
+												src, dst, mpi_tag, start_date);
+		}
+
+	}
+}
+
+/* Display the MPI transfers recorded for each of the options->ninputfiles
+ * input traces; ranks[i] gives the rank whose sends are displayed for the
+ * i-th input file. */
+void starpu_fxt_display_mpi_transfers(struct starpu_fxt_options *options, int *ranks, FILE *out_paje_file)
+{
+	unsigned i = 0;
+
+	/* display the MPI transfers if possible */
+	while (i < options->ninputfiles)
+	{
+		display_all_transfers_from_trace(out_paje_file, ranks[i]);
+		i++;
+	}
+}
+
+#endif // STARPU_USE_FXT

+ 157 - 0
src/debug/traces/starpu_paje.c

@@ -0,0 +1,157 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "starpu_fxt.h"
+#include <common/config.h>
+
+#ifdef STARPU_USE_FXT
+
+/* Write the header of a Paje trace to file. The header consists of:
+ *  - the definitions of the Paje event formats used in the trace (event
+ *    ids 1 to 15, 18 and 19);
+ *  - a list of lines using events 1 to 6 to declare the container, event,
+ *    state, variable and link types and the state values;
+ *  - the creation (event 7) of the MPIroot container. */
+void starpu_fxt_write_paje_header(FILE *file)
+{
+	fprintf(file, "%%EventDef	PajeDefineContainerType	1\n");
+	fprintf(file, "%%	Alias	string\n");
+	fprintf(file, "%%	ContainerType	string\n");
+	fprintf(file, "%%	Name	string\n");
+	fprintf(file, "%%EndEventDef\n");
+	fprintf(file, "%%EventDef	PajeDefineEventType	2\n");
+	fprintf(file, "%%	Alias	string\n");
+	fprintf(file, "%%	ContainerType	string\n");
+	fprintf(file, "%%	Name	string\n");
+	fprintf(file, "%%EndEventDef\n");
+	fprintf(file, "%%EventDef	PajeDefineStateType	3\n");
+	fprintf(file, "%%	Alias	string\n");
+	fprintf(file, "%%	ContainerType	string\n");
+	fprintf(file, "%%	Name	string\n");
+	fprintf(file, "%%EndEventDef\n");
+	fprintf(file, "%%EventDef	PajeDefineVariableType	4\n");
+	fprintf(file, "%%	Alias	string\n");
+	fprintf(file, "%%	ContainerType	string\n");
+	fprintf(file, "%%	Name	string\n");
+	fprintf(file, "%%EndEventDef\n");
+	fprintf(file, "%%EventDef	PajeDefineLinkType	5\n");
+	fprintf(file, "%%	Alias	string\n");
+	fprintf(file, "%%	ContainerType	string\n");
+	fprintf(file, "%%	SourceContainerType	string\n");
+	fprintf(file, "%%	DestContainerType	string\n");
+	fprintf(file, "%%	Name	string\n");
+	fprintf(file, "%%EndEventDef\n");
+	fprintf(file, "%%EventDef	PajeDefineEntityValue	6\n");
+	fprintf(file, "%%	Alias	string\n");
+	fprintf(file, "%%	EntityType	string\n");
+	fprintf(file, "%%	Name	string\n");
+	fprintf(file, "%%	Color	color\n");
+	fprintf(file, "%%EndEventDef\n");
+	fprintf(file, "%%EventDef	PajeCreateContainer	7\n");
+	fprintf(file, "%%	Time	date\n");
+	fprintf(file, "%%	Alias	string\n");
+	fprintf(file, "%%	Type	string\n");
+	fprintf(file, "%%	Container	string\n");
+	fprintf(file, "%%	Name	string\n");
+	fprintf(file, "%%EndEventDef\n");
+	fprintf(file, "%%EventDef	PajeDestroyContainer	8\n");
+	fprintf(file, "%%	Time	date\n");
+	fprintf(file, "%%	Name	string\n");
+	fprintf(file, "%%	Type	string\n");
+	fprintf(file, "%%EndEventDef\n");
+	fprintf(file, "%%EventDef	PajeNewEvent	9\n");
+	fprintf(file, "%%	Time	date\n");
+	fprintf(file, "%%	Type	string\n");
+	fprintf(file, "%%	Container	string\n");
+	fprintf(file, "%%	Value	string\n");
+	fprintf(file, "%%EndEventDef\n");
+	fprintf(file, "%%EventDef PajeSetState 10\n");
+	fprintf(file, "%%	Time	date\n");
+	fprintf(file, "%%	Type	string\n");
+	fprintf(file, "%%	Container	string\n");
+	fprintf(file, "%%	Value	string\n");
+	fprintf(file, "%%EndEventDef\n");
+	fprintf(file, "%%EventDef	PajePushState	11\n");
+	fprintf(file, "%%	Time	date\n");
+	fprintf(file, "%%	Type	string\n");
+	fprintf(file, "%%	Container	string\n");
+	fprintf(file, "%%	Value	string\n");
+	fprintf(file, "%%EndEventDef\n");
+	fprintf(file, "%%EventDef	PajePopState	12\n");
+	fprintf(file, "%%	Time	date\n");
+	fprintf(file, "%%	Type	string\n");
+	fprintf(file, "%%	Container	string\n");
+	fprintf(file, "%%EndEventDef\n");
+	fprintf(file, "%%EventDef	PajeSetVariable	13\n");
+	fprintf(file, "%%	Time	date\n");
+	fprintf(file, "%%	Type	string\n");
+	fprintf(file, "%%	Container	string\n");
+	fprintf(file, "%%	Value	double\n");
+	fprintf(file, "%%EndEventDef\n");
+	fprintf(file, "%%EventDef	PajeAddVariable	14\n");
+	fprintf(file, "%%	Time	date\n");
+	fprintf(file, "%%	Type	string\n");
+	fprintf(file, "%%	Container	string\n");
+	fprintf(file, "%%	Value	double\n");
+	fprintf(file, "%%EndEventDef\n");
+	fprintf(file, "%%EventDef	PajeSubVariable	15\n");
+	fprintf(file, "%%	Time	date\n");
+	fprintf(file, "%%	Type	string\n");
+	fprintf(file, "%%	Container	string\n");
+	fprintf(file, "%%	Value	double\n");
+	fprintf(file, "%%EndEventDef\n");
+	fprintf(file, "%%EventDef	PajeStartLink	18\n");
+	fprintf(file, "%%	Time	date\n");
+	fprintf(file, "%%	Type	string\n");
+	fprintf(file, "%%	Container	string\n");
+	fprintf(file, "%%	Value	string\n");
+	fprintf(file, "%%	SourceContainer	string\n");
+	fprintf(file, "%%	Key	string\n");
+	fprintf(file, "%%EndEventDef\n");
+	fprintf(file, "%%EventDef	PajeEndLink	19\n");
+	fprintf(file, "%%	Time	date\n");
+	fprintf(file, "%%	Type	string\n");
+	fprintf(file, "%%	Container	string\n");
+	fprintf(file, "%%	Value	string\n");
+	fprintf(file, "%%	DestContainer	string\n");
+	fprintf(file, "%%	Key	string\n");
+	fprintf(file, "%%EndEventDef\n");
+
+	/* Type hierarchy and state values, written with the events defined
+	 * above (first column is the event id) */
+	fprintf(file, "                                        \n \
+	1       MPIP      0       \"MPI Program\"                      	\n \
+	1       P      MPIP       \"Program\"                      	\n \
+	1       Mn      P       \"Memory Node\"                         \n \
+	1       T      Mn       \"Worker\"                               \n \
+	1       Sc       P       \"Scheduler State\"                        \n \
+	2       event   T       \"event type\"				\n \
+	3       S       T       \"Thread State\"                        \n \
+	3       MS       Mn       \"Memory Node State\"                        \n \
+	4       ntask    Sc       \"Number of tasks\"                        \n \
+	4       bw      Mn       \"Bandwidth\"                        \n \
+	6       I       S      Initializing       \"0.0 .7 1.0\"            \n \
+	6       D       S      Deinitializing       \"0.0 .1 .7\"            \n \
+	6       Fi       S      FetchingInput       \"1.0 .1 1.0\"            \n \
+	6       Po       S      PushingOutput       \"0.1 1.0 1.0\"            \n \
+	6       C       S       Callback       \".0 .3 .8\"            \n \
+	6       B       S       Blocked         \".9 .1 .0\"		\n \
+	6       Sl       S      Sleeping         \".9 .1 .0\"		\n \
+	6       P       S       Progressing         \".4 .1 .6\"		\n \
+	6       A       MS      Allocating         \".4 .1 .0\"		\n \
+	6       Ar       MS      AllocatingReuse       \".1 .1 .8\"		\n \
+	6       R       MS      Reclaiming         \".0 .1 .4\"		\n \
+	6       Co       MS     DriverCopy         \".3 .5 .1\"		\n \
+	6       No       MS     Nothing         \".0 .0 .0\"		\n \
+	5       MPIL     MPIP	P	P      MPIL\n \
+	5       L       P	Mn	Mn      L\n");
+
+	/* Create the root container, of type MPIP */
+	fprintf(file, "7      0.0 MPIroot      MPIP      0       root\n");
+}
+
+#endif

+ 11 - 52
src/drivers/cpu/driver_cpu.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -18,8 +18,6 @@
 
 #include <math.h>
 #include <starpu.h>
-#include <starpu_profiling.h>
-#include <profiling/profiling.h>
 #include <drivers/driver_common/driver_common.h>
 #include <common/utils.h>
 #include <core/debug.h>
@@ -40,9 +38,6 @@ static int execute_job_on_cpu(starpu_job_t j, struct starpu_worker_s *cpu_args,
 	STARPU_ASSERT(cl);
 	STARPU_ASSERT(cl->cpu_func);
 
-	if (cl->model && cl->model->benchmarking)
-		calibrate_model = 1;
-
 	if (rank == 0)
 	{
 		ret = _starpu_fetch_task_input(task, 0);
@@ -50,7 +45,6 @@ static int execute_job_on_cpu(starpu_job_t j, struct starpu_worker_s *cpu_args,
 		{
 			/* there was not enough memory so the codelet cannot be executed right now ... */
 			/* push the codelet back and try another one ... */
-			STARPU_ASSERT(ret == 0);
 			return -EAGAIN;
 		}
 	}
@@ -58,52 +52,27 @@ static int execute_job_on_cpu(starpu_job_t j, struct starpu_worker_s *cpu_args,
 	if (is_parallel_task)
 		PTHREAD_BARRIER_WAIT(&j->before_work_barrier);
 
-	STARPU_TRACE_START_CODELET_BODY(j);
-
-	struct starpu_task_profiling_info *profiling_info;
-	int profiling = starpu_profiling_status_get();
-
-	if (rank == 0)
-	{
-		profiling_info = task->profiling_info;
-	
-		if ((profiling && profiling_info) || calibrate_model)
-		{
-			starpu_clock_gettime(&codelet_start);
-			_starpu_worker_register_executing_start_date(workerid, &codelet_start);
-		}
+	_starpu_driver_start_job(cpu_args, j, &codelet_start, rank);
 
-	}
-	
-	cpu_args->status = STATUS_EXECUTING;
-	task->status = STARPU_TASK_RUNNING;	
-	
 	/* In case this is a Fork-join parallel task, the worker does not
 	 * execute the kernel at all. */
 	if ((rank == 0) || (cl->type != STARPU_FORKJOIN))
 	{
 		cl_func func = cl->cpu_func;
-		func(task->interface, task->cl_arg);
+		STARPU_ASSERT(func);
+		func(task->interfaces, task->cl_arg);
 	}
-	
-	if (is_parallel_task)
-		PTHREAD_BARRIER_WAIT(&j->after_work_barrier);
 
-	STARPU_TRACE_END_CODELET_BODY(j);
+	_starpu_driver_end_job(cpu_args, j, &codelet_end, rank);
 
-	cpu_args->status = STATUS_UNKNOWN;
+	if (is_parallel_task)
+		PTHREAD_BARRIER_WAIT(&j->after_work_barrier);
 
 	if (rank == 0)
 	{
-		cl->per_worker_stats[workerid]++;
-		
-		if ((profiling && profiling_info) || calibrate_model)
-			starpu_clock_gettime(&codelet_end);
-
-		_starpu_push_task_output(task, 0);
-
-		_starpu_driver_update_job_feedback(j, cpu_args, profiling_info,
+		_starpu_driver_update_job_feedback(j, cpu_args,
 				perf_arch, &codelet_start, &codelet_end);
+		_starpu_push_task_output(task, 0);
 	}
 
 	return 0;
@@ -163,17 +132,6 @@ void *_starpu_cpu_worker(void *arg)
 		{
 			PTHREAD_MUTEX_LOCK(sched_mutex);
 			if (_starpu_worker_can_block(memnode)){
-/* 			struct starpu_sched_ctx **sched_ctx = cpu_arg->sched_ctx; */
-/* 			int i = 0; */
-/* 			int sleep = 0; */
-/* 			for(i = 0; i < cpu_arg->nctxs; i++){ */
-/* 			  if(sched_ctx[i]->sched_ctx_id  == 2 ){ */
-/* 			    sleep = 1; */
-/* 			    break; */
-/* 			  } */
-/* 			} */
-
-/* 			if(sleep) */
 				_starpu_block_worker(workerid, sched_cond, sched_mutex);
 			}
 
@@ -228,7 +186,7 @@ void *_starpu_cpu_worker(void *arg)
 
 		struct starpu_sched_ctx *local_sched_ctx = _starpu_get_sched_ctx(j->task->sched_ctx);
 
-                res = execute_job_on_cpu(j, cpu_arg, is_parallel_task, rank, perf_arch);
+        res = execute_job_on_cpu(j, cpu_arg, is_parallel_task, rank, perf_arch);
 
 		_starpu_set_current_task(NULL);
 
@@ -259,4 +217,5 @@ void *_starpu_cpu_worker(void *arg)
 	STARPU_TRACE_WORKER_DEINIT_END(STARPU_FUT_CPU_KEY);
 
 	pthread_exit(NULL);
+	return NULL;
 }

+ 65 - 38
src/drivers/cuda/driver_cuda.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -18,7 +18,6 @@
 
 #include <starpu.h>
 #include <starpu_cuda.h>
-#include <starpu_profiling.h>
 #include <common/utils.h>
 #include <common/config.h>
 #include <core/debug.h>
@@ -26,7 +25,6 @@
 #include "driver_cuda.h"
 #include <core/sched_policy.h>
 #include <core/sched_ctx.h>
-#include <profiling/profiling.h>
 
 /* the number of CUDA devices */
 static int ncudagpus;
@@ -109,9 +107,7 @@ static void init_context(int devid)
 		STARPU_CUDA_REPORT_ERROR(cures);
 
 	/* force CUDA to initialize the context for real */
-	cures = cudaFree(0);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
+	cudaFree(0);
 
 	limit_gpu_mem_if_needed(devid);
 
@@ -147,7 +143,11 @@ unsigned _starpu_get_cuda_device_count(void)
 	cures = cudaGetDeviceCount(&cnt);
 	if (STARPU_UNLIKELY(cures))
 		 return 0;
-	
+
+	if (cnt > STARPU_MAXCUDADEVS) {
+		fprintf(stderr, "# Warning: %d CUDA devices available. Only %d enabled. Use configure option --enable-maxcudadev=xxx to update the maximum value of supported CUDA devices.\n", cnt, STARPU_MAXCUDADEVS);
+		cnt = STARPU_MAXCUDADEVS;
+	}
 	return (unsigned)cnt;
 }
 
@@ -161,6 +161,7 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 {
 	int ret;
 	uint32_t mask = 0;
+	cudaError_t cures;
 
 	STARPU_ASSERT(j);
 	struct starpu_task *task = j->task;
@@ -178,9 +179,8 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 		calibrate_model = 1;
 
 	ret = _starpu_fetch_task_input(task, mask);
-
 	if (ret != 0) {
-		/* there was not enough memory, so th input of
+		/* there was not enough memory, so the input of
 		 * the codelet cannot be fetched ... put the 
 		 * codelet back, and try it later */
 		return -EAGAIN;
@@ -188,44 +188,28 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 
 	if (calibrate_model)
 	{
-		cudaError_t cures = cudaStreamSynchronize(starpu_cuda_get_local_transfer_stream());
+		cures = cudaStreamSynchronize(starpu_cuda_get_local_transfer_stream());
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
 	}
 
-	STARPU_TRACE_START_CODELET_BODY(j);
+	_starpu_driver_start_job(args, j, &codelet_start, 0);
 
-	struct starpu_task_profiling_info *profiling_info;
-	int profiling = starpu_profiling_status_get();
-	profiling_info = task->profiling_info;
-
-	if ((profiling && profiling_info) || calibrate_model)
-	{
-		starpu_clock_gettime(&codelet_start);
-		_starpu_worker_register_executing_start_date(workerid, &codelet_start);
-	}
-
-	args->status = STATUS_EXECUTING;
-	task->status = STARPU_TASK_RUNNING;	
+#ifdef HAVE_CUDA_MEMCPY_PEER
+	/* We make sure we do manipulate the proper device */
+	cures = cudaSetDevice(args->devid);
+#endif
 
 	cl_func func = cl->cuda_func;
 	STARPU_ASSERT(func);
-	func(task->interface, task->cl_arg);
-
-	cl->per_worker_stats[workerid]++;
+	func(task->interfaces, task->cl_arg);
 
+	_starpu_driver_end_job(args, j, &codelet_end, 0);
 
-	if ((profiling && profiling_info) || calibrate_model)
-		starpu_clock_gettime(&codelet_end);
-
-	STARPU_TRACE_END_CODELET_BODY(j);	
-	args->status = STATUS_UNKNOWN;
+	_starpu_driver_update_job_feedback(j, args, args->perf_arch, &codelet_start, &codelet_end);
 
 	_starpu_push_task_output(task, mask);
 
-	_starpu_driver_update_job_feedback(j, args, profiling_info, args->perf_arch,
-			&codelet_start, &codelet_end);
-
 	return 0;
 }
 
@@ -260,8 +244,11 @@ void *_starpu_cuda_worker(void *arg)
 	struct cudaDeviceProp prop;
 	cudaGetDeviceProperties(&prop, devid);
 	strncpy(devname, prop.name, 128);
-	snprintf(args->name, 32, "CUDA %d (%s)", args->devid, devname);
-
+#if CUDA_VERSION >= 3020
+	snprintf(args->name, 48, "CUDA %d (%s %02x:%02x.0)", args->devid, devname, prop.pciBusID, prop.pciDeviceID);
+#else
+	snprintf(args->name, 48, "CUDA %d (%s)", args->devid, devname);
+#endif
 	_STARPU_DEBUG("cuda (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, args->bindid);
 
 	STARPU_TRACE_WORKER_INIT_END
@@ -277,7 +264,7 @@ void *_starpu_cuda_worker(void *arg)
 	int res;
 
 	pthread_cond_t *sched_cond = args->sched_cond;
-        pthread_mutex_t *sched_mutex = args->sched_mutex;
+    pthread_mutex_t *sched_mutex = args->sched_mutex;
 
 	while (_starpu_machine_is_running())
 	{
@@ -287,7 +274,7 @@ void *_starpu_cuda_worker(void *arg)
 
 		task = _starpu_pop_task(args);
 
-                if (!task) 
+        if (!task) 
 		{
 			PTHREAD_MUTEX_LOCK(sched_mutex);
 			if (_starpu_worker_can_block(memnode))
@@ -369,3 +356,43 @@ void *_starpu_cuda_worker(void *arg)
 	return NULL;
 
 }
+
+/* Report a CUBLAS error: print the name of the function that failed (func)
+ * and a description of the status it returned on stderr, then trip an
+ * assertion. */
+void starpu_cublas_report_error(const char *func, cublasStatus status)
+{
+	/* Only ever points to string literals, hence const */
+	const char *errormsg;
+	switch (status) {
+		case CUBLAS_STATUS_SUCCESS:
+			errormsg = "success";
+			break;
+		case CUBLAS_STATUS_NOT_INITIALIZED:
+			errormsg = "not initialized";
+			break;
+		case CUBLAS_STATUS_ALLOC_FAILED:
+			errormsg = "alloc failed";
+			break;
+		case CUBLAS_STATUS_INVALID_VALUE:
+			errormsg = "invalid value";
+			break;
+		case CUBLAS_STATUS_ARCH_MISMATCH:
+			errormsg = "arch mismatch";
+			break;
+		case CUBLAS_STATUS_MAPPING_ERROR:
+			errormsg = "mapping error";
+			break;
+		case CUBLAS_STATUS_EXECUTION_FAILED:
+			errormsg = "execution failed";
+			break;
+		case CUBLAS_STATUS_INTERNAL_ERROR:
+			errormsg = "internal error";
+			break;
+		default:
+			errormsg = "unknown error";
+			break;
+	}
+	/* Diagnostics go to stderr: stdout may be buffered, in which case the
+	 * message could be lost when the assertion below aborts the program */
+	fprintf(stderr, "oops in %s ... %s \n", func, errormsg);
+	assert(0);
+}
+
+/* Report a CUDA error: print the name of the function that failed (func) and
+ * the string given by cudaGetErrorString for status on stderr, then trip an
+ * assertion. */
+void starpu_cuda_report_error(const char *func, cudaError_t status)
+{
+	const char *errormsg = cudaGetErrorString(status);
+	/* Diagnostics go to stderr: stdout may be buffered, in which case the
+	 * message could be lost when the assertion below aborts the program */
+	fprintf(stderr, "oops in %s ... %s \n", func, errormsg);
+	assert(0);
+}

+ 65 - 3
src/drivers/driver_common/driver_common.c

@@ -22,12 +22,73 @@
 #include <common/utils.h>
 #include <core/debug.h>
 #include <drivers/driver_common/driver_common.h>
+#include <starpu_top.h>
 
+/* Bookkeeping performed by a worker right before it runs the codelet of job j.
+ *
+ * args is the worker, j the job, rank the rank passed by the driver (the rank
+ * 0 caller does the per-task accounting). codelet_start receives the start
+ * date when rank is 0 and a start date is needed (see below).
+ *
+ * NOTE(review): when rank is not 0, codelet_start is not written here but is
+ * still passed to starputop_task_started if starpu_top is active -- confirm
+ * that this is intended or that callers initialise it. */
+void _starpu_driver_start_job(struct starpu_worker_s *args, starpu_job_t j, struct timespec *codelet_start, int rank)
+{
+	struct starpu_task *task = j->task;
+	struct starpu_codelet_t *cl = task->cl;
+	struct starpu_task_profiling_info *profiling_info;
+	int profiling = starpu_profiling_status_get();
+	int starpu_top=starpu_top_status_get();
+	int workerid = args->workerid;
+	unsigned calibrate_model = 0;
+
+	/* The codelet has a performance model flagged as benchmarking */
+	if (cl->model && cl->model->benchmarking)
+		calibrate_model = 1;
+
+	args->status = STATUS_EXECUTING;
+	task->status = STARPU_TASK_RUNNING;	
+
+	if (rank == 0) {
+		/* Count one more execution of this codelet on this worker */
+		cl->per_worker_stats[workerid]++;
+
+		profiling_info = task->profiling_info;
+	
+		/* Only read the clock when something will use the start date */
+		if ((profiling && profiling_info) || calibrate_model || starpu_top)
+		{
+			starpu_clock_gettime(codelet_start);
+			_starpu_worker_register_executing_start_date(workerid, codelet_start);
+		}
+	}
+
+	if (starpu_top)
+		starputop_task_started(task,workerid,codelet_start);
+
+	STARPU_TRACE_START_CODELET_BODY(j);
+}
+
+/* Bookkeeping performed by a worker right after it has run the codelet of
+ * job j.
+ *
+ * args is the worker, j the job, rank the rank passed by the driver.
+ * codelet_end receives the end date when rank is 0 and an end date is needed
+ * (see below). The worker status is set to STATUS_UNKNOWN on return.
+ *
+ * NOTE(review): when rank is not 0, codelet_end is not written here but is
+ * still passed to starputop_task_ended if starpu_top is active -- confirm
+ * that this is intended or that callers initialise it. */
+void _starpu_driver_end_job(struct starpu_worker_s *args, starpu_job_t j, struct timespec *codelet_end, int rank)
+{
+	struct starpu_task *task = j->task;
+	struct starpu_codelet_t *cl = task->cl;
+	struct starpu_task_profiling_info *profiling_info = task->profiling_info;
+	int profiling = starpu_profiling_status_get();
+	int starpu_top=starpu_top_status_get();
+	int workerid = args->workerid;
+	unsigned calibrate_model = 0;
+	/* Only passed to the trace macro below, hence marked as possibly unused */
+	enum starpu_perf_archtype archtype STARPU_ATTRIBUTE_UNUSED = args->perf_arch;
+
+	STARPU_TRACE_END_CODELET_BODY(j, archtype);
+
+	/* The codelet has a performance model flagged as benchmarking */
+	if (cl->model && cl->model->benchmarking)
+		calibrate_model = 1;
+
+	if (rank == 0) {
+		/* Only read the clock when something will use the end date */
+		if ((profiling && profiling_info) || calibrate_model || starpu_top)
+			starpu_clock_gettime(codelet_end);
+	}
+
+	if (starpu_top)
+	  starputop_task_ended(task,workerid,codelet_end);
+
+	args->status = STATUS_UNKNOWN;
+}
 void _starpu_driver_update_job_feedback(starpu_job_t j, struct starpu_worker_s *worker_args,
-					struct starpu_task_profiling_info *profiling_info,
 					enum starpu_perf_archtype perf_arch,
 					struct timespec *codelet_start, struct timespec *codelet_end)
 {
+	struct starpu_task_profiling_info *profiling_info = j->task->profiling_info;
 	struct timespec measured_ts;
 	double measured;
 	int workerid = worker_args->workerid;
@@ -36,7 +97,7 @@ void _starpu_driver_update_job_feedback(starpu_job_t j, struct starpu_worker_s *
 	int profiling = starpu_profiling_status_get();
 	int updated = 0;
 
-	if (cl->model && cl->model->benchmarking)
+	if (cl->model && _starpu_get_calibrate_flag())
 		calibrate_model = 1;
 
 	if (profiling_info || calibrate_model)
@@ -61,8 +122,9 @@ void _starpu_driver_update_job_feedback(starpu_job_t j, struct starpu_worker_s *
 		if (calibrate_model)
 			_starpu_update_perfmodel_history(j, j->task->cl->model,  perf_arch, worker_args->devid, measured);
 	}
+
 	if (!updated)
-		_starpu_worker_update_profiling_info_executing(workerid, 0, 1, 0, 0, 0);
+		_starpu_worker_update_profiling_info_executing(workerid, NULL, 1, 0, 0, 0);
 
 	if (profiling_info && profiling_info->power_consumed && cl->power_model && cl->power_model->benchmarking) {
 		_starpu_update_perfmodel_history(j, j->task->cl->power_model,  perf_arch, worker_args->devid, profiling_info->power_consumed);

+ 4 - 3
src/drivers/driver_common/driver_common.h

@@ -20,13 +20,14 @@
 
 #include <sys/time.h>
 #include <starpu.h>
-#include <starpu_profiling.h>
 #include <core/jobs.h>
-#include <profiling/profiling.h>
 #include <common/utils.h>
 
+void _starpu_driver_start_job(struct starpu_worker_s *args, starpu_job_t j,
+		struct timespec *codelet_start, int rank);
+void _starpu_driver_end_job(struct starpu_worker_s *args, starpu_job_t j,
+		struct timespec *codelet_end, int rank);
 void _starpu_driver_update_job_feedback(starpu_job_t j, struct starpu_worker_s *worker_args,
-		struct starpu_task_profiling_info *profiling_info,
 		enum starpu_perf_archtype perf_arch,
 		struct timespec *codelet_start, struct timespec *codelet_end);
 

+ 5 - 3
src/drivers/gordon/driver_gordon.c

@@ -211,7 +211,7 @@ static void gordon_callback_list_func(void *arg)
 		}
 
 		_starpu_push_task_output(j->task, 0);
-		_starpu_handle_job_termination(j, 0, worker->sched_ctx);
+		_starpu_handle_job_termination(j, 0);
 		//starpu_wake_all_blocked_workers();
 
 		task_cnt++;
@@ -337,7 +337,9 @@ void *gordon_worker_inject(struct starpu_worker_set_s *arg)
 		else {
 #ifndef NOCHAIN
 			int ret = 0;
+#ifdef STARPU_DEVEL
 #warning we should look into the local job list here !
+#endif
 
 			struct starpu_job_list_s *list = _starpu_pop_every_task();
 			/* XXX 0 is hardcoded */
@@ -390,7 +392,7 @@ void *gordon_worker_inject(struct starpu_worker_set_s *arg)
 #else
 			/* gordon should accept a little more work */
 			starpu_job_t j;
-			j =  _starpu_pop_task(arg->current_sched_ctx);
+			j =  _starpu_pop_task(arg);
 	//		_STARPU_DEBUG("pop task %p\n", j);
 			if (j) {
 				if (STARPU_GORDON_MAY_PERFORM(j)) {
@@ -399,7 +401,7 @@ void *gordon_worker_inject(struct starpu_worker_set_s *arg)
 					inject_task(j, &arg->workers[0]);
 				}
 				else {
-				  _starpu_push_task(j, 0, arg->current_sched_ctx);
+					_starpu_push_task(j, 0);
 				}
 			}
 #endif

+ 62 - 63
src/drivers/opencl/driver_opencl.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -18,7 +18,6 @@
 
 #include <math.h>
 #include <starpu.h>
-#include <starpu_profiling.h>
 #include <common/config.h>
 #include <common/utils.h>
 #include <core/debug.h>
@@ -27,13 +26,13 @@
 #include "driver_opencl.h"
 #include "driver_opencl_utils.h"
 #include <common/utils.h>
-#include <profiling/profiling.h>
 
 static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;
 
 static cl_context contexts[STARPU_MAXOPENCLDEVS];
 static cl_device_id devices[STARPU_MAXOPENCLDEVS];
 static cl_command_queue queues[STARPU_MAXOPENCLDEVS];
+static cl_command_queue transfer_queues[STARPU_MAXOPENCLDEVS];
 static cl_uint nb_devices = -1;
 static int init_done = 0;
 extern char *_starpu_opencl_program_dir;
@@ -122,9 +121,17 @@ cl_int _starpu_opencl_init_context(int devid)
         contexts[devid] = clCreateContext(NULL, 1, &devices[devid], NULL, NULL, &err);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
-        // Create queue for the given device
+        // Create execution queue for the given device
         queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], 0, &err);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+        // Create transfer queue for the given device
+        cl_command_queue_properties props;
+        clGetDeviceInfo(devices[devid], CL_DEVICE_QUEUE_PROPERTIES, sizeof(props), &props, NULL);
+        props &= CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
+        transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
 	PTHREAD_MUTEX_UNLOCK(&big_lock);
 
 	limit_gpu_mem_if_needed(devid);
@@ -148,6 +155,9 @@ cl_int _starpu_opencl_deinit_context(int devid)
         err = clReleaseCommandQueue(queues[devid]);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
+        err = clReleaseCommandQueue(transfer_queues[devid]);
+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
         contexts[devid] = NULL;
 
 	PTHREAD_MUTEX_UNLOCK(&big_lock);
@@ -176,7 +186,7 @@ cl_int _starpu_opencl_copy_ram_to_opencl_async_sync(void *ptr, cl_mem buffer, si
         cl_bool blocking;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
-        err = clEnqueueWriteBuffer(queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
+        err = clEnqueueWriteBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
         if (STARPU_LIKELY(err == CL_SUCCESS)) {
                 *ret = (event == NULL) ? 0 : -EAGAIN;
                 return CL_SUCCESS;
@@ -184,7 +194,7 @@ cl_int _starpu_opencl_copy_ram_to_opencl_async_sync(void *ptr, cl_mem buffer, si
         else {
                 if (event != NULL) {
                         /* The asynchronous copy has failed, try to copy synchronously */
-                        err = clEnqueueWriteBuffer(queues[worker->devid], buffer, CL_TRUE, offset, size, ptr, 0, NULL, NULL);
+                        err = clEnqueueWriteBuffer(transfer_queues[worker->devid], buffer, CL_TRUE, offset, size, ptr, 0, NULL, NULL);
                 }
                 if (STARPU_LIKELY(err == CL_SUCCESS)) {
                         *ret = 0;
@@ -204,7 +214,7 @@ cl_int _starpu_opencl_copy_ram_to_opencl(void *ptr, cl_mem buffer, size_t size,
         cl_bool blocking;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
-        err = clEnqueueWriteBuffer(queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
+        err = clEnqueueWriteBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
         return CL_SUCCESS;
@@ -217,7 +227,7 @@ cl_int _starpu_opencl_copy_opencl_to_ram_async_sync(cl_mem buffer, void *ptr, si
         cl_bool blocking;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
-        err = clEnqueueReadBuffer(queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
+        err = clEnqueueReadBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
         if (STARPU_LIKELY(err == CL_SUCCESS)) {
                 *ret = (event == NULL) ? 0 : -EAGAIN;
                 return CL_SUCCESS;
@@ -225,7 +235,7 @@ cl_int _starpu_opencl_copy_opencl_to_ram_async_sync(cl_mem buffer, void *ptr, si
         else {
                 if (event != NULL)
                         /* The asynchronous copy has failed, try to copy synchronously */
-                        err = clEnqueueReadBuffer(queues[worker->devid], buffer, CL_TRUE, offset, size, ptr, 0, NULL, NULL);
+                        err = clEnqueueReadBuffer(transfer_queues[worker->devid], buffer, CL_TRUE, offset, size, ptr, 0, NULL, NULL);
                 if (STARPU_LIKELY(err == CL_SUCCESS)) {
                         *ret = 0;
                         return CL_SUCCESS;
@@ -246,7 +256,7 @@ cl_int _starpu_opencl_copy_opencl_to_ram(cl_mem buffer, void *ptr, size_t size,
         cl_bool blocking;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
-        err = clEnqueueReadBuffer(queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
+        err = clEnqueueReadBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
         return CL_SUCCESS;
@@ -262,7 +272,7 @@ cl_int _starpu_opencl_copy_rect_opencl_to_ram(cl_mem buffer, void *ptr, const si
         cl_bool blocking;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
-        err = clEnqueueReadBufferRect(queues[worker->devid], buffer, blocking, buffer_origin, host_origin, region, buffer_row_pitch,
+        err = clEnqueueReadBufferRect(transfer_queues[worker->devid], buffer, blocking, buffer_origin, host_origin, region, buffer_row_pitch,
                                       buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, 0, NULL, event);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
@@ -278,7 +288,7 @@ cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, cl_mem buffer, const si
         cl_bool blocking;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
-        err = clEnqueueWriteBufferRect(queues[worker->devid], buffer, blocking, buffer_origin, host_origin, region, buffer_row_pitch,
+        err = clEnqueueWriteBufferRect(transfer_queues[worker->devid], buffer, blocking, buffer_origin, host_origin, region, buffer_row_pitch,
                                        buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, 0, NULL, event);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
@@ -300,7 +310,7 @@ void _starpu_opencl_init(void)
 
                 // Get Platforms
                 err = clGetPlatformIDs(STARPU_OPENCL_PLATFORM_MAX, platform_id, &nb_platforms);
-                if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+                if (err != CL_SUCCESS) nb_platforms=0;
                 _STARPU_DEBUG("Platforms detected: %d\n", nb_platforms);
 
                 // Get devices
@@ -308,28 +318,40 @@ void _starpu_opencl_init(void)
                 {
                         for (i=0; i<nb_platforms; i++) {
                                 cl_uint num;
-
+				int platform_valid = 1;
+				char name[1024], vendor[1024];
+
+				err = clGetPlatformInfo(platform_id[i], CL_PLATFORM_NAME, 1024, name, NULL);
+				if (err != CL_SUCCESS) {
+					STARPU_OPENCL_REPORT_ERROR_WITH_MSG("clGetPlatformInfo NAME", err);
+					platform_valid = 0;
+				}
+				else {
+					err = clGetPlatformInfo(platform_id[i], CL_PLATFORM_VENDOR, 1024, vendor, NULL);
+					if (err != CL_SUCCESS) {
+						STARPU_OPENCL_REPORT_ERROR_WITH_MSG("clGetPlatformInfo VENDOR", err);
+						platform_valid = 0;
+					}
+				}
 #ifdef STARPU_VERBOSE
-                                {
-                                        char name[1024], vendor[1024];
-                                        err = clGetPlatformInfo(platform_id[i], CL_PLATFORM_NAME, 1024, name, NULL);
-                                        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-                                        err = clGetPlatformInfo(platform_id[i], CL_PLATFORM_VENDOR, 1024, vendor, NULL);
-                                        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-                                        _STARPU_DEBUG("Platform: %s - %s\n", name, vendor);
-                                }
+				if (platform_valid)
+					_STARPU_DEBUG("Platform: %s - %s\n", name, vendor);
+				else
+					_STARPU_DEBUG("Platform invalid\n");
 #endif
-                                err = clGetDeviceIDs(platform_id[i], device_type, STARPU_MAXOPENCLDEVS-nb_devices, &devices[nb_devices], &num);
-                                if (err == CL_DEVICE_NOT_FOUND) {
-                                        _STARPU_DEBUG("  No devices detected on this platform\n");
-                                }
-                                else {
-                                        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-                                        _STARPU_DEBUG("  %d devices detected\n", num);
-                                        nb_devices += num;
-                                }
-                        }
-                }
+				if (platform_valid) {
+					err = clGetDeviceIDs(platform_id[i], device_type, STARPU_MAXOPENCLDEVS-nb_devices, &devices[nb_devices], &num);
+					if (err == CL_DEVICE_NOT_FOUND) {
+						_STARPU_DEBUG("  No devices detected on this platform\n");
+					}
+					else {
+						if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+						_STARPU_DEBUG("  %d devices detected\n", num);
+						nb_devices += num;
+					}
+				}
+			}
+		}
 
                 // Get location of OpenCl kernel source files
                 _starpu_opencl_program_dir = getenv("STARPU_OPENCL_PROGRAM_DIR");
@@ -338,6 +360,7 @@ void _starpu_opencl_init(void)
                 for(i=0 ; i<nb_devices ; i++) {
                         contexts[i] = NULL;
                         queues[i] = NULL;
+                        transfer_queues[i] = NULL;
                 }
 
                 init_done=1;
@@ -404,7 +427,7 @@ void *_starpu_opencl_worker(void *arg)
 
 		task = _starpu_pop_task(args);
 		
-                if (task == NULL) 
+        if (task == NULL) 
 		{
 			if (_starpu_worker_can_block(memnode))
 				_starpu_block_worker(workerid, args->sched_cond, args->sched_mutex);
@@ -491,16 +514,11 @@ static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *ar
 
 	struct timespec codelet_start, codelet_end;
 
-	unsigned calibrate_model = 0;
 	int workerid = args->workerid;
-
 	STARPU_ASSERT(task);
 	struct starpu_codelet_t *cl = task->cl;
 	STARPU_ASSERT(cl);
 
-	if (cl->model && cl->model->benchmarking)
-		calibrate_model = 1;
-
 	ret = _starpu_fetch_task_input(task, mask);
 	if (ret != 0) {
 		/* there was not enough memory, so the input of
@@ -509,37 +527,18 @@ static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *ar
 		return -EAGAIN;
 	}
 
-	STARPU_TRACE_START_CODELET_BODY(j);
-
-	struct starpu_task_profiling_info *profiling_info;
-	int profiling = starpu_profiling_status_get();
-	profiling_info = task->profiling_info;
-
-	if ((profiling && profiling_info) || calibrate_model)
-	{
-		starpu_clock_gettime(&codelet_start);
-		_starpu_worker_register_executing_start_date(workerid, &codelet_start);
-	}
-
-	args->status = STATUS_EXECUTING;
-	task->status = STARPU_TASK_RUNNING;	
+	_starpu_driver_start_job(args, j, &codelet_start, 0);
 
 	cl_func func = cl->opencl_func;
 	STARPU_ASSERT(func);
-	func(task->interface, task->cl_arg);
+	func(task->interfaces, task->cl_arg);
 
-	cl->per_worker_stats[workerid]++;
+	_starpu_driver_end_job(args, j, &codelet_end, 0);
 
-	if ((profiling && profiling_info) || calibrate_model)
-		starpu_clock_gettime(&codelet_end);
-
-	STARPU_TRACE_END_CODELET_BODY(j);
-	args->status = STATUS_UNKNOWN;
+	_starpu_driver_update_job_feedback(j, args, args->perf_arch,
+							&codelet_start, &codelet_end);
 
 	_starpu_push_task_output(task, mask);
 
-	_starpu_driver_update_job_feedback(j, args, profiling_info, args->perf_arch,
-							&codelet_start, &codelet_end);
-
 	return EXIT_SUCCESS;
 }

+ 175 - 6
src/drivers/opencl/driver_opencl_utils.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -29,6 +29,10 @@
 #include "driver_opencl_utils.h"
 #include "driver_opencl.h"
 
+#ifdef HAVE_CL_CL_EXT_H
+#include <CL/cl_ext.h>
+#endif
+
 char *_starpu_opencl_program_dir;
 
 #define _STARPU_STRINGIFY_(x) #x
@@ -121,7 +125,8 @@ char *_starpu_opencl_load_program_source(const char *filename)
         return source;
 }
 
-int starpu_opencl_load_opencl_from_string(const char *opencl_program_source, struct starpu_opencl_program *opencl_programs)
+int starpu_opencl_load_opencl_from_string(const char *opencl_program_source, struct starpu_opencl_program *opencl_programs,
+					  const char* build_options)
 {
         unsigned int dev;
         unsigned int nb_devices;
@@ -150,7 +155,7 @@ int starpu_opencl_load_opencl_from_string(const char *opencl_program_source, str
                 if (!program || err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
                 // Build the program executable
-                err = clBuildProgram(program, 1, &device, "-Werror -cl-mad-enable", NULL, NULL);
+                err = clBuildProgram(program, 1, &device, build_options, NULL, NULL);
                 if (err != CL_SUCCESS) {
                         size_t len;
                         static char buffer[4096];
@@ -168,10 +173,16 @@ int starpu_opencl_load_opencl_from_string(const char *opencl_program_source, str
         return EXIT_SUCCESS;
 }
 
-int starpu_opencl_load_opencl_from_file(const char *source_file_name, struct starpu_opencl_program *opencl_programs)
+int starpu_opencl_load_opencl_from_file(const char *source_file_name, struct starpu_opencl_program *opencl_programs,
+					const char* build_options)
 {
+	int nb_devices;
         char located_file_name[1024];
 
+	// Do not try to load and compile the file if there are no devices
+	nb_devices = _starpu_opencl_get_device_count();
+	if (nb_devices == 0) return EXIT_SUCCESS;
+
         // Locate source file
         _starpu_opencl_locate_file(source_file_name, located_file_name);
         _STARPU_DEBUG("Source file name : <%s>\n", located_file_name);
@@ -181,7 +192,7 @@ int starpu_opencl_load_opencl_from_file(const char *source_file_name, struct sta
         if(!opencl_program_source)
                 _STARPU_ERROR("Failed to load compute program from file <%s>!\n", located_file_name);
 
-        return starpu_opencl_load_opencl_from_string(opencl_program_source, opencl_programs);
+        return starpu_opencl_load_opencl_from_string(opencl_program_source, opencl_programs, build_options);
 }
 
 cl_int starpu_opencl_unload_opencl(struct starpu_opencl_program *opencl_programs)
@@ -198,7 +209,7 @@ cl_int starpu_opencl_unload_opencl(struct starpu_opencl_program *opencl_programs
         return CL_SUCCESS;
 }
 
-int starpu_opencl_collect_stats(cl_event event __attribute__((unused)))
+int starpu_opencl_collect_stats(cl_event event STARPU_ATTRIBUTE_UNUSED)
 {
 #if defined(CL_PROFILING_CLOCK_CYCLE_COUNT)||defined(CL_PROFILING_STALL_CYCLE_COUNT)||defined(CL_PROFILING_POWER_CONSUMED)
 	struct starpu_task *task = starpu_get_current_task();
@@ -243,3 +254,161 @@ int starpu_opencl_collect_stats(cl_event event __attribute__((unused)))
 
 	return 0;
 }
+
+/*
+ * Translate an OpenCL status code into a short human-readable description.
+ * Codes that are not listed yield "unknown error".
+ */
+static const char *opencl_error_string(cl_int status)
+{
+	switch (status) {
+	case CL_SUCCESS:
+		return "success";
+	case CL_DEVICE_NOT_FOUND:
+		return "Device not found";
+	case CL_DEVICE_NOT_AVAILABLE:
+		return "Device not available";
+	case CL_COMPILER_NOT_AVAILABLE:
+		return "Compiler not available";
+	case CL_MEM_OBJECT_ALLOCATION_FAILURE:
+		return "Memory object allocation failure";
+	case CL_OUT_OF_RESOURCES:
+		return "Out of resources";
+	case CL_OUT_OF_HOST_MEMORY:
+		return "Out of host memory";
+	case CL_PROFILING_INFO_NOT_AVAILABLE:
+		return "Profiling info not available";
+	case CL_MEM_COPY_OVERLAP:
+		return "Memory copy overlap";
+	case CL_IMAGE_FORMAT_MISMATCH:
+		return "Image format mismatch";
+	case CL_IMAGE_FORMAT_NOT_SUPPORTED:
+		return "Image format not supported";
+	case CL_BUILD_PROGRAM_FAILURE:
+		return "Build program failure";
+	case CL_MAP_FAILURE:
+		return "Map failure";
+	case CL_INVALID_VALUE:
+		return "Invalid value";
+	case CL_INVALID_DEVICE_TYPE:
+		return "Invalid device type";
+	case CL_INVALID_PLATFORM:
+		return "Invalid platform";
+	case CL_INVALID_DEVICE:
+		return "Invalid device";
+	case CL_INVALID_CONTEXT:
+		return "Invalid context";
+	case CL_INVALID_QUEUE_PROPERTIES:
+		return "Invalid queue properties";
+	case CL_INVALID_COMMAND_QUEUE:
+		return "Invalid command queue";
+	case CL_INVALID_HOST_PTR:
+		return "Invalid host pointer";
+	case CL_INVALID_MEM_OBJECT:
+		return "Invalid memory object";
+	case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
+		return "Invalid image format descriptor";
+	case CL_INVALID_IMAGE_SIZE:
+		return "Invalid image size";
+	case CL_INVALID_SAMPLER:
+		return "Invalid sampler";
+	case CL_INVALID_BINARY:
+		return "Invalid binary";
+	case CL_INVALID_BUILD_OPTIONS:
+		return "Invalid build options";
+	case CL_INVALID_PROGRAM:
+		return "Invalid program";
+	case CL_INVALID_PROGRAM_EXECUTABLE:
+		return "Invalid program executable";
+	case CL_INVALID_KERNEL_NAME:
+		return "Invalid kernel name";
+	case CL_INVALID_KERNEL_DEFINITION:
+		return "Invalid kernel definition";
+	case CL_INVALID_KERNEL:
+		return "Invalid kernel";
+	case CL_INVALID_ARG_INDEX:
+		return "Invalid argument index";
+	case CL_INVALID_ARG_VALUE:
+		return "Invalid argument value";
+	case CL_INVALID_ARG_SIZE:
+		return "Invalid argument size";
+	case CL_INVALID_KERNEL_ARGS:
+		return "Invalid kernel arguments";
+	case CL_INVALID_WORK_DIMENSION:
+		return "Invalid work dimension";
+	case CL_INVALID_WORK_GROUP_SIZE:
+		return "Invalid work group size";
+	case CL_INVALID_WORK_ITEM_SIZE:
+		return "Invalid work item size";
+	case CL_INVALID_GLOBAL_OFFSET:
+		return "Invalid global offset";
+	case CL_INVALID_EVENT_WAIT_LIST:
+		return "Invalid event wait list";
+	case CL_INVALID_EVENT:
+		return "Invalid event";
+	case CL_INVALID_OPERATION:
+		return "Invalid operation";
+	case CL_INVALID_GL_OBJECT:
+		return "Invalid GL object";
+	case CL_INVALID_BUFFER_SIZE:
+		return "Invalid buffer size";
+	case CL_INVALID_MIP_LEVEL:
+		return "Invalid MIP level";
+#ifdef CL_PLATFORM_NOT_FOUND_KHR
+	/* Only defined when the ICD extension header is available. */
+	case CL_PLATFORM_NOT_FOUND_KHR:
+		return "Platform not found";
+#endif
+	default:
+		return "unknown error";
+	}
+}
+
+/*
+ * Report a failed OpenCL call.
+ *
+ * func:   name of the function in which the failure was detected.
+ * msg:    optional extra context; may be NULL, in which case it is omitted.
+ * status: OpenCL status code; printed both as text and as a number.
+ *
+ * The message is written to stderr so that diagnostics do not get mixed
+ * into the regular output of the application.
+ */
+void starpu_opencl_display_error(const char *func, const char* msg, cl_int status)
+{
+	const char *errormsg = opencl_error_string(status);
+
+	if (msg)
+		fprintf(stderr, "oops in %s (%s) ... <%s> (%d) \n", func, msg, errormsg, status);
+	else
+		fprintf(stderr, "oops in %s ... <%s> (%d) \n", func, errormsg, status);
+}

+ 6 - 10
src/profiling/bound.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -171,16 +171,13 @@ static void new_task(starpu_job_t j)
 	if (j->bound_task)
 		return;
 
-	if (STARPU_UNLIKELY(!j->footprint_is_computed))
-		_starpu_compute_buffers_footprint(j);
-
 	t = malloc(sizeof(*t));
 	memset(t, 0, sizeof(*t));
 	t->id = j->job_id;
 	t->tag_id = j->task->tag_id;
 	t->use_tag = j->task->use_tag;
 	t->cl = j->task->cl;
-	t->footprint = j->footprint;
+	t->footprint = _starpu_compute_buffers_footprint(j);
 	t->priority = j->task->priority;
 	t->deps = NULL;
 	t->depsn = 0;
@@ -209,8 +206,7 @@ void _starpu_bound_record(starpu_job_t j)
 	} else {
 		struct bound_task_pool *tp;
 
-		if (STARPU_UNLIKELY(!j->footprint_is_computed))
-			_starpu_compute_buffers_footprint(j);
+		_starpu_compute_buffers_footprint(j);
 
 		if (last && last->cl == j->task->cl && last->footprint == j->footprint)
 			tp = last;
@@ -756,7 +752,7 @@ static glp_prob *_starpu_bound_glp_resolve(int integer)
 		for (w = 0; w < nw; w++)
 			for (t = 0, tp = task_pools; tp; t++, tp = tp->next) {
 				char name[32];
-				snprintf(name, sizeof(name), "w%ut%un", w, t);
+				snprintf(name, sizeof(name), "w%dt%dn", w, t);
 				glp_set_col_name(lp, colnum(w, t), name);
 				if (integer)
 					glp_set_col_kind(lp, colnum(w, t), GLP_IV);
@@ -857,9 +853,9 @@ void starpu_bound_print(FILE *output, int integer __attribute__ ((unused))) {
 			fprintf(output, "%s key %x\n", tp->cl->model->symbol, (unsigned) tp->footprint);
 			for (w = 0; w < nw; w++)
 				if (integer)
-					fprintf(output, "\tw%ut%un %f", w, t, glp_mip_col_val(lp, colnum(w, t)));
+					fprintf(output, "\tw%dt%dn %f", w, t, glp_mip_col_val(lp, colnum(w, t)));
 				else
-					fprintf(output, "\tw%ut%un %f", w, t, glp_get_col_prim(lp, colnum(w, t)));
+					fprintf(output, "\tw%dt%dn %f", w, t, glp_get_col_prim(lp, colnum(w, t)));
 			fprintf(output, "\n");
 		}
 

+ 4 - 3
src/profiling/profiling.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -105,7 +105,7 @@ void _starpu_profiling_init(void)
 		profiling = 1;
 }
 
-void starpu_profiling_terminate(void)
+void _starpu_profiling_terminate(void)
 {
 
 }
@@ -236,7 +236,8 @@ void _starpu_worker_update_profiling_info_executing(int workerid, struct timespe
 	{
 		PTHREAD_MUTEX_LOCK(&worker_info_mutex[workerid]);
 
-		starpu_timespec_accumulate(&worker_info[workerid].executing_time, executing_time);
+		if (executing_time)
+			starpu_timespec_accumulate(&worker_info[workerid].executing_time, executing_time);
 
 		worker_info[workerid].used_cycles += used_cycles;
 		worker_info[workerid].stall_cycles += stall_cycles;

+ 3 - 1
src/profiling/profiling.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -68,4 +68,6 @@ void _starpu_profiling_set_task_push_end_time(struct starpu_task *task);
 /* This function needs to be called before other starpu_profile_* functions */
 void _starpu_profiling_init(void);
 
+void _starpu_profiling_terminate(void);
+
 #endif // __PROFILING_H__

+ 15 - 2
src/profiling/profiling_helpers.c

@@ -55,6 +55,7 @@ void starpu_worker_profiling_helper_display_summary(void)
 	int profiling = starpu_profiling_status_get();
 	fprintf(stderr, "\nWorker statistics:\n");
 	fprintf(stderr,   "******************\n");
+	double overall_time = 0;
 
 	int workerid;
 	int worker_cnt = starpu_worker_get_count();
@@ -70,20 +71,32 @@ void starpu_worker_profiling_helper_display_summary(void)
 			double total_time = starpu_timing_timespec_to_us(&info.total_time) / 1000.;
 			double executing_time = starpu_timing_timespec_to_us(&info.executing_time) / 1000.;
 			double sleeping_time = starpu_timing_timespec_to_us(&info.sleeping_time) / 1000.;
+			if (total_time > overall_time)
+				overall_time = total_time;
 
 			fprintf(stderr, "%-32s\n", name);
 			fprintf(stderr, "\t%d task(s)\n\ttotal: %.2lf ms executing: %.2lf ms sleeping: %.2lf\n", info.executed_tasks, total_time, executing_time, sleeping_time);
 			if (info.used_cycles || info.stall_cycles)
 				fprintf(stderr, "\t%lu Mcy %lu Mcy stall\n", info.used_cycles/1000000, info.stall_cycles/1000000);
 			if (info.power_consumed)
-				fprintf(stderr, "\t%lf J consumed\n", info.power_consumed);
+				fprintf(stderr, "\t%f J consumed\n", info.power_consumed);
 		} else {
-			fprintf(stderr, "\t%-32s\tapproximately %d task(s)\n", name, info.executed_tasks);
+			fprintf(stderr, "\t%-32s\t%d task(s)\n", name, info.executed_tasks);
 		}
 
 		sum_consumed += info.power_consumed;
 	}
 
+	if (profiling) {
+		const char *strval_idle_power = getenv("STARPU_IDLE_POWER");
+		if (strval_idle_power) {
+			double idle_power = atof(strval_idle_power); /* Watt */
+			double idle_consumption = idle_power * overall_time / 1000.; /* J */
+
+			fprintf(stderr, "Idle consumption: %.2lf J\n", idle_consumption);
+			sum_consumed += idle_consumption;
+		}
+	}
 	if (profiling && sum_consumed)
 		fprintf(stderr, "Total consumption: %.2lf J\n", sum_consumed);
 }

+ 15 - 35
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -214,6 +214,7 @@ static struct starpu_task *dmda_pop_every_task(unsigned sched_ctx_id)
 	return new_list;
 }
 
+static
 int _starpu_fifo_push_sorted_task(struct starpu_fifo_taskq_s *fifo_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task)
 {
 	struct starpu_task_list *list = &fifo_queue->taskq;
@@ -301,17 +302,12 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 	if (starpu_get_prefetch_flag())
 		starpu_prefetch_task_input_on_node(task, memory_node);
 
-	switch (prio) {
-		case 1:
-			return _starpu_fifo_push_prio_task(dt->queue_array[best_workerid_ctx],
-				sched_ctx->sched_mutex[best_workerid_ctx], sched_ctx->sched_cond[best_workerid_ctx], task);
-		case 2:
-			return _starpu_fifo_push_sorted_task(dt->queue_array[best_workerid_ctx],
-				sched_ctx->sched_mutex[best_workerid_ctx], sched_ctx->sched_cond[best_workerid_ctx], task);
-		default:
-			return _starpu_fifo_push_task(dt->queue_array[best_workerid_ctx],
-				sched_ctx->sched_mutex[best_workerid_ctx], sched_ctx->sched_cond[best_workerid_ctx], task);
-	}
+	if (prio)
+		return _starpu_fifo_push_sorted_task(dt->queue_array[best_workerid_ctx],
+			sched_ctx->sched_mutex[best_workerid_ctx], sched_ctx->sched_cond[best_workerid_ctx], task);
+	else
+		return _starpu_fifo_push_task(dt->queue_array[best_workerid_ctx],
+			sched_ctx->sched_mutex[best_workerid_ctx], sched_ctx->sched_cond[best_workerid_ctx], task);
 }
 
 static int _dm_push_task(struct starpu_task *task, unsigned prio, struct starpu_sched_ctx *sched_ctx)
@@ -335,7 +331,7 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, struct starpu_
 	unsigned nworkers = sched_ctx->nworkers_in_ctx;
 	for (worker_in_ctx = 0; worker_in_ctx < nworkers; worker_in_ctx++)
 	{
-                worker = sched_ctx->workerid[worker_in_ctx];
+        worker = sched_ctx->workerid[worker_in_ctx];
 		double exp_end;
 		
 		fifo = dt->queue_array[worker_in_ctx];
@@ -421,7 +417,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, struct starp
 
 	double best_exp_end = 10e240;
 	double model_best = 0.0;
-	double penality_best = 0.0;
+	//double penality_best = 0.0;
 
 	int ntasks_best = -1;
 	double ntasks_best_end = 0.0;
@@ -432,7 +428,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, struct starp
 
 	for (worker_in_ctx = 0; worker_in_ctx < nworkers_in_ctx; worker_in_ctx++)
 	{
-                worker = sched_ctx->workerid[worker_in_ctx];
+        worker = sched_ctx->workerid[worker_in_ctx];
 
 		fifo = dt->queue_array[worker_in_ctx];
 
@@ -528,7 +524,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, struct starp
 				best = worker;
 				best_in_ctx = worker_in_ctx;
 
-	//			_STARPU_DEBUG("best fitness (worker %d) %le = alpha*(%le) + beta(%le) +gamma(%le)\n", worker, best_fitness, exp_end[worker] - best_exp_end, local_data_penalty[worker], local_power[worker]);
+	//			_STARPU_DEBUG("best fitness (worker %d) %e = alpha*(%e) + beta(%e) +gamma(%e)\n", worker, best_fitness, exp_end[worker] - best_exp_end, local_data_penalty[worker], local_power[worker]);
 			}
 		}
 	}
@@ -542,12 +538,12 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, struct starp
 		 * so we force this measurement */
 		best = forced_best;
 		model_best = 0.0;
-		penality_best = 0.0;
+		//penality_best = 0.0;
 	}
 	else 
 	{
-		model_best = local_task_length[best_in_ctx];
-		penality_best = local_data_penalty[best_in_ctx];
+		model_best = local_task_length[best];
+		//penality_best = local_data_penalty[best];
 	}
 
 	/* we should now have the best worker in variable "best" */
@@ -560,12 +556,6 @@ static int dmda_push_sorted_task(struct starpu_task *task, unsigned sched_ctx_id
 	return _dmda_push_task(task, 2, sched_ctx);
 }
 
-static int dm_push_prio_task(struct starpu_task *task, unsigned sched_ctx_id)
-{
-	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
-	return _dm_push_task(task, 1, sched_ctx);
-}
-
 static int dm_push_task(struct starpu_task *task, unsigned sched_ctx_id)
 {
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
@@ -575,12 +565,6 @@ static int dm_push_task(struct starpu_task *task, unsigned sched_ctx_id)
 	return _dm_push_task(task, 0, sched_ctx);
 }
 
-static int dmda_push_prio_task(struct starpu_task *task, unsigned sched_ctx_id)
-{
-	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
-	return _dmda_push_task(task, 1, sched_ctx);
-}
-
 static int dmda_push_task(struct starpu_task *task, unsigned sched_ctx_id)
 {
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
@@ -690,7 +674,6 @@ struct starpu_sched_policy_s _starpu_sched_dm_policy = {
 	.init_sched = initialize_dmda_policy,
 	.deinit_sched = deinitialize_dmda_policy,
 	.push_task = dm_push_task, 
-	.push_prio_task = dm_push_prio_task,
 	.pop_task = dmda_pop_task,
 	.post_exec_hook = NULL,
 	.pop_every_task = dmda_pop_every_task,
@@ -703,7 +686,6 @@ struct starpu_sched_policy_s _starpu_sched_dmda_policy = {
 	.init_sched = initialize_dmda_policy,
 	.deinit_sched = deinitialize_dmda_policy,
 	.push_task = dmda_push_task, 
-	.push_prio_task = dmda_push_prio_task, 
 	.pop_task = dmda_pop_task,
 	.post_exec_hook = NULL,
 	.pop_every_task = dmda_pop_every_task,
@@ -716,7 +698,6 @@ struct starpu_sched_policy_s _starpu_sched_dmda_sorted_policy = {
 	.init_sched = initialize_dmda_sorted_policy,
 	.deinit_sched = deinitialize_dmda_policy,
 	.push_task = dmda_push_sorted_task, 
-	.push_prio_task = dmda_push_sorted_task, 
 	.pop_task = dmda_pop_ready_task,
 	.post_exec_hook = NULL,
 	.pop_every_task = dmda_pop_every_task,
@@ -729,7 +710,6 @@ struct starpu_sched_policy_s _starpu_sched_dmda_ready_policy = {
 	.init_sched = initialize_dmda_policy,
 	.deinit_sched = deinitialize_dmda_policy,
 	.push_task = dmda_push_task, 
-	.push_prio_task = dmda_push_prio_task, 
 	.pop_task = dmda_pop_ready_task,
 	.post_exec_hook = NULL,
 	.pop_every_task = dmda_pop_every_task,

+ 39 - 5
src/sched_policies/detect_combined_workers.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -14,8 +14,8 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include <starpu.h>
 #include <common/config.h>
+#include <starpu.h>
 #include <common/utils.h>
 #include <core/workers.h>
 
@@ -83,7 +83,10 @@ static int find_combinations_with_hwloc_rec(hwloc_obj_t obj, int *worker_array,
 	}
 	
 	/* If there are at least 2 valid children, we combine them. */
-	if (cpu_children_cnt > 1 && worker_cnt_rec > 0)
+	int maxsize = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
+	int minsize = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
+
+	if (cpu_children_cnt > 1 && worker_cnt_rec > 0 && worker_cnt_rec <= maxsize && worker_cnt_rec >= minsize)
 		starpu_combined_worker_assign_workerid(worker_cnt_rec, worker_array_rec);
 
 	return (cpu_children_cnt == obj->arity);
@@ -101,7 +104,9 @@ static void find_combinations_with_hwloc(struct starpu_machine_topology_s *topol
 	root = hwloc_get_obj_by_depth(topology->hwtopology, HWLOC_OBJ_SYSTEM, 0); 
 	find_combinations_with_hwloc_rec(root, worker_array, &worker_cnt);
 }
+
 #else
+
 static void find_combinations_without_hwloc(struct starpu_machine_topology_s *topology)
 {
 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
@@ -140,11 +145,40 @@ static void find_combinations_without_hwloc(struct starpu_machine_topology_s *to
 }
 #endif
 
+static void combine_all_cpu_workers(struct starpu_machine_topology_s *topology)
+{
+	struct starpu_machine_config_s *config = _starpu_get_machine_config();
+
+	int cpu_workers[STARPU_NMAXWORKERS];
+	unsigned ncpus = 0;
+
+	unsigned i;
+	for (i = 0; i < topology->nworkers; i++)
+	{
+		if (config->workers[i].perf_arch == STARPU_CPU_DEFAULT)
+			cpu_workers[ncpus++] = i;
+	}
+
+	if (ncpus > 0)
+	{
+		int ret;
+		ret = starpu_combined_worker_assign_workerid(ncpus, cpu_workers);
+		STARPU_ASSERT(ret >= 0);
+	}
+}
+
 void _starpu_sched_find_worker_combinations(struct starpu_machine_topology_s *topology)
 {
+	struct starpu_machine_config_s *config = _starpu_get_machine_config();
+
+	if (config->user_conf && config->user_conf->single_combined_worker)
+		combine_all_cpu_workers(topology);
+	else {
 #ifdef STARPU_HAVE_HWLOC
-	find_combinations_with_hwloc(topology);
+		find_combinations_with_hwloc(topology);
+		//find_combinations_without_hwloc(topology);
 #else
-	find_combinations_without_hwloc(topology);
+		find_combinations_without_hwloc(topology);
 #endif
+	}
 }

+ 1 - 25
src/sched_policies/eager_central_policy.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -97,13 +97,6 @@ static int push_task_eager_policy(struct starpu_task *task, unsigned sched_ctx_i
 	return _starpu_fifo_push_task(fifo, sched_ctx->sched_mutex[0], sched_ctx->sched_cond[0], task);
 }
 
-static int push_prio_task_eager_policy(struct starpu_task *task, unsigned sched_ctx_id)
-{
-	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
-	struct starpu_fifo_taskq_s *fifo = (struct starpu_fifo_taskq_s*)sched_ctx->policy_data;
-	return _starpu_fifo_push_prio_task(fifo, sched_ctx->sched_mutex[0], sched_ctx->sched_cond[0], task);
-}
-
 static struct starpu_task *pop_every_task_eager_policy(unsigned sched_ctx_id)
 {
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
@@ -136,26 +129,9 @@ struct starpu_sched_policy_s _starpu_sched_eager_policy = {
 	.init_sched_for_workers = initialize_eager_center_policy_for_workers,
 	.deinit_sched = deinitialize_eager_center_policy,
 	.push_task = push_task_eager_policy,
-	.push_task_notify = NULL,
-	.push_prio_task = push_prio_task_eager_policy,
 	.pop_task = pop_task_eager_policy,
 	.post_exec_hook = NULL,
 	.pop_every_task = pop_every_task_eager_policy,
 	.policy_name = "eager",
 	.policy_description = "greedy policy"
 };
-
-struct starpu_sched_policy_s _starpu_sched_no_prio_policy = {
-	.init_sched = initialize_eager_center_policy,
-	.init_sched_for_workers = initialize_eager_center_policy_for_workers,
-	.deinit_sched = deinitialize_eager_center_policy,
-	.push_task = push_task_eager_policy,
-	.push_task_notify = NULL,
-	/* we use the same method in spite of the priority */
-	.push_prio_task = push_task_eager_policy,
-	.pop_task = pop_task_eager_policy,
-	.post_exec_hook = NULL,
-	.pop_every_task = pop_every_task_eager_policy,
-	.policy_name = "no-prio",
-	.policy_description = "eager without priority"
-};

+ 1 - 2
src/sched_policies/eager_central_priority_policy.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -200,7 +200,6 @@ struct starpu_sched_policy_s _starpu_sched_prio_policy = {
 	.deinit_sched = deinitialize_eager_center_priority_policy,
 	/* we always use priorities in that policy */
 	.push_task = _starpu_priority_push_task,
-	.push_prio_task = _starpu_priority_push_task,
 	.pop_task = _starpu_priority_pop_task,
 	.post_exec_hook = NULL,
 	.pop_every_task = NULL,

+ 4 - 15
src/sched_policies/fifo_queues.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -46,26 +46,14 @@ void _starpu_destroy_fifo(struct starpu_fifo_taskq_s *fifo)
 	free(fifo);
 }
 
-int _starpu_fifo_push_prio_task(struct starpu_fifo_taskq_s *fifo_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task)
-{
-	PTHREAD_MUTEX_LOCK(sched_mutex);
-
-	STARPU_TRACE_JOB_PUSH(task, 0);
-	starpu_task_list_push_back(&fifo_queue->taskq, task);
-	fifo_queue->ntasks++;
-	fifo_queue->nprocessed++;
-
-	PTHREAD_COND_SIGNAL(sched_cond);
-	PTHREAD_MUTEX_UNLOCK(sched_mutex);
-
-	return 0;
-}
+/* TODO: revert front/back? */
 
 int _starpu_fifo_push_task(struct starpu_fifo_taskq_s *fifo_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task)
 {
 	PTHREAD_MUTEX_LOCK(sched_mutex);
 
 	STARPU_TRACE_JOB_PUSH(task, 0);
+	/* TODO: if prio, put at back */
 	starpu_task_list_push_front(&fifo_queue->taskq, task);
 	fifo_queue->ntasks++;
 	fifo_queue->nprocessed++;
@@ -94,6 +82,7 @@ struct starpu_task *_starpu_fifo_pop_task(struct starpu_fifo_taskq_s *fifo_queue
 		
 		STARPU_TRACE_JOB_POP(task, 0);
 	}
+	
 	return task;
 }
 

+ 0 - 1
src/sched_policies/fifo_queues.h

@@ -42,7 +42,6 @@ struct starpu_fifo_taskq_s*_starpu_create_fifo(void);
 void _starpu_destroy_fifo(struct starpu_fifo_taskq_s *fifo);
 
 int _starpu_fifo_push_task(struct starpu_fifo_taskq_s *fifo, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task);
-int _starpu_fifo_push_prio_task(struct starpu_fifo_taskq_s *fifo, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task);
 
 struct starpu_task *_starpu_fifo_pop_task(struct starpu_fifo_taskq_s *fifo, int workerid);
 struct starpu_task *_starpu_fifo_pop_every_task(struct starpu_fifo_taskq_s *fifo, pthread_mutex_t *sched_mutex, int workerid);

+ 39 - 23
src/sched_policies/heft.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,6 +24,7 @@
 #include <core/perfmodel/perfmodel.h>
 #include <starpu_parameters.h>
 #include <starpu_task_bundle.h>
+#include <starpu_top.h>
 
 typedef struct {
 	double alpha;
@@ -37,6 +38,21 @@ double exp_end[STARPU_NMAXWORKERS];
 double exp_len[STARPU_NMAXWORKERS];
 double ntasks[STARPU_NMAXWORKERS];
 
+
+const float alpha_minimum=0;
+const float alpha_maximum=10.0;
+const float beta_minimum=0;
+const float beta_maximum=10.0;
+const float gamma_minimum=0;
+const float gamma_maximum=10000.0;
+const float idle_power_minimum=0;
+const float idle_power_maximum=10000.0;
+
+void param_modified(struct starputop_param_t* d){
+	//just to show parameter modification
+	fprintf(stderr,"%s has been modified : alpha=%f|beta=%f|gamma=%f|idle_power=%f !\n", 
+		d->name, alpha,beta,_gamma,idle_power);
+}
 static void heft_init_for_workers(unsigned sched_ctx_id, unsigned nnew_workers)
 {
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
@@ -103,6 +119,11 @@ static void heft_init(unsigned sched_ctx_id)
 	if (strval_idle_power)
 		hd->idle_power = atof(strval_idle_power);
 
+	starputop_register_parameter_float("HEFT_ALPHA", &hd->alpha, alpha_minimum,alpha_maximum,param_modified);
+	starputop_register_parameter_float("HEFT_BETA", &hd->beta, beta_minimum,beta_maximum,param_modified);
+	starputop_register_parameter_float("HEFT_GAMMA", &hd->_gamma, gamma_minimum,gamma_maximum,param_modified);
+	starputop_register_parameter_float("HEFT_IDLE_POWER", &hd->idle_power, idle_power_minimum,idle_power_maximum,param_modified);
+
 	unsigned workerid_ctx;
 
 	for (workerid_ctx = 0; workerid_ctx < nworkers; workerid_ctx++)
@@ -181,28 +202,32 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 	PTHREAD_MUTEX_LOCK(best_worker->sched_mutex);
 	exp_end[best_workerid] += predicted;
 	exp_len[best_workerid] += predicted;
-
 	ntasks[best_workerid]++;
 	PTHREAD_MUTEX_UNLOCK(best_worker->sched_mutex);
 
 	task->predicted = predicted;
 
+	if (starpu_top_status_get())
+		starputop_task_prevision(task, best_workerid, 
+					(unsigned long long)(exp_end[best_workerid]-predicted)/1000,
+					(unsigned long long)exp_end[best_workerid]/1000);
+
 	if (starpu_get_prefetch_flag())
 	{
 		unsigned memory_node = starpu_worker_get_memory_node(best_workerid);
 		starpu_prefetch_task_input_on_node(task, memory_node);
 	}
-	
+
 	return starpu_push_local_task(best_workerid, task, prio);
 }
 
 static void compute_all_performance_predictions(struct starpu_task *task,
-						double *local_task_length, double *exp_end,
-						double *max_exp_endp, double *best_exp_endp,
-						double *local_data_penalty,
-						double *local_power, int *forced_best,
-						struct starpu_task_bundle *bundle,
-						struct starpu_sched_ctx *sched_ctx )
+					double *local_task_length, double *exp_end,
+					double *max_exp_endp, double *best_exp_endp,
+					double *local_data_penalty,
+					double *local_power, int *forced_best,
+					struct starpu_task_bundle *bundle,
+					struct starpu_sched_ctx *sched_ctx )
 {
   int calibrating = 0;
   double max_exp_end = DBL_MIN;
@@ -223,7 +248,7 @@ static void compute_all_performance_predictions(struct starpu_task *task,
       exp_start[worker] = STARPU_MAX(exp_start[worker], starpu_timing_now());
       exp_end[worker_in_ctx] = exp_start[worker] + exp_len[worker];
       if (exp_end[worker_in_ctx] > max_exp_end)
- 	max_exp_end = exp_end[worker_in_ctx];
+ 		max_exp_end = exp_end[worker_in_ctx];
 
       if (!starpu_worker_may_execute_task(worker, task))
 	{
@@ -246,8 +271,6 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 	local_power[worker_in_ctx] = starpu_task_expected_power(task, perf_arch);
       }
 
-      //      printf("%d: local task len = %2.2f perf model %d\n", worker, local_task_length[worker_in_ctx], task->cl->model->type);
-
       double ntasks_end = ntasks[worker] / starpu_worker_get_relative_speedup(perf_arch);
 
       if (ntasks_best == -1
@@ -318,9 +341,9 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 	struct starpu_task_bundle *bundle = task->bundle;
 
 	compute_all_performance_predictions(task, local_task_length, exp_end,
-					    &max_exp_end, &best_exp_end,
-					    local_data_penalty,
-					    local_power, &forced_best, bundle, sched_ctx);
+					&max_exp_end, &best_exp_end,
+					local_data_penalty,
+					local_power, &forced_best, bundle, sched_ctx);
 
 	/* If there is no prediction available for that task with that arch we
 	 * want to speed-up calibration time so we force this measurement */
@@ -400,11 +423,6 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 	return push_task_on_best_worker(task, best, model_best, prio);
 }
 
-static int heft_push_prio_task(struct starpu_task *task, unsigned sched_ctx_id)
-{
-        return _heft_push_task(task, 1, sched_ctx_id);
-}
-
 static int heft_push_task(struct starpu_task *task, unsigned sched_ctx_id)
 {
 	if (task->priority > 0)
@@ -424,13 +442,11 @@ struct starpu_sched_policy_s heft_policy = {
 	.init_sched = heft_init,
 	.deinit_sched = heft_deinit,
 	.push_task = heft_push_task, 
-	.push_prio_task = heft_push_prio_task, 
 	.push_task_notify = heft_push_task_notify,
 	.pop_task = NULL,
 	.pop_every_task = NULL,
 	.post_exec_hook = heft_post_exec_hook,
 	.policy_name = "heft",
 	.policy_description = "Heterogeneous Earliest Finish Task",
-	.init_sched_for_workers = heft_init_for_workers
-	
+	.init_sched_for_workers = heft_init_for_workers	
 };

+ 2 - 2
src/sched_policies/parallel_greedy.c

@@ -16,6 +16,7 @@
 
 #include <core/workers.h>
 #include <sched_policies/fifo_queues.h>
+#include <common/barrier.h>
 
 /* the former is the actual queue, the latter some container */
 static struct starpu_fifo_taskq_s *fifo;
@@ -42,7 +43,7 @@ static void initialize_pgreedy_policy(unsigned sched_ctx_id)
 	fifo = _starpu_create_fifo();
 
 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
-        struct starpu_machine_topology_s *topology = &config->topology;
+    struct starpu_machine_topology_s *topology = &config->topology;
 
 	_starpu_sched_find_worker_combinations(topology);
 
@@ -244,7 +245,6 @@ struct starpu_sched_policy_s _starpu_sched_pgreedy_policy = {
 	.init_sched = initialize_pgreedy_policy,
 	.deinit_sched = deinitialize_pgreedy_policy,
 	.push_task = push_task_pgreedy_policy,
-	.push_prio_task = push_task_pgreedy_policy,
 	.pop_task = pop_task_pgreedy_policy,
 	.post_exec_hook = NULL,
 	.pop_every_task = NULL,

+ 129 - 190
src/sched_policies/parallel_heft.c

@@ -19,17 +19,15 @@
 #include <float.h>
 #include <limits.h>
 #include <core/workers.h>
-#include <sched_policies/fifo_queues.h>
 #include <core/perfmodel/perfmodel.h>
 #include <starpu_parameters.h>
+#include <common/barrier.h>
 
 static pthread_mutex_t big_lock;
 
 static unsigned nworkers, ncombinedworkers;
-static enum starpu_perf_archtype applicable_perf_archtypes[STARPU_NARCH_VARIATIONS];
-static unsigned napplicable_perf_archtypes = 0;
-
-static struct starpu_fifo_taskq_s *queue_array[STARPU_NMAXWORKERS];
+//static enum starpu_perf_archtype applicable_perf_archtypes[STARPU_NARCH_VARIATIONS];
+//static unsigned napplicable_perf_archtypes = 0;
 
 static pthread_cond_t sched_cond[STARPU_NMAXWORKERS];
 static pthread_mutex_t sched_mutex[STARPU_NMAXWORKERS];
@@ -39,25 +37,33 @@ static double beta = STARPU_DEFAULT_BETA;
 static double _gamma = STARPU_DEFAULT_GAMMA;
 static double idle_power = 0.0;
 
-static struct starpu_task *parallel_heft_pop_task(void)
+static double worker_exp_start[STARPU_NMAXWORKERS];
+static double worker_exp_end[STARPU_NMAXWORKERS];
+static double worker_exp_len[STARPU_NMAXWORKERS];
+static int ntasks[STARPU_NMAXWORKERS];
+
+static void parallel_heft_post_exec_hook(struct starpu_task *task)
 {
-	struct starpu_task *task;
+	if (!task->cl || task->execute_on_a_specific_worker)
+		return;
 
 	int workerid = starpu_worker_get_id();
-	struct starpu_fifo_taskq_s *fifo = queue_array[workerid];
-	task = _starpu_fifo_pop_task(fifo, -1);
-	if (task) {
-		double model = task->predicted;
+	double model = task->predicted;
 	
-		fifo->exp_len -= model;
-		fifo->exp_start = starpu_timing_now() + model;
-		fifo->exp_end = fifo->exp_start + fifo->exp_len;
-	}
-
-	return task;
+	if (model < 0.0)
+		model = 0.0;
+	
+	/* Once we have executed the task, we can update the predicted amount
+	 * of work. */
+	PTHREAD_MUTEX_LOCK(&sched_mutex[workerid]);
+	worker_exp_len[workerid] -= model;
+	worker_exp_start[workerid] = starpu_timing_now();
+	worker_exp_end[workerid] = worker_exp_start[workerid] + worker_exp_len[workerid];
+	ntasks[workerid]--;
+	PTHREAD_MUTEX_UNLOCK(&sched_mutex[workerid]);
 }
 
-static int push_task_on_best_worker(struct starpu_task *task, int best_workerid, double predicted, int prio)
+static int push_task_on_best_worker(struct starpu_task *task, int best_workerid, double exp_end_predicted, int prio)
 {
 	/* make sure someone could execute that task ! */
 	STARPU_ASSERT(best_workerid != -1);
@@ -72,33 +78,20 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 	if (starpu_get_prefetch_flag())
 		starpu_prefetch_task_input_on_node(task, memory_node);
 
+	int ret = 0;
+
+	PTHREAD_MUTEX_LOCK(&big_lock);
+
 	if (is_basic_worker)
 	{
-		PTHREAD_MUTEX_LOCK(&big_lock);
-
-		struct starpu_fifo_taskq_s *fifo;
-		fifo = queue_array[best_workerid];
-	
-		fifo->exp_end += predicted;
-		fifo->exp_len += predicted;
-	
-		task->predicted = predicted;
+		task->predicted = exp_end_predicted - worker_exp_end[best_workerid];
+		worker_exp_len[best_workerid] += exp_end_predicted - worker_exp_end[best_workerid];
+		worker_exp_end[best_workerid] = exp_end_predicted;
+		worker_exp_start[best_workerid] = exp_end_predicted - worker_exp_len[best_workerid];
 	
-		int ret;
+		ntasks[best_workerid]++;
 
-		if (prio)
-		{
-			ret = _starpu_fifo_push_prio_task(queue_array[best_workerid],
-				&sched_mutex[best_workerid], &sched_cond[best_workerid], task);
-		}
-		else {
-			ret = _starpu_fifo_push_task(queue_array[best_workerid],
-				&sched_mutex[best_workerid], &sched_cond[best_workerid], task);
-		}
-
-		PTHREAD_MUTEX_UNLOCK(&big_lock);
-
-		return ret;
+		ret = starpu_push_local_task(best_workerid, task, prio);
 	}
 	else {
 		/* This is a combined worker so we create task aliases */
@@ -107,11 +100,6 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 		int worker_size = combined_worker->worker_size;
 		int *combined_workerid = combined_worker->combined_workerid;
 
-		int ret = 0;
-		int i;
-		
-		task->predicted = predicted;
-
 		starpu_job_t j = _starpu_get_job_associated_to_task(task);
 		j->task_size = worker_size;
 		j->combined_workerid = best_workerid;
@@ -120,36 +108,28 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 		PTHREAD_BARRIER_INIT(&j->before_work_barrier, NULL, worker_size);
 		PTHREAD_BARRIER_INIT(&j->after_work_barrier, NULL, worker_size);
 
-		PTHREAD_MUTEX_LOCK(&big_lock);
-
+		int i;
 		for (i = 0; i < worker_size; i++)
 		{
 			struct starpu_task *alias = _starpu_create_task_alias(task);
 			int local_worker = combined_workerid[i];
 
-			struct starpu_fifo_taskq_s *fifo;
-			fifo = queue_array[local_worker];
-		
-			fifo->exp_end += predicted;
-			fifo->exp_len += predicted;
-		
-			alias->predicted = predicted;
+			alias->predicted = exp_end_predicted - worker_exp_end[local_worker];
+	
+			worker_exp_len[local_worker] += exp_end_predicted - worker_exp_end[local_worker];
+			worker_exp_end[local_worker] = exp_end_predicted;
+			worker_exp_start[local_worker] = exp_end_predicted - worker_exp_len[local_worker];
 		
-			if (prio)
-			{
-				ret |= _starpu_fifo_push_prio_task(queue_array[local_worker],
-					&sched_mutex[local_worker], &sched_cond[local_worker], alias);
-			}
-			else {
-				ret |= _starpu_fifo_push_task(queue_array[local_worker],
-					&sched_mutex[local_worker], &sched_cond[local_worker], alias);
-			}
+			ntasks[local_worker]++;
+	
+			ret |= starpu_push_local_task(local_worker, alias, prio);
 		}
 
-		PTHREAD_MUTEX_UNLOCK(&big_lock);
-
-		return ret;
 	}
+
+	PTHREAD_MUTEX_UNLOCK(&big_lock);
+
+	return ret;
 }
 
 static double compute_expected_end(int workerid, double length)
@@ -157,9 +137,7 @@ static double compute_expected_end(int workerid, double length)
 	if (workerid < (int)nworkers)
 	{
 		/* This is a basic worker */
-		struct starpu_fifo_taskq_s *fifo;
-		fifo = queue_array[workerid];
-		return (fifo->exp_start + fifo->exp_len + length);
+		return worker_exp_start[workerid] + worker_exp_len[workerid] + length;
 	}
 	else {
 		/* This is a combined worker, the expected end is the end for the latest worker */
@@ -172,9 +150,9 @@ static double compute_expected_end(int workerid, double length)
 		int i;
 		for (i = 0; i < worker_size; i++)
 		{
-			struct starpu_fifo_taskq_s *fifo;
-			fifo = queue_array[combined_workerid[i]];
-			double local_exp_end = (fifo->exp_start + fifo->exp_len + length);
+			double local_exp_start = worker_exp_start[combined_workerid[i]];
+			double local_exp_len = worker_exp_len[combined_workerid[i]];
+			double local_exp_end = local_exp_start + local_exp_len + length;
 			exp_end = STARPU_MAX(exp_end, local_exp_end);
 		}
 
@@ -184,41 +162,34 @@ static double compute_expected_end(int workerid, double length)
 
 static double compute_ntasks_end(int workerid)
 {
-  enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(workerid);
-  if (workerid < (int)nworkers)
-    {
-      /* This is a basic worker */
-      struct starpu_fifo_taskq_s *fifo;
-      fifo = queue_array[workerid];
-      return fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
-    }
-  else {
-    /* This is a combined worker, the expected end is the end for the latest worker */
-    int worker_size;
-    int *combined_workerid;
-    starpu_combined_worker_get_description(workerid, &worker_size, &combined_workerid);
-
-    int ntasks_end;
-
-    int i;
-    for (i = 0; i < worker_size; i++)
-      {
-	struct starpu_fifo_taskq_s *fifo;
-	fifo = queue_array[combined_workerid[i]];
-	/* XXX: this is actually bogus: not all pushed tasks are necessarily parallel... */
-	ntasks_end = STARPU_MAX(ntasks_end, fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch));
-      }
-
-    return ntasks_end;
-  }
+	enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(workerid);
+	if (workerid < (int)nworkers)
+	{
+		/* This is a basic worker */
+		return ntasks[workerid] / starpu_worker_get_relative_speedup(perf_arch);
+	}
+	else {
+		/* This is a combined worker, the expected end is the end for the latest worker */
+		int worker_size;
+		int *combined_workerid;
+		starpu_combined_worker_get_description(workerid, &worker_size, &combined_workerid);
+
+		int ntasks_end=0;
+
+		int i;
+		for (i = 0; i < worker_size; i++)
+		{
+			/* XXX: this is actually bogus: not all pushed tasks are necessarily parallel... */
+			ntasks_end = STARPU_MAX(ntasks_end, ntasks[combined_workerid[i]] / starpu_worker_get_relative_speedup(perf_arch));
+		}
+
+		return ntasks_end;
+	}
 }
 
-static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, struct starpu_sched_ctx *sched_ctx)
+static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio)
 {
-	/* find the queue */
-	struct starpu_fifo_taskq_s *fifo;
-	unsigned worker, worker_in_ctx;
-
+	unsigned worker;
 	int best = -1;
 	
 	/* this flag is set if the corresponding worker is selected because
@@ -228,7 +199,7 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, str
 	double local_task_length[nworkers+ncombinedworkers];
 	double local_data_penalty[nworkers+ncombinedworkers];
 	double local_power[nworkers+ncombinedworkers];
-	double exp_end[nworkers+ncombinedworkers];
+	double local_exp_end[nworkers+ncombinedworkers];
 	double fitness[nworkers+ncombinedworkers];
 
 	double max_exp_end = 0.0;
@@ -236,33 +207,26 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, str
 	int skip_worker[nworkers+ncombinedworkers];
 
 	double best_exp_end = DBL_MAX;
-	double model_best = 0.0;
-	double penality_best = 0.0;
+	//double penality_best = 0.0;
 
 	int ntasks_best = -1;
 	double ntasks_best_end = 0.0;
 	int calibrating = 0;
 
-        /* A priori, we know all estimations */
+	/* A priori, we know all estimations */
 	int unknown = 0;
 
-	for (worker_in_ctx = 0; worker_in_ctx < nworkers; worker_in_ctx++)
+	for (worker = 0; worker < nworkers; worker++)
 	{
-                worker = sched_ctx->workerid[worker_in_ctx];
-
-		fifo = queue_array[worker];
-
 		/* Sometimes workers didn't take the tasks as early as we expected */
-		fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
-		fifo->exp_end = fifo->exp_start + fifo->exp_len;
-		if (fifo->exp_end > max_exp_end)
-			max_exp_end = fifo->exp_end;
+		worker_exp_start[worker] = STARPU_MAX(worker_exp_start[worker], starpu_timing_now());
+		worker_exp_end[worker] = worker_exp_start[worker] + worker_exp_len[worker];
+		if (worker_exp_end[worker] > max_exp_end)
+			max_exp_end = worker_exp_end[worker];
 	}
 
-	for (worker_in_ctx = 0; worker_in_ctx < nworkers; worker_in_ctx++)
+	for (worker = 0; worker < (nworkers+ncombinedworkers); worker++)
 	{
-                worker = sched_ctx->workerid[worker_in_ctx];
-
 		if (!starpu_combined_worker_may_execute_task(worker, task))
 		{
 			/* no one on that queue may execute this task */
@@ -304,12 +268,14 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, str
 		if (unknown)
 			continue;
 
-		exp_end[worker] = compute_expected_end(worker, local_task_length[worker]);
+		local_exp_end[worker] = compute_expected_end(worker, local_task_length[worker]);
+
+		//fprintf(stderr, "WORKER %d -> length %e end %e\n", worker, local_task_length[worker], local_exp_end[worker]);
 
-		if (exp_end[worker] < best_exp_end)
+		if (local_exp_end[worker] < best_exp_end)
 		{
 			/* a better solution was found */
-			best_exp_end = exp_end[worker];
+			best_exp_end = local_exp_end[worker];
 		}
 
 		local_power[worker] = starpu_task_expected_power(task, perf_arch);
@@ -321,13 +287,12 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, str
 		forced_best = ntasks_best;
 
 	double best_fitness = -1;
-	
+
+
 	if (forced_best == -1)
 	{
-
-	        for (worker_in_ctx = 0; worker_in_ctx < nworkers; worker_in_ctx++)
-	        {
-		        worker = sched_ctx->workerid[worker_in_ctx];
+		for (worker = 0; worker < nworkers+ncombinedworkers; worker++)
+		{
 
 			if (skip_worker[worker])
 			{
@@ -335,15 +300,15 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, str
 				continue;
 			}
 	
-			fitness[worker] = alpha*(exp_end[worker] - best_exp_end) 
+			fitness[worker] = alpha*(local_exp_end[worker] - best_exp_end) 
 					+ beta*(local_data_penalty[worker])
 					+ _gamma*(local_power[worker]);
 
-			if (exp_end[worker] > max_exp_end)
+			if (local_exp_end[worker] > max_exp_end)
 				/* This placement will make the computation
 				 * longer, take into account the idle
 				 * consumption of other cpus */
-				fitness[worker] += _gamma * idle_power * (exp_end[worker] - max_exp_end) / 1000000.0;
+				fitness[worker] += _gamma * idle_power * (local_exp_end[worker] - max_exp_end) / 1000000.0;
 
 			if (best == -1 || fitness[worker] < best_fitness)
 			{
@@ -351,53 +316,44 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, str
 				best_fitness = fitness[worker];
 				best = worker;
 			}
+
+		//	fprintf(stderr, "FITNESS worker %d -> %e local_exp_end %e - local_data_penalty %e\n", worker, fitness[worker], local_exp_end[worker] - best_exp_end, local_data_penalty[worker]);
 		}
 	}
 
 	STARPU_ASSERT(forced_best != -1 || best != -1);
-	
+
 	if (forced_best != -1)
 	{
 		/* there is no prediction available for that task
 		 * with that arch we want to speed-up calibration time
 		 * so we force this measurement */
 		best = forced_best;
-		model_best = 0.0;
-		penality_best = 0.0;
+		//penality_best = 0.0;
+		best_exp_end = local_exp_end[best];
 	}
 	else 
 	{
-		model_best = local_task_length[best];
-		penality_best = local_data_penalty[best];
+                //penality_best = local_data_penalty[best];
+		best_exp_end = local_exp_end[best];
 	}
 
 	/* we should now have the best worker in variable "best" */
-	return push_task_on_best_worker(task, best, model_best, prio);
+	return push_task_on_best_worker(task, best, best_exp_end, prio);
 }
 
-static int parallel_heft_push_prio_task(struct starpu_task *task, unsigned sched_ctx_id)
+static int parallel_heft_push_task(struct starpu_task *task)
 {
-	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
-
-	return _parallel_heft_push_task(task, 1, sched_ctx);
-}
-
-static int parallel_heft_push_task(struct starpu_task *task, unsigned sched_ctx_id)
-{ 
-	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
 	if (task->priority == STARPU_MAX_PRIO)
-	  return _parallel_heft_push_task(task, 1, sched_ctx);
+		return _parallel_heft_push_task(task, 1);
 
-	return _parallel_heft_push_task(task, 0, sched_ctx);
+	return _parallel_heft_push_task(task, 0);
 }
 
-static void initialize_parallel_heft_policy(unsigned sched_ctx_id) 
+static void initialize_parallel_heft_policy(struct starpu_machine_topology_s *topology, 
+	 __attribute__ ((unused)) struct starpu_sched_policy_s *_policy) 
 {
-	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
-
-	nworkers = sched_ctx->nworkers_in_ctx;
-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
-	struct starpu_machine_topology_s *topology = &config->topology;
+	nworkers = topology->nworkers;
 
 	const char *strval_alpha = getenv("STARPU_SCHED_ALPHA");
 	if (strval_alpha)
@@ -419,11 +375,13 @@ static void initialize_parallel_heft_policy(unsigned sched_ctx_id)
 
 	ncombinedworkers = topology->ncombinedworkers;
 
-	unsigned workerid, workerid_ctx;
-	for (workerid_ctx = 0; workerid_ctx < nworkers; workerid_ctx++)
+	unsigned workerid;
+	for (workerid = 0; workerid < nworkers; workerid++)
 	{
-                workerid = sched_ctx->workerid[workerid_ctx];
-		queue_array[workerid] = _starpu_create_fifo();
+		worker_exp_start[workerid] = starpu_timing_now();
+		worker_exp_len[workerid] = 0.0;
+		worker_exp_end[workerid] = worker_exp_start[workerid]; 
+		ntasks[workerid] = 0;
 	
 		PTHREAD_MUTEX_INIT(&sched_mutex[workerid], NULL);
 		PTHREAD_COND_INIT(&sched_cond[workerid], NULL);
@@ -435,52 +393,33 @@ static void initialize_parallel_heft_policy(unsigned sched_ctx_id)
 
 	/* We pre-compute an array of all the perfmodel archs that are applicable */
 	unsigned total_worker_count = nworkers + ncombinedworkers;
-	printf("ncombinedworkers = %d\n", ncombinedworkers);
+
 	unsigned used_perf_archtypes[STARPU_NARCH_VARIATIONS];
 	memset(used_perf_archtypes, 0, sizeof(used_perf_archtypes));
 
-	int nworkers_machine = topology->nworkers;
-
-	for (workerid_ctx = 0; workerid_ctx < total_worker_count; workerid_ctx++)
+	for (workerid = 0; workerid < total_worker_count; workerid++)
 	{
-	  workerid = (unsigned)workerid_ctx >= nworkers ? (nworkers_machine + (unsigned)workerid_ctx - nworkers) : sched_ctx->workerid[workerid_ctx];
-	  printf("workerid = %d\n", workerid);
 		enum starpu_perf_archtype perf_archtype = starpu_worker_get_perf_archtype(workerid);
-		printf("perf_archtype = %d\n", perf_archtype);
 		used_perf_archtypes[perf_archtype] = 1;
 	}
 
-	napplicable_perf_archtypes = 0;
-
-	int arch;
-	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
-	{
-		if (used_perf_archtypes[arch])
-			applicable_perf_archtypes[napplicable_perf_archtypes++] = arch;
-	}
-}
+//	napplicable_perf_archtypes = 0;
 
-static void deinitialize_parallel_heft_policy(unsigned sched_ctx_id) 
-{
-	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
-
-	unsigned workerid;
-	int workerid_in_ctx;
-        int nworkers = sched_ctx->nworkers_in_ctx;
-	for (workerid_in_ctx = 0; workerid_in_ctx < nworkers; workerid_in_ctx++){
-                workerid = sched_ctx->workerid[workerid_in_ctx];
-		_starpu_destroy_fifo(queue_array[workerid]);
-	}
+//	int arch;
+//	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
+//	{
+//		if (used_perf_archtypes[arch])
+//			applicable_perf_archtypes[napplicable_perf_archtypes++] = arch;
+//	}
 }
 
 /* TODO: use post_exec_hook to fix the expected start */
 struct starpu_sched_policy_s _starpu_sched_parallel_heft_policy = {
 	.init_sched = initialize_parallel_heft_policy,
-	.deinit_sched = deinitialize_parallel_heft_policy,
+	.deinit_sched = NULL,
 	.push_task = parallel_heft_push_task, 
-	.push_prio_task = parallel_heft_push_prio_task, 
-	.pop_task = parallel_heft_pop_task,
-	.post_exec_hook = NULL,
+	.pop_task = NULL,
+	.post_exec_hook = parallel_heft_post_exec_hook,
 	.pop_every_task = NULL,
 	.policy_name = "pheft",
 	.policy_description = "parallel HEFT"

+ 2 - 10
src/sched_policies/random_policy.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -65,18 +65,12 @@ static int _random_push_task(struct starpu_task *task, unsigned prio, struct sta
 	return n;
 }
 
-static int random_push_prio_task(struct starpu_task *task, unsigned sched_ctx_id)
-{	
-	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
-
-        return _random_push_task(task, 1, sched_ctx);
-}
 
 static int random_push_task(struct starpu_task *task, unsigned sched_ctx_id)
 {
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
 
-        return _random_push_task(task, 0, sched_ctx);
+    return _random_push_task(task, 0, sched_ctx);
 }
 
 static void initialize_random_policy_for_workers(unsigned sched_ctx_id, unsigned nnew_workers) 
@@ -129,8 +123,6 @@ struct starpu_sched_policy_s _starpu_sched_random_policy = {
 	.init_sched_for_workers = initialize_random_policy_for_workers,
 	.deinit_sched = NULL,
 	.push_task = random_push_task,
-	.push_prio_task = random_push_prio_task,
-	.push_task_notify = NULL,
 	.pop_task = NULL,
 	.post_exec_hook = NULL,
 	.pop_every_task = NULL,

+ 5 - 16
src/sched_policies/stack_queues.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -57,27 +57,16 @@ unsigned _starpu_get_stack_nprocessed(struct starpu_stack_jobq_s *stack_queue)
 	return stack_queue->nprocessed;
 }
 
-void _starpu_stack_push_prio_task(struct starpu_stack_jobq_s *stack_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, starpu_job_t task)
-{
-	PTHREAD_MUTEX_LOCK(sched_mutex);
-	total_number_of_jobs++;
-
-	STARPU_TRACE_JOB_PUSH(task, 0);
-	starpu_job_list_push_back(stack_queue->jobq, task);
-	stack_queue->njobs++;
-	stack_queue->nprocessed++;
-
-	PTHREAD_COND_SIGNAL(sched_cond);
-	PTHREAD_MUTEX_UNLOCK(sched_mutex);
-}
-
 void _starpu_stack_push_task(struct starpu_stack_jobq_s *stack_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, starpu_job_t task)
 {
 	PTHREAD_MUTEX_LOCK(sched_mutex);
 	total_number_of_jobs++;
 
 	STARPU_TRACE_JOB_PUSH(task, 0);
-	starpu_job_list_push_front(stack_queue->jobq, task);
+	if (task->task->priority)
+		starpu_job_list_push_back(stack_queue->jobq, task);
+	else
+		starpu_job_list_push_front(stack_queue->jobq, task);
 	stack_queue->njobs++;
 	stack_queue->nprocessed++;
 

+ 0 - 1
src/sched_policies/stack_queues.h

@@ -42,7 +42,6 @@ struct starpu_stack_jobq_s {
 struct starpu_stack_jobq_s *_starpu_create_stack(void);
 
 void _starpu_stack_push_task(struct starpu_stack_jobq_s *stack, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, starpu_job_t task);
-void _starpu_stack_push_prio_task(struct starpu_stack_jobq_s *stack, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, starpu_job_t task);
 
 starpu_job_t _starpu_stack_pop_task(struct starpu_stack_jobq_s *stack, pthread_mutex_t *sched_mutex, int workerid);
 

+ 4 - 3
src/sched_policies/work_stealing_policy.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -129,7 +129,9 @@ static struct starpu_deque_jobq_s *select_workerq(work_stealing_data *ws, unsign
 
 #endif
 
+#ifdef STARPU_DEVEL
 #warning TODO rewrite ... this will not scale at all now
+#endif
 static struct starpu_task *ws_pop_task(unsigned sched_ctx_id)
 {
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
@@ -252,7 +254,6 @@ struct starpu_sched_policy_s _starpu_sched_ws_policy = {
 	.init_sched = initialize_ws_policy,
 	.deinit_sched = NULL,
 	.push_task = ws_push_task,
-	.push_prio_task = ws_push_task,
 	.pop_task = ws_pop_task,
 	.post_exec_hook = NULL,
 	.pop_every_task = NULL,

+ 756 - 0
src/top/starpu_top.c

@@ -0,0 +1,756 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011 William Braik, Yann Courtois, Jean-Marie Couteyen, Anthony
+ * Roy
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <starpu_top.h>
+#include <top/starputop_message_queue.h>
+#include <top/starputop_connection.h>
+#include <profiling/profiling.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <pthread.h>
+#include <common/timing.h>
+
+extern starputop_message_queue_t*  starputop_mt;
+int starpu_top = 0;
+int starputop_debug_on = 0;
+unsigned int starputop_data_cpt = 0;
+unsigned int starputop_param_cpt = 0;
+starputop_data* starputop_first_data = NULL;
+starputop_param* starputop_first_param = NULL;
+starputop_data** starputop_datas;
+starputop_param** starputop_params;
+
+sem_t starputop_wait_for_go;
+pthread_mutex_t starputop_wait_for_continue_mutex;
+pthread_cond_t starputop_wait_for_continue_cond = PTHREAD_COND_INITIALIZER;
+
+/* Report the current value of the global starpu_top flag, which
+ * starputop_init_and_wait() sets to 1. */
+int starpu_top_status_get()
+{
+	const int top_enabled = starpu_top;
+
+	return top_enabled;
+}
+
+
+unsigned long long int current_timestamp();
+
+/*********************************************
+*****************INIT FUNC********************
+**********************************************/
+
+char *message_for_topdata_init(starputop_data* data);
+char *message_for_topparam_init(starputop_param* param);
+
+/*
+ * Mirror the registered data and parameter linked lists into the
+ * starputop_datas / starputop_params arrays, in list order, so that an
+ * entry can be reached by index in O(1) while the program is running.
+ */
+void copy_data_and_param()
+{
+	unsigned int idx;
+	starputop_data *data_it;
+	starputop_param *param_it;
+
+	printf("%s:%d trace\n", __FILE__, __LINE__);
+
+	/* one array slot per registered data */
+	starputop_datas = malloc(starputop_data_cpt*sizeof(starputop_data*));
+	data_it = starputop_first_data;
+	for(idx = 0; idx < starputop_data_cpt; idx++, data_it = data_it->next)
+		starputop_datas[idx] = data_it;
+
+	/* one array slot per registered parameter */
+	starputop_params = malloc(starputop_param_cpt*sizeof(starputop_param*));
+	param_it = starputop_first_param;
+	for(idx = 0; idx < starputop_param_cpt; idx++, param_it = param_it->next)
+		starputop_params[idx] = param_it;
+}
+
+/*
+ * Write a short textual name for the architecture of worker `id` into `type`.
+ *
+ * At most 9 bytes are written and every label is shorter than that, so the
+ * result is always NUL-terminated; the caller in this file passes a 10-byte
+ * buffer. Worker types not listed below are reported as "UNKNOWN" so that the
+ * buffer is never left uninitialised before being printed.
+ * NOTE(review): confirm the UI accepts "UNKNOWN" as a device type.
+ */
+static void starputop_get_device_type(int id, char* type){
+	const char *label;
+
+	switch (starpu_worker_get_type(id))
+	{
+	case STARPU_CPU_WORKER:
+		label = "CPU";
+		break;
+	case STARPU_CUDA_WORKER:
+		label = "CUDA";
+		break;
+	case STARPU_OPENCL_WORKER:
+		label = "OPENCL";
+		break;
+	case STARPU_GORDON_WORKER:
+		label = "GORDON";
+		break;
+	default:
+		label = "UNKNOWN";
+		break;
+	}
+
+	/* strncpy zero-pads up to 9 bytes, which terminates these short labels */
+	strncpy(type, label, 9);
+}
+
+/*
+ * Enqueue the device list for the UI: a "DEV" line, one "id;type;name" line
+ * per worker, then a closing "/DEV" line. Every line is a separately
+ * allocated buffer passed to starputop_message_add().
+ */
+static void starputop_send_devices_info()
+{
+	unsigned int worker;
+	char *line;
+
+	/* opening marker */
+	line=malloc(5*sizeof(char));
+	snprintf(line,5,"DEV\n");
+	starputop_message_add(starputop_mt,line);
+
+	worker=0;
+	while(worker<starpu_worker_get_count())
+	{
+		char dev_type[10];
+		char dev_name[64];
+
+		line=malloc(sizeof(char)*128);
+		starputop_get_device_type(worker,dev_type);
+		starpu_worker_get_name(worker, dev_name,64);
+		snprintf(line, 128, "%d;%s;%s\n", worker, dev_type, dev_name);
+		starputop_message_add(starputop_mt,line);
+		worker++;
+	}
+
+	/* closing marker */
+	line=malloc(6*sizeof(char));
+	snprintf(line,6,"/DEV\n");
+	starputop_message_add(starputop_mt,line);
+}
+
+
+/* Enqueue a heap-allocated copy of `text` on the outgoing message queue. */
+static void starputop_enqueue_copy(const char *text)
+{
+	size_t size = strlen(text)+1;
+	char *message = malloc(size);
+
+	STARPU_ASSERT(message != NULL);
+	memcpy(message, text, size);
+	starputop_message_add(starputop_mt,message);
+}
+
+/*
+ * Start starpu-top and block until the UI sends its GO message.
+ *
+ * Sets the starpu_top flag, enables profiling, creates the message queue and
+ * launches the communication threads. It then enqueues, in order: the
+ * SERVERINFO section (server_name and the current timestamp), the DATA
+ * section, the PARAMS section, the DEV section and a final READY line.
+ * Finally it waits on the starputop_wait_for_go semaphore, which
+ * starputop_unlock_starpu() posts.
+ */
+void starputop_init_and_wait(const char* server_name){
+	char *message;
+	size_t size;
+	starputop_data *cur_data;
+	starputop_param *cur_param;
+
+	starpu_top=1;
+	sem_init(&starputop_wait_for_go,0,0);
+
+	pthread_mutex_init(&starputop_wait_for_continue_mutex, NULL);
+
+	//profiling activation
+	starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
+
+	//init locked queue before adding the first message
+	starputop_mt = starputop_message_queue_new();
+
+	//waiting for UI to connect
+	printf("%s:%d launching network threads\n", __FILE__, __LINE__);
+	starputop_communications_threads_launcher();
+
+	//sending server information (report to protocol)
+	starputop_enqueue_copy("SERVERINFO\n");
+	size = strlen(server_name)+2;
+	message = malloc(size);
+	STARPU_ASSERT(message != NULL);
+	snprintf(message, size, "%s\n", server_name);
+	starputop_message_add(starputop_mt,message);
+	/* 25 bytes hold any 64-bit decimal value plus newline and terminator */
+	message = malloc(25);
+	STARPU_ASSERT(message != NULL);
+	snprintf(message, 25, "%lld\n", current_timestamp());
+	starputop_message_add(starputop_mt,message);
+	starputop_enqueue_copy("/SERVERINFO\n");
+
+	//sending data list
+	starputop_enqueue_copy("DATA\n");
+	for(cur_data = starputop_first_data; cur_data != NULL; cur_data = cur_data->next)
+		starputop_message_add(starputop_mt,message_for_topdata_init(cur_data));
+	starputop_enqueue_copy("/DATA\n");
+
+	//sending parameter list
+	starputop_enqueue_copy("PARAMS\n");
+	printf("%s:%d sending parameters\n", __FILE__, __LINE__);
+	for(cur_param = starputop_first_param; cur_param != NULL; cur_param = cur_param->next)
+		starputop_message_add(starputop_mt,message_for_topparam_init(cur_param));
+	printf("%s:%d parameters sended\n", __FILE__, __LINE__);
+	starputop_enqueue_copy("/PARAMS\n");
+
+	//sending DEVICE list
+	printf("%s:%d sending devices info\n", __FILE__, __LINE__);
+	starputop_send_devices_info();
+	printf("%s:%d devices_info sended\n", __FILE__, __LINE__);
+	//copying data and params
+	copy_data_and_param();
+
+	//sending READY message
+	starputop_enqueue_copy("READY\n");
+
+	//this thread stays blocked until a GO message is received from the UI
+	printf("%s:%d waiting for GO message\n", __FILE__, __LINE__);
+	sem_wait(&starputop_wait_for_go);
+}
+
+/* Append `data` at the tail of the list rooted at starputop_first_data. */
+void starputop_enqueue_data(starputop_data * data)
+{
+	/* walk the chain of `next` links until the first empty slot */
+	starputop_data **slot = &starputop_first_data;
+
+	while(*slot != NULL)
+		slot = &(*slot)->next;
+	*slot = data;
+}
+
+/*
+ * Allocate a boolean data descriptor named `data_name`, give it the next
+ * data id, append it to the data list and return it. `data_name` is stored
+ * by pointer, not copied.
+ */
+starputop_data * starputop_add_data_boolean(
+			const char* data_name,
+			int active)
+{
+	starputop_data *entry = malloc(sizeof(*entry));
+
+	entry->next = NULL;
+	entry->type = STARPUTOP_DATA_BOOLEAN;
+	entry->name = data_name;
+	entry->active = active;
+	entry->id = starputop_data_cpt++;
+
+	starputop_enqueue_data(entry);
+	return entry;
+}
+
+/*
+ * Allocate an integer data descriptor named `data_name` with the given
+ * minimum and maximum values, give it the next data id, append it to the
+ * data list and return it. `data_name` is stored by pointer, not copied.
+ */
+starputop_data * starputop_add_data_integer(
+			const char* data_name,
+			int minimum_value,
+			int maximum_value,
+			int active)
+{
+	starputop_data *entry = malloc(sizeof(*entry));
+
+	entry->next = NULL;
+	entry->type = STARPUTOP_DATA_INTEGER;
+	entry->name = data_name;
+	entry->int_max_value = maximum_value;
+	entry->int_min_value = minimum_value;
+	entry->active = active;
+	entry->id = starputop_data_cpt++;
+
+	starputop_enqueue_data(entry);
+	return entry;
+}
+
+/*
+ * Allocate a floating-point data descriptor named `data_name` with the given
+ * minimum and maximum values, give it the next data id, append it to the
+ * data list and return it. `data_name` is stored by pointer, not copied.
+ */
+starputop_data* starputop_add_data_float(
+			const char* data_name,
+			double minimum_value,
+			double maximum_value,
+			int active)
+{
+	starputop_data *entry = malloc(sizeof(*entry));
+
+	entry->next = NULL;
+	entry->type = STARPUTOP_DATA_FLOAT;
+	entry->name = data_name;
+	entry->double_max_value = maximum_value;
+	entry->double_min_value = minimum_value;
+	entry->active = active;
+	entry->id = starputop_data_cpt++;
+
+	starputop_enqueue_data(entry);
+	return entry;
+}
+
+/*
+ * Build the newly allocated line that announces `data` to the UI:
+ *   BOOL;<id>;<name>;<active>
+ *   INT;<id>;<name>;<min>;<max>;<active>
+ *   FLOAT;<id>;<name>;<min>;<max>;<active>
+ * where <active> is printed as 0 or 1. An unrecognised type yields an empty
+ * string rather than an uninitialised buffer.
+ */
+char *message_for_topdata_init(starputop_data* data)
+{
+	/* 1024 bytes cover the fixed text, the integers and two "%f" fields even
+	 * for values of DBL_MAX magnitude (roughly 320 characters each) */
+	size_t size = 1024+strlen(data->name);
+	char*message = malloc(size);
+
+	message[0] = '\0';
+	switch(data->type)
+	{
+		case STARPUTOP_DATA_BOOLEAN:
+			snprintf(message, size,
+					"BOOL;%d;%s;%d\n",
+					data->id,
+					data->name,
+					data->active ? 1 : 0);
+			break;
+		case STARPUTOP_DATA_INTEGER:
+			snprintf(message, size,
+					"INT;%d;%s;%d;%d;%d\n",
+					data->id,
+					data->name,
+					data->int_min_value,
+					data->int_max_value,
+					data->active ? 1 : 0);
+			break;
+		case STARPUTOP_DATA_FLOAT:
+			snprintf(message, size,
+					"FLOAT;%d;%s;%f;%f;%d\n",
+					data->id,
+					data->name,
+					data->double_min_value,
+					data->double_max_value,
+					data->active ? 1 : 0);
+			break;
+	}
+	return message;
+}
+
+/*
+ * Build the newly allocated line that announces `param` to the UI:
+ *   BOOL;<id>;<name>;<value>
+ *   INT;<id>;<name>;<min>;<max>;<value>
+ *   FLOAT;<id>;<name>;<min>;<max>;<value>
+ *   ENUM;<id>;<name>;<v0>;<v1>;...;<value>
+ * Returns NULL for an unrecognised parameter type.
+ *
+ * The buffer is sized from the parameter name (and, for enums, from the
+ * enum labels), and every formatted write is bounded by that size.
+ */
+char *message_for_topparam_init(starputop_param* param)
+{
+	char*message = NULL;
+	int i;
+	size_t length = 0;
+	/* 1024 bytes cover the fixed text, the integers and three "%f" fields
+	 * even for values of DBL_MAX magnitude (roughly 320 characters each) */
+	size_t size = 1024+strlen(param->name);
+
+	switch(param->type)
+	{
+	case STARPUTOP_PARAM_BOOLEAN:
+		message = malloc(size);
+		snprintf(message, size,
+				"BOOL;%d;%s;%d\n",
+				param->id,
+				param->name,
+				(*(int*)(param->value)) ? 1 : 0);
+		break;
+	case STARPUTOP_PARAM_INTEGER:
+		message = malloc(size);
+		snprintf(message, size,
+				"INT;%d;%s;%d;%d;%d\n",param->id,
+				param->name,
+				param->int_min_value,
+				param->int_max_value,
+				*(int*)(param->value));
+		break;
+	case STARPUTOP_PARAM_FLOAT:
+		message = malloc(size);
+		snprintf(message, size,
+				"FLOAT;%d;%s;%f;%f;%f\n",
+				param->id,
+				param->name,
+				param->double_min_value,
+				param->double_max_value,
+				*(double*)(param->value));
+		break;
+	case STARPUTOP_PARAM_ENUM:
+	{
+		char *cur;
+
+		//room for every enum label plus its ';' separator
+		for(i = 0; i < param->nb_values; i++)
+		{
+			length += strlen(param->enum_values[i])+1;
+		}
+		size += length;
+		message = malloc(size);
+		snprintf(message, size,
+				"ENUM;%d;%s;",
+				param->id,
+				param->name);
+
+		//enum labels start right after the header
+		cur = message+strlen(message);
+		for(i = 0; i < param->nb_values; i++)
+		{
+			size_t label_len = strlen(param->enum_values[i]);
+
+			memcpy(cur, param->enum_values[i], label_len);
+			cur += label_len;
+			*cur = ';';
+			cur++;
+		}
+		//current value closes the line
+		snprintf(cur, size-(size_t)(cur-message),
+				"%d\n",
+				*((int*)(param->value)));
+		break;
+	}
+	}
+	return message;
+}
+
+/* Append `param` at the tail of the list rooted at starputop_first_param. */
+void starputop_enqueue_param(starputop_param* param)
+{
+	/* walk the chain of `next` links until the first empty slot */
+	starputop_param **slot = &starputop_first_param;
+
+	while(*slot != NULL)
+		slot = &(*slot)->next;
+	*slot = param;
+}
+
+
+/*
+ * Register a boolean parameter backed by `*parameter_field`.
+ *
+ * Asserts that starpu-top has not been started yet, then allocates the
+ * descriptor, gives it the next parameter id, appends it to the parameter
+ * list and returns it. `callback` may be NULL; `param_name` and
+ * `parameter_field` are stored by pointer.
+ */
+starputop_param* starputop_register_parameter_boolean(
+			const char* param_name,
+			int* parameter_field,
+			void (*callback)(struct starputop_param_t*))
+{
+	starputop_param *entry;
+
+	STARPU_ASSERT(!starpu_top_status_get());
+	entry = malloc(sizeof(*entry));
+
+	entry->next = NULL;
+	entry->type = STARPUTOP_PARAM_BOOLEAN;
+	entry->name = param_name;
+	entry->value = (void*)parameter_field;
+	entry->callback = callback;
+	entry->id = starputop_param_cpt++;
+
+	starputop_enqueue_param(entry);
+	return entry;
+}
+
+
+/*
+ * Register an integer parameter backed by `*parameter_field`, with the given
+ * minimum and maximum values.
+ *
+ * Asserts that starpu-top has not been started yet, then allocates the
+ * descriptor, gives it the next parameter id, appends it to the parameter
+ * list and returns it. `callback` may be NULL.
+ */
+starputop_param* starputop_register_parameter_integer(const char* param_name,
+			int* parameter_field,
+			int minimum_value,
+			int maximum_value,
+			void (*callback)(struct starputop_param_t*))
+{
+	starputop_param *entry;
+
+	STARPU_ASSERT(!starpu_top_status_get());
+	entry = malloc(sizeof(*entry));
+
+	entry->next = NULL;
+	entry->type = STARPUTOP_PARAM_INTEGER;
+	entry->name = param_name;
+	entry->value = (void*)parameter_field;
+	entry->int_max_value = maximum_value;
+	entry->int_min_value = minimum_value;
+	entry->callback = callback;
+	entry->id = starputop_param_cpt++;
+
+	starputop_enqueue_param(entry);
+	return entry;
+}
+/*
+ * Register a floating-point parameter backed by `*parameter_field`, with the
+ * given minimum and maximum values.
+ *
+ * Asserts that starpu-top has not been started yet, then allocates the
+ * descriptor, gives it the next parameter id, appends it to the parameter
+ * list and returns it. `callback` may be NULL.
+ */
+starputop_param* starputop_register_parameter_float(
+			const char* param_name,
+			double* parameter_field,
+			double minimum_value,
+			double maximum_value,
+			void (*callback)(struct starputop_param_t*))
+{
+	starputop_param *entry;
+
+	STARPU_ASSERT(!starpu_top_status_get());
+	entry = malloc(sizeof(*entry));
+
+	entry->next = NULL;
+	entry->type = STARPUTOP_PARAM_FLOAT;
+	entry->name = param_name;
+	entry->value = (void*)parameter_field;
+	entry->double_max_value = maximum_value;
+	entry->double_min_value = minimum_value;
+	entry->callback = callback;
+	entry->id = starputop_param_cpt++;
+
+	starputop_enqueue_param(entry);
+	return entry;
+}
+
+/*
+ * Register an enumerated parameter backed by `*parameter_field`, whose
+ * `nb_values` labels are given in `values`.
+ *
+ * Asserts that starpu-top has not been started yet, then allocates the
+ * descriptor, gives it the next parameter id, appends it to the parameter
+ * list and returns it. `values` is stored by pointer, not copied;
+ * `callback` may be NULL.
+ */
+starputop_param* starputop_register_parameter_enum(
+			const char* param_name,
+			int* parameter_field,
+			char** values,
+			int nb_values,
+			void (*callback)(struct starputop_param_t*))
+{
+	starputop_param *entry;
+
+	STARPU_ASSERT(!starpu_top_status_get());
+	entry = malloc(sizeof(*entry));
+
+	entry->next = NULL;
+	entry->type = STARPUTOP_PARAM_ENUM;
+	entry->name = param_name;
+	entry->value = (void*)parameter_field;
+	entry->nb_values = nb_values;
+	entry->enum_values = values;
+	entry->callback = callback;
+	entry->id = starputop_param_cpt++;
+
+	starputop_enqueue_param(entry);
+	return entry;
+}
+/*********************************************
+*****************UPDATE FUNC******************
+**********************************************/
+
+/*
+ * Enqueue a "U;<id>;<0|1>;<timestamp>" update for a boolean data.
+ * Does nothing when starpu-top is not started or the data is not active.
+ */
+void starputop_update_data_boolean(const starputop_data* data, int value){
+	char *update;
+
+	if (!starpu_top_status_get() || !data->active)
+		return;
+
+	update = malloc(256+strlen(data->name));
+	sprintf(update,
+			"U;%d;%d;%lld\n",
+			data->id,
+			(value?1:0),
+			current_timestamp());
+	starputop_message_add(starputop_mt,update);
+}
+/*
+ * Enqueue a "U;<id>;<value>;<timestamp>" update for an integer data.
+ * Does nothing when starpu-top is not started or the data is not active.
+ */
+void starputop_update_data_integer(const starputop_data* data,int value){
+	char *update;
+
+	if (!starpu_top_status_get() || !data->active)
+		return;
+
+	update = malloc(256+strlen(data->name));
+	sprintf(update,
+			"U;%d;%d;%lld\n",
+			data->id,
+			value,
+			current_timestamp());
+	starputop_message_add(starputop_mt,update);
+}
+/*
+ * Enqueue a "U;<id>;<value>;<timestamp>" update for a floating-point data.
+ * Does nothing when starpu-top is not started or the data is not active.
+ *
+ * The buffer size no longer depends on the data name (which is not part of
+ * the message) and the write is bounded, so a value of very large magnitude
+ * cannot overrun it.
+ */
+void starputop_update_data_float(const starputop_data* data, double value){
+	/* "%f" of a DBL_MAX-magnitude value needs roughly 320 characters */
+	enum { update_size = 512 };
+	char *message;
+
+	if (!starpu_top_status_get())
+		return;
+	if(data->active)
+	{
+		message = malloc(update_size);
+		snprintf(message, update_size,
+				"U;%d;%f;%lld\n",
+				data->id, value,
+				current_timestamp());
+		starputop_message_add(starputop_mt,message);
+	}
+}
+/*
+ * Enqueue a "SET;<id>;<value>;<timestamp>" line carrying the current value
+ * of `param`. Does nothing when starpu-top is not started.
+ *
+ * The write is bounded and the buffer is large enough for a "%f" value of
+ * any magnitude. A parameter of unrecognised type produces no message,
+ * instead of enqueueing an uninitialised buffer.
+ */
+void starputop_update_parameter(const starputop_param* param){
+	/* "%f" of a DBL_MAX-magnitude value needs roughly 320 characters */
+	enum { update_size = 512 };
+	char *message = NULL;
+
+	if (!starpu_top_status_get())
+		return;
+
+	switch(param->type)
+	{
+		case STARPUTOP_PARAM_BOOLEAN:
+		case STARPUTOP_PARAM_INTEGER:
+		case STARPUTOP_PARAM_ENUM:
+			message = malloc(update_size);
+			snprintf(message, update_size,
+					"SET;%d;%d;%lld\n",
+					param->id,
+					*((int*)param->value),
+					current_timestamp());
+			break;
+
+		case STARPUTOP_PARAM_FLOAT:
+			message = malloc(update_size);
+			snprintf(message, update_size,
+					"SET;%d;%f;%lld\n",
+					param->id,
+					*((double*)param->value),
+					current_timestamp());
+			break;
+	}
+
+	if (message != NULL)
+		starputop_message_add(starputop_mt,message);
+}
+
+/*********************************************
+*****************DEBUG FUNC******************
+**********************************************/
+
+/*
+ * When debug mode is on, enqueue "MESSAGE;<text>\n" where <text> is
+ * `debug_message` with every ';' and newline preceded by a backslash.
+ * Does nothing when debug mode is off.
+ */
+void starputop_debug_log(const char* debug_message)
+{
+	static const char prefix[] = "MESSAGE;";
+	const char *src;
+	char *message;
+	char *dst;
+
+	if(!starputop_debug_on)
+		return;
+
+	//worst case every input character is escaped, doubling the length
+	message = malloc(strlen(debug_message)*2+16);
+	memcpy(message, prefix, sizeof(prefix)-1);
+	dst = message+sizeof(prefix)-1;
+
+	for(src = debug_message; *src != '\0'; src++)
+	{
+		//escape the protocol separators ';' and newline
+		if(*src == ';' || *src == '\n')
+			*dst++ = '\\';
+		*dst++ = *src;
+	}
+	*dst++ = '\n';
+	*dst = '\0';
+
+	starputop_message_add(starputop_mt,message);
+}
+/*
+ * When debug mode is on, enqueue "LOCK;<text>\n" (with ';' and newline
+ * escaped by a backslash) and then block the calling thread on
+ * starputop_wait_for_continue_cond until it is signalled.
+ * Does nothing when debug mode is off.
+ *
+ * NOTE(review): the condition wait has no predicate, so a spurious wakeup
+ * releases the caller early and a signal sent before the wait starts is
+ * lost.
+ */
+void starputop_debug_lock(const char* debug_message)
+{
+	static const char prefix[] = "LOCK;";
+	const char *src;
+	char *message;
+	char *dst;
+
+	if(!starputop_debug_on)
+		return;
+
+	//worst case every input character is escaped, doubling the length
+	message = malloc(strlen(debug_message)*2+16);
+	memcpy(message, prefix, sizeof(prefix)-1);
+	dst = message+sizeof(prefix)-1;
+
+	for(src = debug_message; *src != '\0'; src++)
+	{
+		if(*src == ';' || *src == '\n')
+			*dst++ = '\\';
+		*dst++ = *src;
+	}
+	dst[0] = '\n';
+	dst[1] = '\0';
+
+	starputop_message_add(starputop_mt,message);
+
+	//this thread stays blocked until a STEP message is received
+	pthread_mutex_lock(&starputop_wait_for_continue_mutex);
+	pthread_cond_wait(&starputop_wait_for_continue_cond,&starputop_wait_for_continue_mutex);
+	pthread_mutex_unlock(&starputop_wait_for_continue_mutex);
+}
+
+ 
+ 
+/********************************************
+ **************TIME FUNCTION****************
+ *******************************************/
+
+/* Read the clock with starpu_clock_gettime() and return that instant
+ * converted to milliseconds. */
+unsigned long long int current_timestamp()
+{
+	struct timespec ts;
+	unsigned long long int ms;
+
+	starpu_clock_gettime(&ts);
+	ms = starpu_timing_timespec_to_ms(&ts);
+	return ms;
+}
+
+/* Convert `ts` to milliseconds: the sum is computed in floating point and
+ * converted to an integer on return. */
+unsigned long long starpu_timing_timespec_to_ms(const struct timespec *ts)
+{
+	const double from_seconds = 1000.0*ts->tv_sec;
+	const double from_nanoseconds = 0.000001*ts->tv_nsec;
+
+	return from_seconds + from_nanoseconds;
+}
+
+/********************************************
+ **************INPUT PROCESSING**************
+ *******************************************/
+
+/*
+ * Classify an incoming UI message by its leading keyword and return the
+ * matching TOP_TYPE_* value, or TOP_TYPE_UNKNOW when no keyword matches.
+ *
+ * Each comparison length is derived from its literal, so all keywords are
+ * matched as prefixes in the same way. The original compared "STEP\n" with
+ * a length of 9, which made that one keyword an exact-match test.
+ */
+starputop_message_type starputop_get_message_type(const char* message)
+{
+	if(!strncmp("GO\n", message, sizeof("GO\n")-1))
+		return TOP_TYPE_GO;
+	else if(!strncmp("SET;", message, sizeof("SET;")-1))
+		return TOP_TYPE_SET;
+	else if(!strncmp("STEP\n", message, sizeof("STEP\n")-1))
+		return TOP_TYPE_CONTINUE;
+	else if(!strncmp("ENABLE;", message, sizeof("ENABLE;")-1))
+		return TOP_TYPE_ENABLE;
+	else if(!strncmp("DISABLE;", message, sizeof("DISABLE;")-1))
+		return TOP_TYPE_DISABLE;
+	else if(!strncmp("DEBUG;", message, sizeof("DEBUG;")-1))
+		return TOP_TYPE_DEBUG;
+	else
+		return TOP_TYPE_UNKNOW;
+}
+
+
+/* Post the starputop_wait_for_go semaphore, which releases the sem_wait()
+ * at the end of starputop_init_and_wait(), then log that StarPU started. */
+void starputop_unlock_starpu()
+{
+	sem_post(&starputop_wait_for_go);
+	printf("%s:%d starpu started\n", __FILE__, __LINE__);
+}
+
+/*
+ * Handle an "ENABLE;<id>\n" or "DISABLE;<id>\n" request: set the `active`
+ * flag of the data whose index follows the first ';'.
+ *
+ * The text arrives from the UI connection, so it is validated. A message
+ * without ';', a request received before the lookup array exists, or an id
+ * outside [0, starputop_data_cpt) is reported and ignored instead of
+ * dereferencing a bad pointer. When a newline follows the id it is
+ * overwritten with '\0', so `message` is modified in place.
+ */
+void starputop_change_data_active(char* message, int active)
+{
+	char *debut;
+	char *fin;
+	int data_id;
+
+	debut = strchr(message, ';');
+	if (debut == NULL)
+	{
+		printf("%s:%d malformed message : '%s'\n", __FILE__, __LINE__, message);
+		return;
+	}
+	debut++;
+
+	/* cut the line at its newline, when there is one */
+	fin = strchr(debut, '\n');
+	if (fin != NULL)
+		*fin = '\0';
+
+	data_id = atoi(debut);
+	if (starputop_datas == NULL || data_id < 0
+			|| (unsigned int)data_id >= starputop_data_cpt)
+	{
+		printf("%s:%d unknown data id %d\n", __FILE__, __LINE__, data_id);
+		return;
+	}
+
+	printf("%s:%d data %d %s\n", __FILE__, __LINE__, data_id, active ? "ENABLED" : "DISABLE");
+	starputop_datas[data_id]->active = active;
+}
+
+/*
+ * Handle a "SET;<id>;<value>" request: store <value> into the variable
+ * backing parameter <id>, then invoke the parameter callback if it has one.
+ *
+ * The text arrives from the UI connection, so it is validated. A message
+ * missing either ';', a request received before the lookup array exists, or
+ * an id outside [0, starputop_param_cpt) is reported and ignored instead of
+ * dereferencing a bad pointer.
+ */
+void starputop_change_parameter_value(const char* message){
+	const char *tmp;
+	int param_id;
+	starputop_param *param;
+
+	tmp = strchr(message, ';');
+	if (tmp == NULL)
+	{
+		printf("%s:%d malformed message : '%s'\n", __FILE__, __LINE__, message);
+		return;
+	}
+	tmp++;
+
+	param_id = atoi(tmp);
+	if (starputop_params == NULL || param_id < 0
+			|| (unsigned int)param_id >= starputop_param_cpt)
+	{
+		printf("%s:%d unknown parameter id %d\n", __FILE__, __LINE__, param_id);
+		return;
+	}
+	param = starputop_params[param_id];
+
+	/* the new value follows the second ';' */
+	tmp = strchr(tmp, ';');
+	if (tmp == NULL)
+	{
+		printf("%s:%d malformed message : '%s'\n", __FILE__, __LINE__, message);
+		return;
+	}
+	tmp++;
+
+	switch(param->type)
+	{
+		case STARPUTOP_PARAM_BOOLEAN:
+		case STARPUTOP_PARAM_INTEGER:
+		case STARPUTOP_PARAM_ENUM:
+			*(int*)param->value = atoi(tmp);
+		break;
+
+		case STARPUTOP_PARAM_FLOAT:
+			*(double*)param->value = atof(tmp);
+		break;
+	}
+	if(param->callback != NULL)
+		param->callback(param);
+}
+
+/*
+ * Handle a "DEBUG;<mode>" request: debug mode is switched on when the text
+ * after the first ';' starts with "ON" and off otherwise. The received
+ * message is then copied and enqueued back on the outgoing queue.
+ */
+void starputop_change_debug_mode(const char*message)
+{
+	const char *mode = strstr(message, ";")+1;
+	size_t echo_size = strlen(message)+1;
+	char *echo;
+
+	starputop_debug_on = (strncmp("ON", mode, 2) == 0) ? 1 : 0;
+	if(starputop_debug_on)
+		printf("%s:%d debug is now ON\n", __FILE__, __LINE__);
+	else
+		printf("%s:%d debug is now OFF\n", __FILE__, __LINE__);
+
+	echo = malloc(echo_size);
+	memcpy(echo, message, echo_size);
+	starputop_message_add(starputop_mt,echo);
+}
+
+/*
+ * Signal starputop_wait_for_continue_cond, the condition variable on which
+ * starputop_debug_lock() blocks, so that a thread waiting there can resume.
+ * The signal is sent without holding starputop_wait_for_continue_mutex.
+ */
+void starputop_debug_next_step()
+{
+	pthread_cond_signal(&starputop_wait_for_continue_cond);
+}
+
+
+/*
+ * Dispatch one line received from the UI to its handler, according to the
+ * type returned by starputop_get_message_type(). Messages of any other type
+ * are logged and otherwise ignored.
+ */
+void starputop_process_input_message(char *buffer)
+{
+	const starputop_message_type kind = starputop_get_message_type(buffer);
+
+	if(kind == TOP_TYPE_GO)
+		starputop_unlock_starpu();
+	else if(kind == TOP_TYPE_ENABLE)
+		starputop_change_data_active(buffer, 1);
+	else if(kind == TOP_TYPE_DISABLE)
+		starputop_change_data_active(buffer, 0);
+	else if(kind == TOP_TYPE_SET)
+		starputop_change_parameter_value(buffer);
+	else if(kind == TOP_TYPE_DEBUG)
+		starputop_change_debug_mode(buffer);
+	else if(kind == TOP_TYPE_CONTINUE)
+		starputop_debug_next_step();
+	else
+		printf("%s:%d unknow message : '%s'\n", __FILE__, __LINE__, buffer);
+}
+
+

+ 168 - 0
src/top/starputop_connection.c

@@ -0,0 +1,168 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011 William Braik, Yann Courtois, Jean-Marie Couteyen, Anthony
+ * Roy
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_config.h>
+
+#ifdef STARPU_HAVE_WINDOWS
+#  include <w32api.h>
+#  define WINVER WindowsXP
+#  include <ws2tcpip.h>
+#else
+#  include <sys/socket.h>
+#  include <netinet/in.h>
+#  include <netdb.h>
+#endif
+
+#include <top/starputop_connection.h>
+#include <top/starputop_message_queue.h>
+#include <starpu_top.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+/* Service (port) string passed to getaddrinfo() for the UI listening socket. */
+const char *STARPUTOP_PORT = "2011";
+/* Size of the buffer used by message_from_ui() to read one line from the UI. */
+const int STARPUTOP_BUFFER_SIZE=1024;
+
+/* Queue of outgoing messages; message_to_ui() removes and sends them. */
+extern starputop_message_queue_t*  starputop_mt;
+
+//client socket wrapped in stdio streams by fdopen(): one for reading, one for writing
+FILE* starputop_socket_fd_read;
+FILE* starputop_socket_fd_write;
+//client socket (file descriptor); replaced by its dup() before the write stream is opened
+int starputop_socket_fd;
+
+
+/*
+ * Thread routine reading the messages sent by the UI.
+ *
+ * Reads the client stream line by line and hands each line to
+ * starputop_process_input_message(). When fgets() fails (end of file or read
+ * error) the connection is considered dropped: "GO", "DEBUG;OFF" and "STEP"
+ * messages are injected locally to unlock StarPU, and the thread terminates.
+ *
+ * p: unused.
+ * Always returns NULL.
+ */
+void * message_from_ui(void * p)
+{
+	(void) p;
+	char str[STARPUTOP_BUFFER_SIZE];
+
+	/* only touch str once fgets() has reported success: on failure its
+	 * content is stale or indeterminate */
+	while(fgets (str, STARPUTOP_BUFFER_SIZE, starputop_socket_fd_read) != NULL)
+	{
+		printf("Message from UI : %s",str);
+		starputop_process_input_message(str);
+	}
+
+	fprintf(stderr,"Connection dropped\n");
+	//unlocking StarPU.
+	starputop_process_input_message("GO\n");
+	starputop_process_input_message("DEBUG;OFF\n");
+	starputop_process_input_message("STEP\n");
+	return NULL;
+}
+
+/*
+ * Thread routine sending the queued messages to the UI.
+ *
+ * Each message removed from starputop_mt is written to the client stream,
+ * flushed, and freed. Once a write or a flush fails, the connection is
+ * considered dropped: messages are then removed and freed forever without
+ * being sent. This function never returns in practice.
+ *
+ * p: unused.
+ */
+void * message_to_ui(void * p)
+{
+	(void) p;
+	for(;;)
+	{
+		char* outgoing = starputop_message_remove(starputop_mt);
+		int expected=strlen(outgoing);
+		int written=fwrite(outgoing, sizeof(char), expected, starputop_socket_fd_write);
+		int flushed=fflush(starputop_socket_fd_write);
+		free(outgoing);
+
+		if (written==expected && flushed!=EOF)
+			continue;
+
+		fprintf(stderr,"Connection dropped : message no longer send\n");
+		/* keep draining the queue so that queued messages are still freed */
+		for(;;)
+			free(starputop_message_remove(starputop_mt));
+	}
+	return NULL;
+}
+
+/*
+ * Wait for the UI to connect, then start the two communication threads.
+ *
+ * Opens a listening stream socket on STARPUTOP_PORT, blocks in accept() until
+ * a client connects, wraps the connection in a read stream and, through a
+ * dup()ed descriptor, in a write stream, closes the listening socket and
+ * finally creates message_from_ui() and message_to_ui() as detached threads.
+ * Any failure during this set-up terminates the process with EXIT_FAILURE.
+ */
+void starputop_communications_threads_launcher()
+{
+	pthread_t from_ui;
+	pthread_t to_ui;
+	pthread_attr_t threads_attr;
+
+	//Connection to UI & socket initialization
+	printf("%s:%d Connection to UI initilization\n",__FILE__, __LINE__);
+	struct sockaddr_storage from;
+	struct addrinfo req, *ans;
+	int code;
+
+	/* every hints field that is not set explicitly must be zero/NULL */
+	memset(&req, 0, sizeof(req));
+	req.ai_flags = AI_PASSIVE;
+	req.ai_family = PF_UNSPEC;
+	req.ai_socktype = SOCK_STREAM;
+	req.ai_protocol = 0;
+
+	if ((code = getaddrinfo(NULL, STARPUTOP_PORT, &req, &ans)) != 0)
+	{
+		fprintf(stderr, " getaddrinfo failed %d\n", code);
+		exit(EXIT_FAILURE);
+	}
+
+	int sock=socket(ans->ai_family, ans->ai_socktype, ans->ai_protocol);
+	if (sock == -1)
+	{
+		perror("socket");
+		exit(EXIT_FAILURE);
+	}
+
+	/* best effort: a failure here only affects quick rebinding of the port */
+	int optval = 1;
+	setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof(optval));
+
+	if (bind(sock, ans->ai_addr, ans->ai_addrlen) < 0)
+	{
+		perror("bind");
+		exit(EXIT_FAILURE);
+	}
+
+	/* the address list is no longer needed once the socket is bound */
+	freeaddrinfo(ans);
+
+	if (listen(sock, 2) < 0)
+	{
+		perror("listen");
+		exit(EXIT_FAILURE);
+	}
+
+	socklen_t len = sizeof(from);
+
+	if ((starputop_socket_fd=accept(sock, (struct sockaddr *) &from, &len)) ==-1)
+	{
+		fprintf(stderr, "accept error\n");
+		perror("accept");
+		exit(EXIT_FAILURE);
+	}
+
+	if ( (starputop_socket_fd_read=fdopen(starputop_socket_fd, "r")) == NULL)
+	{
+		perror("fdopen");
+		exit(EXIT_FAILURE);
+	}
+
+	/* the write stream gets its own descriptor on the same connection */
+	if ((starputop_socket_fd=dup(starputop_socket_fd)) == -1)
+	{
+		perror("dup");
+		exit(EXIT_FAILURE);
+	}
+
+	if ((starputop_socket_fd_write=fdopen(starputop_socket_fd, "w")) == NULL)
+	{
+		perror("fdopen");
+		exit(EXIT_FAILURE);
+	}
+
+	close(sock);
+
+	//Threads creation
+	fprintf(stderr,"Threads Creation\n");
+	pthread_attr_init(&threads_attr);
+	pthread_attr_setdetachstate(&threads_attr, PTHREAD_CREATE_DETACHED);
+
+	code = pthread_create(&from_ui, &threads_attr, message_from_ui, NULL);
+	if (code == 0)
+		code = pthread_create(&to_ui, &threads_attr, message_to_ui, NULL);
+
+	/* the attribute object is not needed after the threads are created */
+	pthread_attr_destroy(&threads_attr);
+
+	if (code != 0)
+	{
+		fprintf(stderr, "pthread_create failed %d\n", code);
+		exit(EXIT_FAILURE);
+	}
+}
+

+ 44 - 0
src/top/starputop_connection.h

@@ -0,0 +1,44 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011 William Braik, Yann Courtois, Jean-Marie Couteyen, Anthony
+ * Roy
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPUTOP_CONNECTION_H__
+#define __STARPUTOP_CONNECTION_H__
+
+#include <stdlib.h>
+#include <top/starputop_message_queue.h>
+#include <starpu_top.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+  extern starputop_message_queue_t*  starputop_mt;
+
+/*
+ * This function initializes the two communication threads.
+ * It initializes the connection and then launches the threads.
+ * The function waits for the UI to connect before launching the threads.
+ * About starputop_mt: the queue MUST be allocated before the call.
+ * All messages in the queue are freed after use.
+ */
+  void starputop_communications_threads_launcher();
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __STARPUTOP_CONNECTION_H__
+

+ 109 - 0
src/top/starputop_message_queue.c

@@ -0,0 +1,109 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011 William Braik, Yann Courtois, Jean-Marie Couteyen, Anthony
+ * Roy
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include  "starputop_message_queue.h"
+#include  <string.h>
+#include  <stdio.h>
+#include  <stdlib.h>
+
+//this global queue is used both by API and by network threads
+starputop_message_queue_t*  starputop_mt = NULL;
+
+
+/*
+ * Append msg at the tail of the queue s and post the queue semaphore once.
+ *
+ * s:   queue to append to; when NULL the call only prints a diagnostic.
+ * msg: message string stored as-is (not copied) in the new queue item.
+ *
+ * When the queue item cannot be allocated, a diagnostic is printed and msg
+ * is not queued.
+ *
+ * Will always return the pointer to starputop_message_queue
+ */
+starputop_message_queue_t* starputop_message_add(
+			starputop_message_queue_t* s,
+			char* msg)
+{
+	/* validate the queue before touching its mutex */
+	if( NULL == s )
+	{
+		printf("Queue not initialized\n");
+		return s;
+	}
+
+	/* allocate outside of the critical section */
+	starputop_message_queue_item_t* p = malloc( 1 * sizeof(*p) );
+	if( NULL == p )
+	{
+		fprintf(stderr, "IN %s, %s: malloc() failed\n", __FILE__, "list_add");
+		return s;
+	}
+
+	p->message = msg;
+	p->next = NULL;
+
+	pthread_mutex_lock(&(s->mutex));
+	if( NULL == s->head )
+	{
+		/* empty queue: the new item is both head and tail */
+		s->head = p;
+	}
+	else
+	{
+		s->tail->next = p;
+	}
+	s->tail = p;
+	/* one post per queued item, matched by the sem_wait() of the remover */
+	sem_post(&(s->semaphore));
+	pthread_mutex_unlock(&(s->mutex));
+	return s;
+}
+
+/*
+ * Remove the first item of the queue s and return its message.
+ * This is a queue and it is FIFO, so the first element is always the one
+ * removed. The call waits on the queue semaphore, which is posted once per
+ * added message, before unlinking the head item.
+ *
+ * s: queue to remove from; when NULL a diagnostic is printed and NULL is
+ *    returned.
+ *
+ * Returns the message pointer stored in the removed item; the item itself is
+ * freed here, the message is not.
+ *
+ * NOTE(review): the return value of sem_wait() is not checked; confirm that
+ * an interrupted wait cannot happen in the calling threads.
+ */
+char* starputop_message_remove(starputop_message_queue_t* s)
+{
+	/* validate the queue before touching its semaphore */
+	if( NULL == s )
+	{
+		printf("List is null\n");
+		return NULL;
+	}
+
+	sem_wait(&(s->semaphore));
+
+	pthread_mutex_lock(&(s->mutex));
+	starputop_message_queue_item_t* h = s->head;
+	char* value = h->message;
+	s->head = h->next;
+	if( NULL == s->head )
+		//the element tail was pointing to is being freed, so we need an update
+		s->tail = NULL;
+	free(h);
+	pthread_mutex_unlock(&(s->mutex));
+	return value;
+}
+
+
+/*
+ * Allocate and initialize an empty message queue.
+ *
+ * The queue starts with no item, a semaphore whose count is 0 and an
+ * initialized mutex.
+ *
+ * Returns the new queue, or NULL when the allocation fails.
+ */
+starputop_message_queue_t* starputop_message_queue_new(void)
+{
+	starputop_message_queue_t* p = malloc( 1 * sizeof(*p));
+	if( NULL == p )
+	{
+		fprintf(stderr, "LINE: %d, malloc() failed\n", __LINE__);
+		/* nothing to initialize: report the failure to the caller */
+		return NULL;
+	}
+
+	p->head = p->tail = NULL;
+	sem_init(&(p->semaphore),0,0);
+	pthread_mutex_init(&(p->mutex), NULL);
+	return p;
+}

+ 50 - 0
src/top/starputop_message_queue.h

@@ -0,0 +1,50 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011 William Braik, Yann Courtois, Jean-Marie Couteyen, Anthony
+ * Roy
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <sys/types.h>
+#include <semaphore.h> 
+#include <pthread.h>
+
+#ifndef __STARPUTOP_MESSAGE_QUEUE_H__
+#define __STARPUTOP_MESSAGE_QUEUE_H__
+
+/* One item of the singly linked message queue. */
+typedef struct starputop_message_queue_item
+{
+	/* message string carried by this item */
+	char *message;
+	/* next item in the queue, NULL for the last one */
+	struct starputop_message_queue_item* next;
+} starputop_message_queue_item_t;
+
+/* FIFO queue of messages: items are added at the tail and removed at the head. */
+typedef struct starputop_message_queue
+{
+	/* first item, the next one to be removed; NULL when the queue is empty */
+	struct starputop_message_queue_item* head;
+	/* last item, where new items are appended; NULL when the queue is empty */
+	struct starputop_message_queue_item* tail;
+	/* posted once per added message and waited on before each removal */
+	sem_t semaphore;
+	/* locked while head and tail are read or modified */
+	pthread_mutex_t mutex;
+} starputop_message_queue_t;
+
+
+/* Append a message at the tail of the queue; returns the queue it was given. */
+starputop_message_queue_t *starputop_message_add(
+			starputop_message_queue_t*,
+			char*);
+
+/* Wait for a message to be available, remove the first one and return it. */
+char* starputop_message_remove(starputop_message_queue_t*);
+
+/* Allocate and initialize an empty queue. */
+starputop_message_queue_t* starputop_message_queue_new();
+/* NOTE(review): no definition of this function is visible in
+ * starputop_message_queue.c -- confirm it is implemented elsewhere. */
+starputop_message_queue_t* starputop_message_queue_free(
+			starputop_message_queue_t*);
+
+#endif

+ 97 - 0
src/top/starputop_task.c

@@ -0,0 +1,97 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011 William Braik, Yann Courtois, Jean-Marie Couteyen, Anthony
+ * Roy
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_top.h>
+#include <top/starputop_message_queue.h>
+#include <top/starputop_connection.h>
+#include <core/task.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/time.h>
+#include <common/timing.h>
+
+/********************************************
+ **************TASK RELATED FUNCTIONS********
+ *******************************************/
+
+/*
+ * Queue a "START;<job id>;<device id>;<time>" message for the UI.
+ *
+ * task:  task that started; its job id identifies it in the message.
+ * devid: identifier of the device, copied into the message.
+ * ts:    start time, converted with starpu_timing_timespec_to_ms().
+ *
+ * The message buffer is handed over to the starputop_mt queue. When the
+ * buffer cannot be allocated the event is reported on stderr and dropped.
+ */
+void starputop_task_started(
+			struct starpu_task *task,
+			int devid,
+			const struct timespec *ts)
+{
+	unsigned long long taskid = _starpu_get_job_associated_to_task(task)->job_id;
+	STARPU_ASSERT(starpu_top_status_get());
+
+	/* one size for both the allocation and the formatting limit */
+	const size_t str_size = 64;
+	char *str = malloc(str_size);
+	if (str == NULL)
+	{
+		fprintf(stderr, "%s:%d malloc() failed\n", __FILE__, __LINE__);
+		return;
+	}
+
+	snprintf(str, str_size,
+				"START;%llu;%d;%llu\n",
+				taskid,
+				devid,
+				starpu_timing_timespec_to_ms(ts));
+
+	starputop_message_add(starputop_mt, str);
+}
+
+/*
+ * Queue an "END;<job id>;<time>" message for the UI.
+ *
+ * task:  task that ended; its job id identifies it in the message.
+ * devid: unused.
+ * ts:    end time, converted with starpu_timing_timespec_to_ms().
+ *
+ * The message buffer is handed over to the starputop_mt queue. When the
+ * buffer cannot be allocated the event is reported on stderr and dropped.
+ */
+void starputop_task_ended(
+			struct starpu_task *task,
+			int devid,
+			const struct timespec *ts)
+{
+	unsigned long long taskid = _starpu_get_job_associated_to_task(task)->job_id;
+	(void) devid; //unused
+	STARPU_ASSERT(starpu_top_status_get());
+
+	/* one size for both the allocation and the formatting limit */
+	const size_t str_size = 64;
+	char *str = malloc(str_size);
+	if (str == NULL)
+	{
+		fprintf(stderr, "%s:%d malloc() failed\n", __FILE__, __LINE__);
+		return;
+	}
+
+	snprintf(str, str_size,
+				"END;%llu;%llu\n",
+				taskid,
+				starpu_timing_timespec_to_ms(ts));
+
+	starputop_message_add(starputop_mt, str);
+}
+
+/*
+ * Variant of starputop_task_prevision() taking the predicted start and end
+ * as struct timespec values: both are converted with
+ * starpu_timing_timespec_to_ms() and forwarded together with task and devid.
+ */
+void starputop_task_prevision_timespec(
+			struct starpu_task *task,
+			int devid, 
+			const struct timespec* start, 
+			const struct timespec* end)
+{
+	starputop_task_prevision(task, 
+							devid, 
+							starpu_timing_timespec_to_ms(start),
+							starpu_timing_timespec_to_ms(end));
+}
+
+/*
+ * Queue a "PREV;<job id>;<device id>;<now>;<start>;<end>" message for the UI.
+ *
+ * task:  task the prediction is about; its job id identifies it.
+ * devid: identifier of the device, copied into the message.
+ * start: predicted start, copied into the message as-is.
+ * end:   predicted end, copied into the message as-is.
+ *
+ * <now> is the current time read with starpu_clock_gettime() and converted
+ * with starpu_timing_timespec_to_ms(). The message buffer is handed over to
+ * the starputop_mt queue. When the buffer cannot be allocated the event is
+ * reported on stderr and dropped.
+ */
+void starputop_task_prevision(
+			struct starpu_task *task,
+			int devid,
+			unsigned long long start,
+			unsigned long long end)
+{
+	unsigned long long taskid = _starpu_get_job_associated_to_task(task)->job_id;
+	STARPU_ASSERT(starpu_top_status_get());
+	struct timespec now;
+	starpu_clock_gettime(&now);
+
+	/* one size for both the allocation and the formatting limit; the longest
+	 * possible message (four 20-digit fields and an int) fits in 128 bytes */
+	const size_t str_size = 128;
+	char * str=malloc(str_size);
+	if (str == NULL)
+	{
+		fprintf(stderr, "%s:%d malloc() failed\n", __FILE__, __LINE__);
+		return;
+	}
+
+	snprintf(str, str_size,
+				"PREV;%llu;%d;%llu;%llu;%llu\n",
+				taskid,
+				devid,
+				starpu_timing_timespec_to_ms(&now),
+				start,
+				end);
+
+	starputop_message_add(starputop_mt, str);
+}

+ 7 - 7
src/util/malloc.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -31,7 +31,7 @@ struct malloc_pinned_codelet_struct {
 #endif
 
 //#ifdef STARPU_USE_OPENCL
-//static void malloc_pinned_opencl_codelet(void *buffers[] __attribute__((unused)), void *arg)
+//static void malloc_pinned_opencl_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *arg)
 //{
 //	struct malloc_pinned_codelet_struct *s = arg;
 //        //        *(s->ptr) = malloc(s->dim);
@@ -40,7 +40,7 @@ struct malloc_pinned_codelet_struct {
 //#endif
 
 #ifdef STARPU_USE_CUDA
-static void malloc_pinned_cuda_codelet(void *buffers[] __attribute__((unused)), void *arg)
+static void malloc_pinned_cuda_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *arg)
 {
 	struct malloc_pinned_codelet_struct *s = arg;
 
@@ -67,7 +67,7 @@ static starpu_codelet malloc_pinned_cl = {
 };
 #endif
 
-int starpu_data_malloc_pinned_if_possible(void **A, size_t dim)
+int starpu_malloc(void **A, size_t dim)
 {
 	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
 		return -EDEADLK;
@@ -132,7 +132,7 @@ int starpu_data_malloc_pinned_if_possible(void **A, size_t dim)
 }
 
 #ifdef STARPU_USE_CUDA
-static void free_pinned_cuda_codelet(void *buffers[] __attribute__((unused)), void *arg)
+static void free_pinned_cuda_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *arg)
 {
 	cudaError_t cures;
 	cures = cudaFreeHost(arg);
@@ -142,7 +142,7 @@ static void free_pinned_cuda_codelet(void *buffers[] __attribute__((unused)), vo
 #endif
 
 //#ifdef STARPU_USE_OPENCL
-//static void free_pinned_opencl_codelet(void *buffers[] __attribute__((unused)), void *arg)
+//static void free_pinned_opencl_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *arg)
 //{
 //        //        free(arg);
 //        int err = clReleaseMemObject(arg);
@@ -166,7 +166,7 @@ static starpu_codelet free_pinned_cl = {
 };
 #endif
 
-int starpu_data_free_pinned_if_possible(void *A)
+int starpu_free(void *A)
 {
 	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
 		return -EDEADLK;

+ 6 - 4
src/util/starpu_cublas.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,16 +20,18 @@
 #include <common/config.h>
 
 #ifdef STARPU_USE_CUDA
-static void init_cublas_func(void *args __attribute__((unused)))
+static void init_cublas_func(void *args STARPU_ATTRIBUTE_UNUSED)
 {
 	cublasStatus cublasst = cublasInit();
 	if (STARPU_UNLIKELY(cublasst))
 		STARPU_CUBLAS_REPORT_ERROR(cublasst);
 
+#if CUDA_VERSION >= 3010
 	cublasSetKernelStream(starpu_cuda_get_local_stream());
+#endif
 }
 
-static void shutdown_cublas_func(void *args __attribute__((unused)))
+static void shutdown_cublas_func(void *args STARPU_ATTRIBUTE_UNUSED)
 {
 	cublasShutdown();
 }

+ 2 - 2
src/util/starpu_insert_task.c

@@ -62,7 +62,7 @@ void starpu_unpack_cl_args(void *_cl_arg, ...)
 	va_end(varg_list);
 }
 
-void starpu_insert_task(starpu_codelet *cl, ...)
+int starpu_insert_task(starpu_codelet *cl, ...)
 {
 	va_list varg_list;
 
@@ -77,5 +77,5 @@ void starpu_insert_task(starpu_codelet *cl, ...)
 
 	va_start(varg_list, cl);
         struct starpu_task *task = starpu_task_create();
-        _starpu_insert_task_create_and_submit(arg_buffer, cl, &task, varg_list);
+        return _starpu_insert_task_create_and_submit(arg_buffer, cl, &task, varg_list);
 }

+ 19 - 11
src/util/starpu_insert_task_utils.c

@@ -27,6 +27,7 @@ struct insert_task_cb_wrapper {
 	void *arg_stack;
 };
 
+static
 void starpu_task_insert_callback_wrapper(void *_cl_arg_wrapper)
 {
 	struct insert_task_cb_wrapper *cl_arg_wrapper = _cl_arg_wrapper;
@@ -68,9 +69,12 @@ size_t _starpu_insert_task_get_arg_size(va_list varg_list)
 		else if (arg_type==STARPU_PRIORITY) {
 			va_arg(varg_list, int);
 		}
-		else if (arg_type==STARPU_EXECUTE) {
+		else if (arg_type==STARPU_EXECUTE_ON_NODE) {
 			va_arg(varg_list, int);
 		}
+		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
+			va_arg(varg_list, starpu_data_handle);
+		}
 	}
 
 	va_end(varg_list);
@@ -122,18 +126,23 @@ int _starpu_pack_cl_args(size_t arg_buffer_size, char **arg_buffer, va_list varg
 		{
 			va_arg(varg_list, int);
 		}
-		else if (arg_type==STARPU_EXECUTE) {
+		else if (arg_type==STARPU_EXECUTE_ON_NODE) {
 			va_arg(varg_list, int);
 		}
+		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
+			va_arg(varg_list, starpu_data_handle);
+		}
 	}
 
 	(*arg_buffer)[0] = nargs;
 	va_end(varg_list);
 	return 0;
 }
-static void _starpu_prepare_task(char *arg_buffer, starpu_codelet *cl, struct starpu_task **task, va_list varg_list, unsigned *ctx) {
-        int arg_type;
+
+int _starpu_insert_task_create_and_submit(char *arg_buffer, starpu_codelet *cl, struct starpu_task **task, va_list varg_list) {
+    int arg_type;
 	unsigned current_buffer = 0;
+	unsigned ctx = 0;
 
 	struct insert_task_cb_wrapper *cl_arg_wrapper = malloc(sizeof(struct insert_task_cb_wrapper));
 	STARPU_ASSERT(cl_arg_wrapper);
@@ -176,9 +185,13 @@ static void _starpu_prepare_task(char *arg_buffer, starpu_codelet *cl, struct st
 			int prio = va_arg(varg_list, int); 
 			(*task)->priority = prio;
 		}
-		else if (arg_type==STARPU_EXECUTE) {
+		else if (arg_type==STARPU_EXECUTE_ON_NODE) {
 			va_arg(varg_list, int);
 		}
+		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
+			va_arg(varg_list, starpu_data_handle);
+		}
+
 		else if (arg_type==STARPU_CTX) {
 			*ctx = va_arg(varg_list, unsigned);
 		}
@@ -196,16 +209,11 @@ static void _starpu_prepare_task(char *arg_buffer, starpu_codelet *cl, struct st
 	 * application's callback, if any. */
 	(*task)->callback_func = starpu_task_insert_callback_wrapper;
 	(*task)->callback_arg = cl_arg_wrapper;
-}
 
-int _starpu_insert_task_create_and_submit(char *arg_buffer, starpu_codelet *cl, struct starpu_task **task, va_list varg_list) {
-	unsigned ctx = 0;
-	_starpu_prepare_task(arg_buffer, cl, task, varg_list, &ctx);
 	 int ret = ctx == 0 ? starpu_task_submit(*task) : starpu_task_submit_to_ctx(*task, ctx);
 
 	if (STARPU_UNLIKELY(ret == -ENODEV))
-          fprintf(stderr, "No one can execute task %p wih cl %p (symbol %s)\n", *task, (*task)->cl, ((*task)->cl->model && (*task)->cl->model->symbol)?(*task)->cl->model->symbol:"none");
+          fprintf(stderr, "submission of task %p wih codelet %p failed (symbol `%s')\n", *task, (*task)->cl, ((*task)->cl->model && (*task)->cl->model->symbol)?(*task)->cl->model->symbol:"none");
 
-	STARPU_ASSERT(!ret);
         return ret;
 }