Browse Source

do not forget src/ and tests/ :]

Cédric Augonnet 16 years ago
parent
commit
41f93730d4
100 changed files with 11487 additions and 0 deletions
  1. 139 0
      src/Makefile.am
  2. 66 0
      src/common/fxt.c
  3. 205 0
      src/common/fxt.h
  4. 45 0
      src/common/hash.c
  5. 24 0
      src/common/hash.h
  6. 101 0
      src/common/htable32.c
  7. 35 0
      src/common/htable32.h
  8. 167 0
      src/common/list.h
  9. 80 0
      src/common/malloc.c
  10. 43 0
      src/common/mutex.c
  11. 151 0
      src/common/rwlock.c
  12. 40 0
      src/common/rwlock.h
  13. 144 0
      src/common/timing.c
  14. 77 0
      src/common/timing.h
  15. 227 0
      src/core/dependencies/data-concurrency.c
  16. 33 0
      src/core/dependencies/data-concurrency.h
  17. 174 0
      src/core/dependencies/htable.c
  18. 42 0
      src/core/dependencies/htable.h
  19. 266 0
      src/core/dependencies/tags.c
  20. 78 0
      src/core/dependencies/tags.h
  21. 184 0
      src/core/jobs.c
  22. 80 0
      src/core/jobs.h
  23. 21 0
      src/core/mechanisms/TODO
  24. 209 0
      src/core/mechanisms/deque_queues.c
  25. 54 0
      src/core/mechanisms/deque_queues.h
  26. 245 0
      src/core/mechanisms/fifo_queues.c
  27. 50 0
      src/core/mechanisms/fifo_queues.h
  28. 121 0
      src/core/mechanisms/priority_queues.c
  29. 43 0
      src/core/mechanisms/priority_queues.h
  30. 80 0
      src/core/mechanisms/queues.c
  31. 72 0
      src/core/mechanisms/queues.h
  32. 228 0
      src/core/mechanisms/stack_queues.c
  33. 55 0
      src/core/mechanisms/stack_queues.h
  34. 137 0
      src/core/perfmodel/perfmodel.c
  35. 95 0
      src/core/perfmodel/perfmodel.h
  36. 513 0
      src/core/perfmodel/perfmodel_history.c
  37. 225 0
      src/core/perfmodel/regression.c
  38. 28 0
      src/core/perfmodel/regression.h
  39. 226 0
      src/core/policies/deque-modeling-policy-data-aware.c
  40. 29 0
      src/core/policies/deque-modeling-policy-data-aware.h
  41. 161 0
      src/core/policies/deque-modeling-policy.c
  42. 29 0
      src/core/policies/deque-modeling-policy.h
  43. 58 0
      src/core/policies/eager-central-policy.c
  44. 27 0
      src/core/policies/eager-central-policy.h
  45. 52 0
      src/core/policies/eager-central-priority-policy.c
  46. 28 0
      src/core/policies/eager-central-priority-policy.h
  47. 57 0
      src/core/policies/no-prio-policy.c
  48. 27 0
      src/core/policies/no-prio-policy.h
  49. 121 0
      src/core/policies/random-policy.c
  50. 29 0
      src/core/policies/random-policy.h
  51. 159 0
      src/core/policies/sched_policy.c
  52. 56 0
      src/core/policies/sched_policy.h
  53. 201 0
      src/core/policies/work-stealing-policy.c
  54. 26 0
      src/core/policies/work-stealing-policy.h
  55. 392 0
      src/core/workers.c
  56. 110 0
      src/core/workers.h
  57. 68 0
      src/datawizard/Makefile
  58. 395 0
      src/datawizard/coherency.c
  59. 160 0
      src/datawizard/coherency.h
  60. 230 0
      src/datawizard/copy-driver.c
  61. 67 0
      src/datawizard/copy-driver.h
  62. 22 0
      src/datawizard/data_parameters.h
  63. 111 0
      src/datawizard/data_request.c
  64. 39 0
      src/datawizard/data_request.h
  65. 128 0
      src/datawizard/datastats.c
  66. 40 0
      src/datawizard/datastats.h
  67. 41 0
      src/datawizard/datawizard.h
  68. 45 0
      src/datawizard/footprint.c
  69. 27 0
      src/datawizard/footprint.h
  70. 327 0
      src/datawizard/hierarchy.c
  71. 28 0
      src/datawizard/hierarchy.h
  72. 29 0
      src/datawizard/interfaces/Makefile
  73. 77 0
      src/datawizard/interfaces/bcsr_filters.c
  74. 22 0
      src/datawizard/interfaces/bcsr_filters.h
  75. 491 0
      src/datawizard/interfaces/bcsr_interface.c
  76. 26 0
      src/datawizard/interfaces/bcsr_interface.h
  77. 113 0
      src/datawizard/interfaces/blas_filters.c
  78. 22 0
      src/datawizard/interfaces/blas_filters.h
  79. 413 0
      src/datawizard/interfaces/blas_interface.c
  80. 24 0
      src/datawizard/interfaces/blas_interface.h
  81. 24 0
      src/datawizard/interfaces/csc_interface.h
  82. 74 0
      src/datawizard/interfaces/csr_filters.c
  83. 22 0
      src/datawizard/interfaces/csr_filters.h
  84. 451 0
      src/datawizard/interfaces/csr_interface.c
  85. 26 0
      src/datawizard/interfaces/csr_interface.h
  86. 57 0
      src/datawizard/interfaces/data_interface.h
  87. 146 0
      src/datawizard/interfaces/vector_filters.c
  88. 22 0
      src/datawizard/interfaces/vector_filters.h
  89. 340 0
      src/datawizard/interfaces/vector_interface.c
  90. 24 0
      src/datawizard/interfaces/vector_interface.h
  91. 537 0
      src/datawizard/memalloc.c
  92. 51 0
      src/datawizard/memalloc.h
  93. 49 0
      src/datawizard/progress.c
  94. 24 0
      src/datawizard/progress.h
  95. 76 0
      src/datawizard/write_back.c
  96. 27 0
      src/datawizard/write_back.h
  97. 31 0
      src/drivers/core/Makefile
  98. 157 0
      src/drivers/core/driver_core.c
  99. 39 0
      src/drivers/core/driver_core.h
  100. 0 0
      src/drivers/cuda/Makefile

+ 139 - 0
src/Makefile.am

@@ -0,0 +1,139 @@
#
# StarPU
# Copyright (C) INRIA 2008-2009 (see AUTHORS file)
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or (at
# your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See the GNU Lesser General Public License in COPYING.LGPL for more details.
#

SUBDIRS =

lib_LTLIBRARIES = libstarpu.la

libstarpu_la_CPPFLAGS = -I$(top_srcdir)/include/

libstarpu_la_CFLAGS = -W -Wall -Wextra
# Extra libraries linked into a libtool library must be given through the
# LIBADD primary; "libstarpu_la_LIBS" is not an automake variable and is
# silently ignored, so -lm was never actually linked.
libstarpu_la_LIBADD = -lm

noinst_HEADERS = 						\
	core/dependencies/data-concurrency.h			\
	core/dependencies/tags.h				\
	core/dependencies/htable.h				\
	core/policies/eager-central-priority-policy.h		\
	core/policies/sched_policy.h				\
	core/policies/random-policy.h				\
	core/policies/eager-central-policy.h			\
	core/policies/deque-modeling-policy.h			\
	core/policies/no-prio-policy.h				\
	core/policies/deque-modeling-policy-data-aware.h	\
	core/policies/work-stealing-policy.h			\
	core/mechanisms/priority_queues.h			\
	core/mechanisms/fifo_queues.h				\
	core/mechanisms/deque_queues.h				\
	core/mechanisms/queues.h				\
	core/mechanisms/stack_queues.h				\
	core/perfmodel/perfmodel.h				\
	core/perfmodel/regression.h				\
	core/jobs.h						\
	core/workers.h						\
	datawizard/footprint.h					\
	datawizard/datawizard.h					\
	datawizard/data_request.h				\
	datawizard/hierarchy.h					\
	datawizard/progress.h					\
	datawizard/write_back.h					\
	datawizard/datastats.h					\
	datawizard/memalloc.h					\
	datawizard/data_parameters.h				\
	datawizard/copy-driver.h				\
	datawizard/coherency.h					\
	datawizard/interfaces/blas_interface.h			\
	datawizard/interfaces/csr_filters.h			\
	datawizard/interfaces/csc_interface.h			\
	datawizard/interfaces/bcsr_filters.h			\
	datawizard/interfaces/bcsr_interface.h			\
	datawizard/interfaces/data_interface.h			\
	datawizard/interfaces/vector_filters.h			\
	datawizard/interfaces/vector_interface.h		\
	datawizard/interfaces/blas_filters.h			\
	datawizard/interfaces/csr_interface.h			\
	common/hash.h						\
	common/timing.h						\
	common/htable32.h					\
	common/list.h						\
	common/rwlock.h						\
	common/fxt.h						\
	drivers/core/driver_core.h				\
	drivers/gordon/driver_gordon.h				\
	drivers/gordon/gordon_interface.h			\
	drivers/cuda/driver_cuda.h				\
	drivers/cuda/comp_cuda.h

# NOTE(review): core/mechanisms/stack_queues.c ships with this tree (and its
# header is installed above) but is not listed here -- confirm whether it
# should be compiled into libstarpu.
libstarpu_la_SOURCES = 						\
	common/malloc.c						\
	common/hash.c 						\
	common/htable32.c					\
	common/mutex.c						\
	common/rwlock.c						\
	common/timing.c						\
	core/jobs.c						\
	core/workers.c						\
	core/dependencies/tags.c				\
	core/dependencies/htable.c				\
	core/dependencies/data-concurrency.c			\
	core/mechanisms/queues.c				\
	core/mechanisms/deque_queues.c				\
	core/mechanisms/priority_queues.c			\
	core/mechanisms/fifo_queues.c				\
	core/perfmodel/perfmodel_history.c			\
	core/perfmodel/perfmodel.c				\
	core/perfmodel/regression.c				\
	core/policies/no-prio-policy.c				\
	core/policies/eager-central-policy.c			\
	core/policies/eager-central-priority-policy.c		\
	core/policies/work-stealing-policy.c			\
	core/policies/sched_policy.c				\
	core/policies/deque-modeling-policy.c			\
	core/policies/random-policy.c				\
	core/policies/deque-modeling-policy-data-aware.c	\
	datawizard/write_back.c					\
	datawizard/coherency.c					\
	datawizard/data_request.c				\
	datawizard/progress.c					\
	datawizard/copy-driver.c				\
	datawizard/hierarchy.c					\
	datawizard/memalloc.c					\
	datawizard/footprint.c					\
	datawizard/datastats.c					\
	datawizard/interfaces/bcsr_interface.c			\
	datawizard/interfaces/csr_interface.c			\
	datawizard/interfaces/blas_filters.c			\
	datawizard/interfaces/blas_interface.c			\
	datawizard/interfaces/vector_interface.c		\
	datawizard/interfaces/bcsr_filters.c			\
	datawizard/interfaces/csr_filters.c			\
	datawizard/interfaces/vector_filters.c

if USE_CPU
libstarpu_la_SOURCES += drivers/core/driver_core.c
endif

if USE_CUDA
libstarpu_la_SOURCES += drivers/cuda/driver_cuda.c
endif

if USE_GORDON
libstarpu_la_SOURCES += drivers/gordon/driver_gordon.c
endif

if USE_FXT
libstarpu_la_SOURCES += common/fxt.c
endif

+ 66 - 0
src/common/fxt.c

@@ -0,0 +1,66 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#include <stdarg.h>
#include <stdio.h>
#include <string.h>

#include <common/fxt.h>
+
+#define PROF_BUFFER_SIZE  (8*1024*1024)
+
+static char PROF_FILE_USER[128];
+static int fxt_started = 0;
+
/* Flush and close the FxT trace buffer, writing it to the file whose name
 * was recorded in PROF_FILE_USER by profile_set_tracefile().
 * Registered via atexit() in start_fxt_profiling(). */
void profile_stop(void)
{
	fut_endup(PROF_FILE_USER);
}
+
+void profile_set_tracefile(char *fmt, ...)
+{
+	va_list vl;
+	
+	va_start(vl, fmt);
+	vsprintf(PROF_FILE_USER, fmt, vl);
+	va_end(vl);
+	strcat(PROF_FILE_USER, "_user_");
+}
+
+
/* Initialize FxT tracing for the calling thread: pick a default trace file
 * location on first use, set up the event buffer, and enable all probe
 * key masks. profile_stop() is registered so the trace is flushed on exit. */
void start_fxt_profiling(void)
{
	unsigned threadid;

	/* Only install the default trace file once, so a location chosen
	 * earlier through profile_set_tracefile() is preserved. */
	if (!fxt_started) {
		fxt_started = 1;
		profile_set_tracefile("/tmp/prof_file");
	}

	/* Linux-specific: kernel thread id of the calling thread. */
	threadid = syscall(SYS_gettid);

	/* NOTE(review): atexit() runs on every invocation, so calling this
	 * from several threads registers profile_stop() multiple times --
	 * confirm whether that (and repeated fut_setup()) is intended. */
	atexit(profile_stop);

	if(fut_setup(PROF_BUFFER_SIZE, FUT_KEYMASKALL, threadid) < 0) {
		perror("fut_setup");
		STARPU_ASSERT(0);
	}

	fut_keychange(FUT_ENABLE, FUT_KEYMASKALL, threadid);

	return;
}
+
/* Emit a "new worker thread" trace event binding the worker's core id to
 * the kernel thread id of the calling thread. */
void fxt_register_thread(unsigned coreid)
{
	FUT_DO_PROBE2(FUT_NEW_LWP_CODE, coreid, syscall(SYS_gettid));
}

+ 205 - 0
src/common/fxt.h

@@ -0,0 +1,205 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __FXT_H__
+#define __FXT_H__
+
+
+#define _GNU_SOURCE  /* ou _BSD_SOURCE ou _SVID_SOURCE */
+#include <unistd.h>
+#include <sys/syscall.h> /* pour les définitions de SYS_xxx */
+
+#include <string.h>
+#include <sys/types.h>
+#include <stdlib.h>
+#include <common/config.h>
+#include <starpu.h>
+
+/* some key to identify the worker kind */
+#define FUT_APPS_KEY	0x100
+#define FUT_CORE_KEY	0x101
+#define FUT_CUDA_KEY	0x102
+
+#define	FUT_NEW_WORKER_KEY	0x5102
+#define	FUT_START_CODELET_BODY	0x5103
+#define	FUT_END_CODELET_BODY	0x5104
+
+#define FUT_JOB_PUSH		0x5105
+#define FUT_JOB_POP		0x5106
+
+#define FUT_START_FETCH_INPUT	0x5107
+#define FUT_END_FETCH_INPUT	0x5108
+#define FUT_START_PUSH_OUTPUT	0x5109
+#define FUT_END_PUSH_OUTPUT	0x5110
+
+#define FUT_CODELET_TAG		0x5111
+#define FUT_CODELET_TAG_DEPS	0x5112
+
+#define FUT_DATA_COPY		0x5113
+#define FUT_WORK_STEALING	0x5114
+
+#define	FUT_WORKER_TERMINATED	0x5115
+
+#define FUT_USER_DEFINED_START	0x5116
+#define FUT_USER_DEFINED_END	0x5117
+
+#define	FUT_NEW_MEM_NODE	0x5118
+
+#define	FUT_START_CALLBACK	0x5119
+#define	FUT_END_CALLBACK	0x5120
+
+#define	FUT_TASK_DONE		0x5121
+
+#define	FUT_START_ALLOC		0x5122
+#define	FUT_END_ALLOC		0x5123
+
+#define	FUT_START_ALLOC_REUSE	0x5128
+#define	FUT_END_ALLOC_REUSE	0x5129
+
+#define	FUT_START_MEMRECLAIM	0x5124
+#define	FUT_END_MEMRECLAIM	0x5125
+
+#define	FUT_START_DRIVER_COPY	0x5126
+#define	FUT_END_DRIVER_COPY	0x5127
+
+
+#ifdef USE_FXT
+#include <fxt/fxt.h>
+#include <fxt/fut.h>
+
+void start_fxt_profiling(void);
+void fxt_register_thread(unsigned);
+
+/* workerkind = FUT_CORE_KEY for instance */
+#define TRACE_NEW_MEM_NODE(nodeid)	\
+	FUT_DO_PROBE2(FUT_NEW_MEM_NODE, nodeid, syscall(SYS_gettid));
+
+#define TRACE_NEW_WORKER(workerkind,memnode)	\
+	FUT_DO_PROBE3(FUT_NEW_WORKER_KEY, workerkind, memnode, syscall(SYS_gettid));
+
+#define TRACE_START_CODELET_BODY(job)	\
+	FUT_DO_PROBE2(FUT_START_CODELET_BODY, job, syscall(SYS_gettid));
+
+#define TRACE_END_CODELET_BODY(job)	\
+	FUT_DO_PROBE2(FUT_END_CODELET_BODY, job, syscall(SYS_gettid));
+
+#define TRACE_START_CALLBACK(job)	\
+	FUT_DO_PROBE2(FUT_START_CALLBACK, job, syscall(SYS_gettid));
+
+#define TRACE_END_CALLBACK(job)	\
+	FUT_DO_PROBE2(FUT_END_CALLBACK, job, syscall(SYS_gettid));
+
+#define TRACE_JOB_PUSH(task, prio)	\
+	FUT_DO_PROBE3(FUT_JOB_PUSH, task, prio, syscall(SYS_gettid));
+
+#define TRACE_JOB_POP(task, prio)	\
+	FUT_DO_PROBE3(FUT_JOB_POP, task, prio, syscall(SYS_gettid));
+
+#define TRACE_START_FETCH_INPUT(job)	\
+	FUT_DO_PROBE2(FUT_START_FETCH_INPUT, job, syscall(SYS_gettid));
+
+#define TRACE_END_FETCH_INPUT(job)	\
+	FUT_DO_PROBE2(FUT_END_FETCH_INPUT, job, syscall(SYS_gettid));
+
+#define TRACE_START_PUSH_OUTPUT(job)	\
+	FUT_DO_PROBE2(FUT_START_PUSH_OUTPUT, job, syscall(SYS_gettid));
+
+#define TRACE_END_PUSH_OUTPUT(job)	\
+	FUT_DO_PROBE2(FUT_END_PUSH_OUTPUT, job, syscall(SYS_gettid));
+
+#define TRACE_CODELET_TAG(tag, job)	\
+	FUT_DO_PROBE2(FUT_CODELET_TAG, tag, job)
+
+#define TRACE_CODELET_TAG_DEPS(tag_child, tag_father)	\
+	FUT_DO_PROBE2(FUT_CODELET_TAG_DEPS, tag_child, tag_father)
+
+#define TRACE_TASK_DONE(tag)	\
+	FUT_DO_PROBE2(FUT_TASK_DONE, tag, syscall(SYS_gettid))
+
+#define TRACE_DATA_COPY(src_node, dst_node, size)	\
+	FUT_DO_PROBE3(FUT_DATA_COPY, src_node, dst_node, size)
+
+#define TRACE_START_DRIVER_COPY(src_node, dst_node, size, com_id)	\
+	FUT_DO_PROBE4(FUT_START_DRIVER_COPY, src_node, dst_node, size, com_id)
+
+#define TRACE_END_DRIVER_COPY(src_node, dst_node, size, com_id)	\
+	FUT_DO_PROBE4(FUT_END_DRIVER_COPY, src_node, dst_node, size, com_id)
+
+#define TRACE_WORK_STEALING(empty_q, victim_q)		\
+	FUT_DO_PROBE2(FUT_WORK_STEALING, empty_q, victim_q)
+
+#define TRACE_WORKER_TERMINATED(workerkind)	\
+	FUT_DO_PROBE2(FUT_WORKER_TERMINATED, workerkind, syscall(SYS_gettid));
+
+#define TRACE_USER_DEFINED_START	\
+	FUT_DO_PROBE1(FUT_USER_DEFINED_START, syscall(SYS_gettid));
+
+#define TRACE_USER_DEFINED_END		\
+	FUT_DO_PROBE1(FUT_USER_DEFINED_END, syscall(SYS_gettid));
+
+#define TRACE_START_ALLOC(memnode)		\
+	FUT_DO_PROBE2(FUT_START_ALLOC, memnode, syscall(SYS_gettid));
+	
+#define TRACE_END_ALLOC(memnode)		\
+	FUT_DO_PROBE2(FUT_END_ALLOC, memnode, syscall(SYS_gettid));
+
+#define TRACE_START_ALLOC_REUSE(memnode)		\
+	FUT_DO_PROBE2(FUT_START_ALLOC_REUSE, memnode, syscall(SYS_gettid));
+	
+#define TRACE_END_ALLOC_REUSE(memnode)		\
+	FUT_DO_PROBE2(FUT_END_ALLOC_REUSE, memnode, syscall(SYS_gettid));
+	
+#define TRACE_START_MEMRECLAIM(memnode)		\
+	FUT_DO_PROBE2(FUT_START_MEMRECLAIM, memnode, syscall(SYS_gettid));
+	
+#define TRACE_END_MEMRECLAIM(memnode)		\
+	FUT_DO_PROBE2(FUT_END_MEMRECLAIM, memnode, syscall(SYS_gettid));
+	
+
+#else // !USE_FXT
+
+#define TRACE_NEW_MEM_NODE(nodeid)	do {} while(0);
+#define TRACE_NEW_WORKER(a,b)		do {} while(0);
+#define TRACE_START_CODELET_BODY(job)	do {} while(0);
+#define TRACE_END_CODELET_BODY(job)	do {} while(0);
+#define TRACE_START_CALLBACK(job)	do {} while(0);
+#define TRACE_END_CALLBACK(job)		do {} while(0);
+#define TRACE_JOB_PUSH(task, prio)	do {} while(0);
+#define TRACE_JOB_POP(task, prio)	do {} while(0);
+#define TRACE_START_FETCH_INPUT(job)	do {} while(0);
+#define TRACE_END_FETCH_INPUT(job)	do {} while(0);
+#define TRACE_START_PUSH_OUTPUT(job)	do {} while(0);
+#define TRACE_END_PUSH_OUTPUT(job)	do {} while(0);
+#define TRACE_CODELET_TAG(tag, job)	do {} while(0);
+#define TRACE_CODELET_TAG_DEPS(a, b)	do {} while(0);
+#define TRACE_TASK_DONE(tag)		do {} while(0);
+#define TRACE_DATA_COPY(a, b, c)	do {} while(0);
+#define TRACE_START_DRIVER_COPY(a,b,c,d)	do {} while(0);
+#define TRACE_END_DRIVER_COPY(a,b,c,d)	do {} while(0);
+#define TRACE_WORK_STEALING(a, b)	do {} while(0);
+#define TRACE_WORKER_TERMINATED(a)	do {} while(0);
+#define TRACE_USER_DEFINED_START	do {} while(0);
+#define TRACE_USER_DEFINED_END		do {} while(0);
+#define TRACE_START_ALLOC(memnode)	do {} while(0);
+#define TRACE_END_ALLOC(memnode)	do {} while(0);
+#define TRACE_START_ALLOC_REUSE(a)	do {} while(0);
+#define TRACE_END_ALLOC_REUSE(a)	do {} while(0);
+#define TRACE_START_MEMRECLAIM(memnode)	do {} while(0);
+#define TRACE_END_MEMRECLAIM(memnode)	do {} while(0);
+
+#endif // USE_FXT
+
+#endif // __FXT_H__

+ 45 - 0
src/common/hash.c

@@ -0,0 +1,45 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <common/hash.h>
+
#define CRC32C_POLY_BE 0x1EDC6F41

/* Fold one byte into the running CRC, MSB-first (big-endian bit order),
 * using the CRC-32C (Castagnoli) polynomial. */
static inline uint32_t __attribute__ ((pure)) crc32_be_8(uint8_t inputbyte, uint32_t inputcrc)
{
	unsigned i;
	uint32_t crc;

	/* Cast before shifting: a plain (inputbyte << 24) promotes to a
	 * signed int and may shift into the sign bit for bytes >= 0x80,
	 * which is undefined behavior for signed left shifts. */
	crc = inputcrc ^ ((uint32_t)inputbyte << 24);
	for (i = 0; i < 8; i++)
		crc = (crc << 1) ^ ((crc & 0x80000000) ? CRC32C_POLY_BE : 0);

	return crc;
}

/* Fold the 4 bytes of 'input' into 'inputcrc' and return the updated CRC.
 * Bytes are consumed in host memory order, so the value is
 * endianness-dependent; this is fine for in-process hashing but not for
 * data exchanged between machines. */
uint32_t crc32_be(uint32_t input, uint32_t inputcrc)
{
	uint8_t *p = (uint8_t *)&input;

	uint32_t crc = inputcrc;

	crc = crc32_be_8(p[0], crc);
	crc = crc32_be_8(p[1], crc);
	crc = crc32_be_8(p[2], crc);
	crc = crc32_be_8(p[3], crc);

	return crc;
}

+ 24 - 0
src/common/hash.h

@@ -0,0 +1,24 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#ifndef __HASH_H__
#define __HASH_H__

#include <stdint.h>

/* Fold the 4 bytes of 'input' into the running CRC value 'inputcrc'
 * (CRC-32C polynomial 0x1EDC6F41, MSB-first) and return the updated CRC.
 * Bytes are taken in host memory order, so the result depends on the
 * machine's endianness -- suitable for in-process hashing only. */
uint32_t crc32_be(uint32_t input, uint32_t inputcrc);

#endif // __HASH_H__

+ 101 - 0
src/common/htable32.c

@@ -0,0 +1,101 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <common/htable32.h>
+#include <stdint.h>
+#include <string.h>
+
+void *htbl_search_32(struct starpu_htbl32_node_s *htbl, uint32_t key)
+{
+	unsigned currentbit;
+	unsigned keysize = 32;
+
+	htbl32_node_t *current_htbl = htbl;
+
+	/* 000000000001111 with HTBL_NODE_SIZE 1's */
+	uint32_t mask = (1<<HTBL32_NODE_SIZE)-1;
+
+	for(currentbit = 0; currentbit < keysize; currentbit+=HTBL32_NODE_SIZE)
+	{
+	
+	//	printf("search : current bit = %d \n", currentbit);
+		if (current_htbl == NULL)
+			return NULL;
+
+		/* 0000000000001111 
+		 *     | currentbit
+		 * 0000111100000000 = offloaded_mask
+		 *         |last_currentbit
+		 * */
+
+		unsigned last_currentbit = 
+			keysize - (currentbit + HTBL32_NODE_SIZE);
+		uint32_t offloaded_mask = mask << last_currentbit;
+		unsigned current_index = 
+			(key & (offloaded_mask)) >> (last_currentbit);
+
+		current_htbl = current_htbl->children[current_index];
+	}
+
+	return current_htbl;
+}
+
+/*
+ * returns the previous value of the tag, or NULL else
+ */
+
+void *htbl_insert_32(struct starpu_htbl32_node_s **htbl, uint32_t key, void *entry)
+{
+	unsigned currentbit;
+	unsigned keysize = 32;
+
+	htbl32_node_t **current_htbl_ptr = htbl;
+
+	/* 000000000001111 with HTBL_NODE_SIZE 1's */
+	uint32_t mask = (1<<HTBL32_NODE_SIZE)-1;
+
+	for(currentbit = 0; currentbit < keysize; currentbit+=HTBL32_NODE_SIZE)
+	{
+		//printf("insert : current bit = %d \n", currentbit);
+		if (*current_htbl_ptr == NULL) {
+			/* TODO pad to change that 1 into 16 ? */
+			*current_htbl_ptr = calloc(sizeof(htbl32_node_t), 1);
+			assert(*current_htbl_ptr);
+		}
+
+		/* 0000000000001111 
+		 *     | currentbit
+		 * 0000111100000000 = offloaded_mask
+		 *         |last_currentbit
+		 * */
+
+		unsigned last_currentbit = 
+			keysize - (currentbit + HTBL32_NODE_SIZE);
+		uint32_t offloaded_mask = mask << last_currentbit;
+		unsigned current_index = 
+			(key & (offloaded_mask)) >> (last_currentbit);
+
+		current_htbl_ptr = 
+			&((*current_htbl_ptr)->children[current_index]);
+	}
+
+	/* current_htbl either contains NULL or a previous entry 
+	 * we overwrite it anyway */
+	void *old_entry = *current_htbl_ptr;
+	*current_htbl_ptr = entry;
+
+	return old_entry;
+}

+ 35 - 0
src/common/htable32.h

@@ -0,0 +1,35 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#ifndef __GENERIC_HTABLE_H__
#define __GENERIC_HTABLE_H__

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

/* Number of key bits consumed per tree level: a 32-bit key is resolved in
 * 32/HTBL32_NODE_SIZE = 2 levels. */
#define HTBL32_NODE_SIZE	16

/* Node of a radix tree indexed by 32-bit keys. Each node holds
 * 2^HTBL32_NODE_SIZE = 65536 child pointers (512 KiB per node on a 64-bit
 * machine), trading memory for constant-depth lookups. At the last level
 * the "child" slots store the user entries themselves. */
typedef struct starpu_htbl32_node_s {
	unsigned nentries;	/* NOTE(review): never updated in htable32.c -- confirm whether it is maintained elsewhere */
	struct starpu_htbl32_node_s *children[1<<HTBL32_NODE_SIZE];
} htbl32_node_t;

/* Return the entry stored under 'key', or NULL if none. */
void *htbl_search_32(struct starpu_htbl32_node_s *htbl, uint32_t key);
/* Store 'entry' under 'key' (allocating intermediate nodes as needed);
 * return the previous entry, or NULL if there was none. */
void *htbl_insert_32(struct starpu_htbl32_node_s **htbl, uint32_t key, void *entry);

#endif // __GENERIC_HTABLE_H__

+ 167 - 0
src/common/list.h

@@ -0,0 +1,167 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/** @file
+ * @brief Listes doublement chainées automatiques
+ */
+
+
+/** @remarks list how-to
+ * *********************************************************
+ * LIST_TYPE(FOO, contenu);
+ *  - déclare les types suivants
+ *      + pour les cellules : FOO_t
+ *      + pour les listes : FOO_list_t
+ *      + pour les itérateurs : FOO_itor_t
+ *  - déclare les accesseurs suivants :
+ *     * création d'une cellule 
+ *   FOO_t      FOO_new(void);  
+ *     * suppression d'une cellule
+ *   void       FOO_delete(FOO_t); 
+ *     * création d'une liste (vide)
+ *   FOO_list_t FOO_list_new(void);
+ *     * suppression d'une liste
+ *   void       FOO_list_delete(FOO_list_t);
+ *     * teste si une liste est vide
+ *   int        FOO_list_empty(FOO_list_t);
+ *     * retire un élément de la liste
+ *   void       FOO_list_erase(FOO_list_t, FOO_t);
+ *     * ajoute une élément en queue de liste
+ *   void       FOO_list_push_back(FOO_list_t, FOO_t);
+ *     * ajoute un élément en tête de list
+ *   void       FOO_list_push_front(FOO_list_t, FOO_t);
+ *     * retire l'élément en queue de liste
+ *   FOO_t      FOO_list_pop_back(FOO_list_t);
+ *     * retire l'élement en tête de liste
+ *   FOO_t      FOO_list_pop_front(FOO_list_t);
+ *     * retourne l'élément en queue de liste
+ *   FOO_t      FOO_list_back(FOO_list_t);
+ *     * retourne l'élement en tête de liste
+ *   FOO_t      FOO_list_front(FOO_list_t);
+ *     * vérifie si la liste chainée est cohérente
+ *   int	FOO_list_check(FOO_list_t);
+ * *********************************************************
+ * Exemples d'utilisation :
+ *  - au départ, on a :
+ *    struct ma_structure_s
+ *    {
+ *      int a;
+ *      int b;
+ *    };
+ *  - on veut en faire une liste. On remplace la déclaration par :
+ *    LIST_TYPE(ma_structure,
+ *      int a;
+ *      int b;
+ *    );
+ *    qui crée les types ma_structure_t et ma_structure_list_t.
+ *  - allocation d'une liste vide :
+ *  ma_structure_list_t l = ma_structure_list_new();
+ *  - ajouter un élément 'e' en tête de la liste 'l' :
+ *  ma_structure_t e = ma_structure_new();
+ *  e->a = 0;
+ *  e->b = 1;
+ *  ma_structure_list_push_front(l, e);
+ *  - itérateur de liste :
+ *  ma_structure_itor_t i;
+ *  for(i  = ma_structure_list_begin(l);
+ *      i != ma_structure_list_end(l);
+ *      i  = ma_structure_list_next(i))
+ *  {
+ *    printf("a=%d; b=%d\n", i->a, i->b);
+ *  }
+ * *********************************************************
+ */
+
+
+
/**@hideinitializer
 * Generates a new type for lists of elements: declares the cell type
 * ENAME##_t, the list type ENAME##_list_t and the iterator type
 * ENAME##_itor_t, plus inline accessors (new/delete, push/pop front/back,
 * erase, begin/end/next, empty/size/check).
 * NOTE(review): the operations perform no emptiness checks -- popping or
 * erasing from an empty list dereferences NULL. */
#define LIST_TYPE(ENAME, DECL) \
  LIST_DECLARE_TYPE(ENAME) \
  LIST_CREATE_TYPE(ENAME, DECL)

/**@hideinitializer
 * Forward type declarations for lists: the cell, list and iterator types
 * are all pointers to incomplete structs at this point. */
#define LIST_DECLARE_TYPE(ENAME) \
  /** automatic type: ENAME##_list_t is a list of ENAME##_t */ \
  typedef struct ENAME##_list_s* ENAME##_list_t; \
  /** automatic type: defines ENAME##_t */ \
  typedef struct ENAME##_s* ENAME##_t; \
  /** automatic type: ENAME##_itor_t is an iterator on lists of ENAME##_t */ \
  typedef ENAME##_t ENAME##_itor_t;

/**@hideinitializer
 * The effective type declaration for lists: each cell embeds _prev/_next
 * links ahead of the user fields DECL; the list head keeps _head/_tail
 * pointers, with NULL marking both list ends. */
#define LIST_CREATE_TYPE(ENAME, DECL) \
  /** from automatic type: ENAME##_t */ \
  struct ENAME##_s \
  { \
    struct ENAME##_s*_prev; /**< @internal previous cell */ \
    struct ENAME##_s*_next; /**< @internal next cell */ \
    DECL \
  }; \
  /** @internal */ \
  struct ENAME##_list_s \
  { \
    struct ENAME##_s* _head; /**< @internal head of the list */ \
    struct ENAME##_s* _tail; /**< @internal tail of the list */ \
  }; \
  /** @internal */static inline ENAME##_t ENAME##_new(void) \
    { ENAME##_t e = (ENAME##_t)malloc(sizeof(struct ENAME##_s)); \
      e->_next = NULL; e->_prev = NULL; return e; } \
  /** @internal */static inline void ENAME##_delete(ENAME##_t e) \
    { free(e); } \
  /** @internal */static inline void ENAME##_list_push_front(ENAME##_list_t l, ENAME##_t e) \
    { if(l->_tail == NULL) l->_tail = e; else l->_head->_prev = e; \
      e->_prev = NULL; e->_next = l->_head; l->_head = e; } \
  /** @internal */static inline void ENAME##_list_push_back(ENAME##_list_t l, ENAME##_t e) \
    { if(l->_head == NULL) l->_head = e; else l->_tail->_next = e; \
      e->_next = NULL; e->_prev = l->_tail; l->_tail = e; } \
  /** @internal */static inline ENAME##_t ENAME##_list_front(ENAME##_list_t l) \
    { return l->_head; } \
  /** @internal */static inline ENAME##_t ENAME##_list_back(ENAME##_list_t l) \
    { return l->_tail; } \
  /** @internal */static inline ENAME##_list_t ENAME##_list_new(void) \
    { ENAME##_list_t l; l=(ENAME##_list_t)malloc(sizeof(struct ENAME##_list_s)); \
      l->_head=NULL; l->_tail=l->_head; return l; } \
  /** @internal */static inline int ENAME##_list_empty(ENAME##_list_t l) \
    { return (l->_head == NULL); } \
  /** @internal */static inline void ENAME##_list_delete(ENAME##_list_t l) \
    { free(l); } \
  /** @internal */static inline void ENAME##_list_erase(ENAME##_list_t l, ENAME##_t c) \
    { ENAME##_t p = c->_prev; if(p) p->_next = c->_next; else l->_head = c->_next; \
      if(c->_next) c->_next->_prev = p; else l->_tail = p; } \
  /** @internal */static inline ENAME##_t ENAME##_list_pop_front(ENAME##_list_t l) \
    { ENAME##_t e = ENAME##_list_front(l); \
      ENAME##_list_erase(l, e); return e; } \
  /** @internal */static inline ENAME##_t ENAME##_list_pop_back(ENAME##_list_t l) \
    { ENAME##_t e = ENAME##_list_back(l); \
      ENAME##_list_erase(l, e); return e; } \
  /** @internal */static inline ENAME##_itor_t ENAME##_list_begin(ENAME##_list_t l) \
    { return l->_head; } \
  /** @internal */static inline ENAME##_itor_t ENAME##_list_end(ENAME##_list_t l __attribute__ ((unused))) \
    { return NULL; } \
  /** @internal */static inline ENAME##_itor_t ENAME##_list_next(ENAME##_itor_t i) \
    { return i->_next; } \
  /** @internal */static inline int ENAME##_list_size(ENAME##_list_t l) \
    { ENAME##_itor_t i=l->_head; int k=0; while(i!=NULL){k++;i=i->_next;} return k; } \
  /** @internal */static inline int ENAME##_list_check(ENAME##_list_t l) \
    { ENAME##_itor_t i=l->_head; while(i) \
    { if ((i->_next == NULL) && i != l->_tail) return 0; \
      if (i->_next == i) return 0; \
      i=i->_next;} return 1; }
+
+

+ 80 - 0
src/common/malloc.c

@@ -0,0 +1,80 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <errno.h>
+
+#include <core/workers.h>
+#include <common/config.h>
+#include <starpu.h>
+
+#ifdef USE_CUDA
+#include <cuda.h>
+#endif
+
+/* This method is not optimal at all, but it makes life much easier in many codes */
+
+#ifdef USE_CUDA
+struct data_interface_s;
+
+struct malloc_pinned_codelet_struct {
+	float **ptr;
+	size_t dim;
+};
+
+static void malloc_pinned_codelet(struct data_interface_s *buffers __attribute__((unused)), void *arg)
+{
+	struct malloc_pinned_codelet_struct *s = arg;
+
+	cuMemAllocHost((void **)(s->ptr), s->dim);
+}
+#endif
+
/* Allocate "dim" bytes and store the pointer through A.  When CUDA
 * support is compiled in and a CUDA worker may run tasks, the buffer is
 * allocated as pinned host memory (through a synchronous CUBLAS task);
 * otherwise a plain malloc() is used.
 *
 * Fix: the previous version tested may_submit_cuda_task() outside the
 * #ifdef USE_CUDA guard, so when built without CUDA support the taken
 * branch was empty and *A was left uninitialized.  The fallback now
 * runs whenever the CUDA path is unavailable. */
void starpu_malloc_pinned_if_possible(float **A, size_t dim)
{
#ifdef USE_CUDA
	if (may_submit_cuda_task())
	{
		int push_res;

		struct malloc_pinned_codelet_struct s = {
			.ptr = A,
			.dim = dim
		};

		starpu_codelet *cl = malloc(sizeof(*cl));
		STARPU_ASSERT(cl);
		cl->cublas_func = malloc_pinned_codelet;
		cl->where = CUBLAS;
		cl->model = NULL;
		cl->nbuffers = 0;

		struct starpu_task *task = starpu_task_create();
		task->callback_func = NULL;
		task->cl = cl;
		task->cl_arg = &s;

		/* wait for the allocation to be performed */
		task->synchronous = 1;

		push_res = starpu_submit_task(task);
		STARPU_ASSERT(push_res != -ENODEV);

		free(cl);
		free(task);

		return;
	}
#endif
	/* no CUDA worker (or CUDA support not compiled in): regular malloc */
	*A = malloc(dim);
}

+ 43 - 0
src/common/mutex.c

@@ -0,0 +1,43 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu-mutex.h>
+
+void init_mutex(starpu_mutex *m)
+{
+	/* this is free at first */
+	m->taken = 0;
+}
+
+inline int take_mutex_try(starpu_mutex *m)
+{
+	uint32_t prev;
+	prev = __sync_lock_test_and_set(&m->taken, 1);
+	return (prev == 0)?0:-1;
+}
+
+inline void take_mutex(starpu_mutex *m)
+{
+	uint32_t prev;
+	do {
+		prev = __sync_lock_test_and_set(&m->taken, 1);
+	} while (prev);
+}
+
+inline void release_mutex(starpu_mutex *m)
+{
+	m->taken = 0;
+}

+ 151 - 0
src/common/rwlock.c

@@ -0,0 +1,151 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/**
+ * A dummy implementation of a rw_lock using spinlocks ...
+ */ 
+
+#include "rwlock.h"
+
+static void _take_busy_lock(rw_lock *lock)
+{
+	uint32_t prev;
+	do {
+		prev = __sync_lock_test_and_set(&lock->busy, 1);
+	} while (prev);
+}
+
+static void _release_busy_lock(rw_lock *lock)
+{
+	lock->busy = 0;
+}
+
+void init_rw_lock(rw_lock *lock)
+{
+	STARPU_ASSERT(lock);
+
+	lock->writer = 0;
+	lock->readercnt = 0;
+	lock->busy = 0;
+}
+
+
+int take_rw_lock_write_try(rw_lock *lock)
+{
+	_take_busy_lock(lock);
+	
+	if (lock->readercnt > 0 || lock->writer)
+	{
+		/* fail to take the lock */
+		_release_busy_lock(lock);
+		return -1;
+	}
+	else {
+		STARPU_ASSERT(lock->readercnt == 0);
+		STARPU_ASSERT(lock->writer == 0);
+
+		/* no one was either writing nor reading */
+		lock->writer = 1;
+		_release_busy_lock(lock);
+		return 0;
+	}
+}
+
+int take_rw_lock_read_try(rw_lock *lock)
+{
+	_take_busy_lock(lock);
+
+	if (lock->writer)
+	{
+		/* there is a writer ... */
+		_release_busy_lock(lock);
+		return -1;
+	}
+	else {
+		STARPU_ASSERT(lock->writer == 0);
+
+		/* no one is writing */
+		/* XXX check wrap arounds ... */
+		lock->readercnt++;
+		_release_busy_lock(lock);
+
+		return 0;
+	}
+}
+
+
+
+void take_rw_lock_write(rw_lock *lock)
+{
+	do {
+		_take_busy_lock(lock);
+		
+		if (lock->readercnt > 0 || lock->writer)
+		{
+			/* fail to take the lock */
+			_release_busy_lock(lock);
+		}
+		else {
+			STARPU_ASSERT(lock->readercnt == 0);
+			STARPU_ASSERT(lock->writer == 0);
+	
+			/* no one was either writing nor reading */
+			lock->writer = 1;
+			_release_busy_lock(lock);
+			return;
+		}
+	} while (1);
+}
+
+void take_rw_lock_read(rw_lock *lock)
+{
+	do {
+		_take_busy_lock(lock);
+
+		if (lock->writer)
+		{
+			/* there is a writer ... */
+			_release_busy_lock(lock);
+		}
+		else {
+			STARPU_ASSERT(lock->writer == 0);
+
+			/* no one is writing */
+			/* XXX check wrap arounds ... */
+			lock->readercnt++;
+			_release_busy_lock(lock);
+
+			return;
+		}
+	} while (1);
+}
+
+void release_rw_lock(rw_lock *lock)
+{
+	_take_busy_lock(lock);
+	/* either writer or reader (exactly one !) */
+	if (lock->writer) 
+	{
+		STARPU_ASSERT(lock->readercnt == 0);
+		lock->writer = 0;
+	}
+	else {
+		/* reading mode */
+		STARPU_ASSERT(lock->writer == 0);
+		lock->readercnt--;
+	}
+	_release_busy_lock(lock);
+}

+ 40 - 0
src/common/rwlock.h

@@ -0,0 +1,40 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __RWLOCKS_H__
+#define __RWLOCKS_H__
+
+#include <stdint.h>
+#include <starpu.h>
+
/* A spinlock-based reader/writer lock: the "busy" flag serializes all
 * updates to the two fields below and is only held for a handful of
 * instructions. */
typedef struct rw_lock_t {
	uint32_t busy;      /* internal spinlock protecting this struct */
	uint8_t writer;     /* 1 while a writer holds the lock */
	uint16_t readercnt; /* number of readers currently inside
	                     * (NOTE(review): only 16 bits and the .c file
	                     * says "XXX check wrap arounds" — confirm the
	                     * reader count can never exceed 65535) */
} rw_lock;

/* Initialize the lock to the free state; must be called before use. */
void init_rw_lock(rw_lock *lock);
/* Blocking acquisitions: spin until the lock is obtained. */
void take_rw_lock_write(rw_lock *lock);
void take_rw_lock_read(rw_lock *lock);
/* Non-blocking attempts: return 0 on success, -1 on failure. */
int take_rw_lock_write_try(rw_lock *lock);
int take_rw_lock_read_try(rw_lock *lock);
/* Release the lock, whether held in read or write mode. */
void release_rw_lock(rw_lock *lock);
+
+///* make sure to have the lock before using that function */
+//inline uint8_t rw_lock_is_writer(rw_lock *lock);
+//unsigned is_rw_lock_referenced(rw_lock *lock);
+
+#endif

+ 144 - 0
src/common/timing.c

@@ -0,0 +1,144 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "timing.h"
+
+#ifdef UNRELIABLETICKS
+
/* Difference between two gettimeofday()-based ticks, in microseconds.
 * Fix: the previous expression was mis-parenthesized as
 *   (t2_usecs) + - ((t1).tv.tv_sec*1e6) + (t1).tv.tv_usec
 * i.e. it ADDED t1's microsecond part instead of subtracting it. */
#define TICK_RAW_DIFF(t1, t2) (((t2).tv.tv_sec*1e6 + (t2).tv.tv_usec) \
				- ((t1).tv.tv_sec*1e6 + (t1).tv.tv_usec))
/* Same difference, minus the measurement overhead found at calibration. */
#define TICK_DIFF(t1, t2) (TICK_RAW_DIFF(t1, t2) - residual)
#define TIMING_DELAY(t1, t2) tick2usec(TICK_DIFF(t1, t2))

static double scale = 0.0;              /* tick -> usec factor, set by timing_init() */
static unsigned long long residual = 0; /* overhead of one measurement, in ticks */

static int inited = 0;                  /* non-zero once calibration is done */
+
/* One-time calibration (idempotent).  First measure the smallest
 * observable difference between two back-to-back ticks — that is the
 * overhead of the measurement itself ("residual") — then time a 500 ms
 * usleep() with both GET_TICK and gettimeofday() to derive the
 * tick -> microsecond "scale" factor. */
void timing_init(void)
{
  static tick_t t1, t2;
  int i;

  /* already calibrated: nothing to do */
  if (inited) return;

  residual = (unsigned long long)1 << 63;
  
  /* keep the minimum over several samples */
  for(i = 0; i < 20; i++)
    {
      GET_TICK(t1);
      GET_TICK(t2);
      residual = STARPU_MIN(residual, TICK_RAW_DIFF(t1, t2));
    }
  
  {
    struct timeval tv1,tv2;
    
    /* time a known ~500 ms interval with both clocks
     * (NOTE(review): usleep may be cut short by a signal — confirm
     * that is acceptable for calibration) */
    GET_TICK(t1);
    gettimeofday(&tv1,0);
    usleep(500000);
    GET_TICK(t2);
    gettimeofday(&tv2,0);
    scale = ((tv2.tv_sec*1e6 + tv2.tv_usec) -
	     (tv1.tv_sec*1e6 + tv1.tv_usec)) / 
      (double)(TICK_DIFF(t1, t2));
  }

  inited = 1;
}
+
/* Convert a raw tick count to microseconds using the scale factor
 * computed by timing_init(). */
inline double tick2usec(long long t)
{
  return (double)(t)*scale;
}
+
/* Elapsed microseconds between two recorded ticks (the calibration
 * residual is subtracted by TIMING_DELAY). */
inline double timing_delay(tick_t *t1, tick_t *t2)
{
	return TIMING_DELAY(*t1, *t2);
}
+
+inline double timing_now(void)
+{
+	tick_t tick_now;
+	GET_TICK(tick_now);
+
+	return tick2usec(scale*((tick_now).tv.tv_sec*1e6) + (tick_now).tv.tv_usec);
+}
+
+
+
+#else // UNRELIABLETICKS
+
/* Raw difference between two cycle-counter values. */
#define TICK_RAW_DIFF(t1, t2) ((t2).tick - (t1).tick)
/* Same, minus the measurement overhead found at calibration. */
#define TICK_DIFF(t1, t2) (TICK_RAW_DIFF(t1, t2) - residual)
/* Elapsed microseconds between two ticks. */
#define TIMING_DELAY(t1, t2) tick2usec(TICK_DIFF(t1, t2))

static double scale = 0.0;              /* tick -> usec factor, set by timing_init() */
static unsigned long long residual = 0; /* overhead of one measurement, in ticks */

static int inited = 0;                  /* non-zero once calibration is done */
+
/* One-time calibration of the cycle-counter clock (idempotent):
 * measure the overhead of two consecutive GET_TICK calls ("residual"),
 * then derive the tick -> microsecond "scale" factor by timing a
 * 500 ms usleep() against gettimeofday(). */
void timing_init(void)
{
  static tick_t t1, t2;
  int i;

  /* calibration already performed */
  if (inited) return;

  residual = (unsigned long long)1 << 63;
  
  /* keep the smallest observed back-to-back tick difference */
  for(i = 0; i < 20; i++)
    {
      GET_TICK(t1);
      GET_TICK(t2);
      residual = STARPU_MIN(residual, TICK_RAW_DIFF(t1, t2));
    }
  
  {
    struct timeval tv1,tv2;
    
    GET_TICK(t1);
    gettimeofday(&tv1,0);
    usleep(500000);
    GET_TICK(t2);
    gettimeofday(&tv2,0);
    /* microseconds elapsed (gettimeofday) per elapsed tick */
    scale = ((tv2.tv_sec*1e6 + tv2.tv_usec) -
	     (tv1.tv_sec*1e6 + tv1.tv_usec)) / 
      (double)(TICK_DIFF(t1, t2));
  }

  inited = 1;
}
+
/* Convert a raw cycle count to microseconds using the scale factor
 * computed by timing_init(). */
inline double tick2usec(long long t)
{
  return (double)(t)*scale;
}
+
/* Elapsed microseconds between two recorded ticks (the calibration
 * residual is subtracted by TIMING_DELAY). */
inline double timing_delay(tick_t *t1, tick_t *t2)
{
	return TIMING_DELAY(*t1, *t2);
}
+
+inline double timing_now(void)
+{
+	tick_t tick_now;
+	GET_TICK(tick_now);
+
+	return tick2usec(tick_now.tick);
+}
+
+#endif // UNRELIABLETICKS

+ 77 - 0
src/common/timing.h

@@ -0,0 +1,77 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef TIMING_H
+#define TIMING_H
+
+/*
+ * -- Initialiser la bibliothèque avec timing_init();
+ * -- Mémoriser un timestamp :
+ *  tick_t t;
+ *  GET_TICK(t);
+ * -- Calculer un intervalle en microsecondes :
+ *  TIMING_DELAY(t1, t2);
+ */
+
+#include <sys/time.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <starpu.h>
+
+#ifdef UNRELIABLETICKS
+
+/* we use the usual gettimeofday method */
/* Fallback tick: a plain gettimeofday() timestamp. */
typedef struct tick_s
{
	struct timeval tv;
} tick_t;

/* Record the current time into tick t. */
#define GET_TICK(t) gettimeofday(&((t).tv), NULL)
+
+#else // !UNRELIABLETICKS
+
/* A 64-bit cycle-counter value, also addressable as the two 32-bit
 * halves in which rdtsc delivers it. */
typedef union u_tick
{
  uint64_t tick;

  struct
  {
    uint32_t low;
    uint32_t high;
  }
  sub;
} tick_t;

#if defined(__i386__) || defined(__pentium__) || defined(__pentiumpro__) || defined(__i586__) || defined(__i686__) || defined(__k6__) || defined(__k7__) || defined(__x86_64__)
/* Read the time-stamp counter (EDX:EAX) into t. */
#  define GET_TICK(t) __asm__ volatile("rdtsc" : "=a" ((t).sub.low), "=d" ((t).sub.high))
#else
//#  error "Processeur non-supporté par timing.h"
/* XXX unsupported processor: GET_TICK is a no-op, so every measured
 * delay will read as zero */
//#warning "unsupported processor GET_TICK returns 0"
#  define GET_TICK(t) do {} while(0);
#endif
+
+#endif // UNRELIABLETICKS
+
+void __attribute__ ((unused)) timing_init(void);
+inline double __attribute__ ((unused)) tick2usec(long long t);
+inline double __attribute__ ((unused)) timing_delay(tick_t *t1, tick_t *t2);
+
+inline double __attribute__ ((unused)) timing_now(void);
+
+#endif /* TIMING_H */
+
+

+ 227 - 0
src/core/dependencies/data-concurrency.c

@@ -0,0 +1,227 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/dependencies/data-concurrency.h>
+#include <datawizard/coherency.h>
+#include <core/policies/sched_policy.h>
+
+#ifdef NO_DATA_RW_LOCK
+
+static unsigned _submit_job_enforce_data_deps(job_t j, unsigned start_buffer_index);
+
+static unsigned unlock_one_requester(data_requester_t r)
+{
+	job_t j = r->j;
+	unsigned nbuffers = j->task->cl->nbuffers;
+	unsigned buffer_index = r->buffer_index;
+
+	if (buffer_index + 1 < nbuffers)
+	{
+		/* not all buffers are protected yet */
+		return _submit_job_enforce_data_deps(j, buffer_index + 1);
+	}
+	else
+		return 0;
+}
+
+/* the header lock must be taken by the caller */
+static unsigned may_unlock_data_req_list_head(data_state *data)
+{
+	/* if there is no one to unlock ... */
+	if (data_requester_list_empty(data->req_list))
+		return 0;
+
+	/* if there is no reference to the data anymore, we can use it */
+	if (data->refcnt == 0)
+		return 1;
+
+	if (data->current_mode == W)
+		return 0;
+
+	/* data->current_mode == R, so we can process more readers */
+	data_requester_t r = data_requester_list_front(data->req_list);
+	
+	return (r->mode == R);
+}
+
+
+unsigned attempt_to_submit_data_request_from_apps(data_state *data, starpu_access_mode mode,
+						void (*callback)(void *), void *argcb)
+{
+	unsigned ret;
+
+	take_mutex(&data->header_lock);
+
+	if (data->refcnt == 0)
+	{
+		/* there is nobody currently about to manipulate the data */
+		data->refcnt++;
+		data->current_mode = mode;
+
+		/* success */
+		ret = 0;
+	}
+	else
+	{
+		/* there is already someone that may access the data */
+		if ( (mode == R) && (data->current_mode == R))
+		{
+			data->refcnt++;
+
+			/* success : there is a new reader */
+			ret = 0;
+		}
+		else
+		{
+			/* there cannot be multiple writers or a new writer
+			 * while the data is in read mode */
+			
+			/* enqueue the request */
+			data_requester_t r = data_requester_new();
+				r->mode = mode;
+				r->is_requested_by_codelet = 0;
+				r->ready_data_callback = callback;
+				r->argcb = argcb;
+
+			data_requester_list_push_back(data->req_list, r);
+
+			/* failed */
+			ret = 1;
+		}
+	}
+
+	release_mutex(&data->header_lock);
+	return ret;
+}
+
/* Codelet-side attempt to take a reference on buffer "buffer_index" of
 * job j.  Returns 0 when the access is granted immediately; returns 1
 * when the request was queued on the data instead (the submission will
 * resume from this buffer when the data is handed over — see
 * notify_data_dependencies / unlock_one_requester). */
static unsigned attempt_to_submit_data_request_from_job(job_t j, unsigned buffer_index)
{
	unsigned ret;

	data_state *data = j->task->buffers[buffer_index].state;
	starpu_access_mode mode = j->task->buffers[buffer_index].mode;

	take_mutex(&data->header_lock);

	if (data->refcnt == 0)
	{
		/* there is nobody currently about to manipulate the data */
		data->refcnt++;
		data->current_mode = mode;

		/* success */
		ret = 0;
	}
	else
	{
		/* there is already someone that may access the data */
		if ( (mode == R) && (data->current_mode == R))
		{
			data->refcnt++;

			/* success : there is a new reader */
			ret = 0;
		}
		else
		{
			/* there cannot be multiple writers or a new writer
			 * while the data is in read mode */
			
			/* enqueue the request */
			data_requester_t r = data_requester_new();
				r->mode = mode;
				r->is_requested_by_codelet = 1;
				r->j = j;
				r->buffer_index = buffer_index;

			data_requester_list_push_back(data->req_list, r);

			/* failed */
			ret = 1;
		}
	}

	release_mutex(&data->header_lock);
	return ret;
}
+
+static unsigned _submit_job_enforce_data_deps(job_t j, unsigned start_buffer_index)
+{
+	unsigned buf;
+
+	/* TODO compute an ordered list of the data */
+
+	unsigned nbuffers = j->task->cl->nbuffers;
+	for (buf = start_buffer_index; buf < nbuffers; buf++)
+	{
+		if (attempt_to_submit_data_request_from_job(j, buf))
+			return 1;
+	}
+
+	return 0;
+}
+
+/* When a new task is submitted, we make sure that there cannot be codelets
+   with concurrent data-access at the same time in the scheduling engine (eg.
+   there can be 2 tasks reading a piece of data, but there cannot be one
+   reading and another writing) */
+unsigned submit_job_enforce_data_deps(job_t j)
+{
+	if (j->task->cl->nbuffers == 0)
+		return 0;
+
+	return _submit_job_enforce_data_deps(j, 0);
+}
+
+
/* A reference on "data" is released.  While the head of the pending
 * request list may legally run (see may_unlock_data_req_list_head),
 * pop it and hand the data over: either resume the submission of the
 * requesting job (pushing it to the scheduler once it holds all its
 * buffers), or fire the application-supplied callback.  The header
 * lock is dropped around each notification since it may push tasks or
 * run user code, and re-taken before examining the list again. */
void notify_data_dependencies(data_state *data)
{
	take_mutex(&data->header_lock);

	data->refcnt--;

	while (may_unlock_data_req_list_head(data))
	{
		/* unlock the head of the requester list */
		data_requester_t r = data_requester_list_pop_front(data->req_list);

		/* the reference we just dropped is transferred to the
		 * unblocked requester */
		data->refcnt++;
	
		release_mutex(&data->header_lock);

		if (r->is_requested_by_codelet)
		{
			if (!unlock_one_requester(r))
				push_task(r->j);
		}
		else
		{
			STARPU_ASSERT(r->ready_data_callback);

			/* execute the callback associated with the data requester */
			r->ready_data_callback(r->argcb);
		}

		data_requester_delete(r);
		
		take_mutex(&data->header_lock);
	}
	
	release_mutex(&data->header_lock);

}
+
+#endif

+ 33 - 0
src/core/dependencies/data-concurrency.h

@@ -0,0 +1,33 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#ifndef __DATA_CONCURRENCY_H__
#define __DATA_CONCURRENCY_H__

#include <core/jobs.h>

#ifdef NO_DATA_RW_LOCK

/* Acquire all the buffers of job j; returns 1 when the job had to be
 * queued on one of them (it will be pushed later). */
unsigned submit_job_enforce_data_deps(job_t j);

/* Release one reference on the data and serve pending requests. */
void notify_data_dependencies(data_state *data);

/* Application-side request: returns 0 when granted immediately, 1 when
 * queued (callback(argcb) fires once the data is available). */
unsigned attempt_to_submit_data_request_from_apps(data_state *state, starpu_access_mode mode,
						void (*callback)(void *), void *argcb);
#endif

#endif // __DATA_CONCURRENCY_H__
+

+ 174 - 0
src/core/dependencies/htable.c

@@ -0,0 +1,174 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/dependencies/htable.h>
+#include <string.h>
+
+void *htbl_search_tag(htbl_node_t *htbl, starpu_tag_t tag)
+{
+	unsigned currentbit;
+	htbl_node_t *current_htbl = htbl;
+
+	/* 000000000001111 with HTBL_NODE_SIZE 1's */
+	starpu_tag_t mask = (1<<HTBL_NODE_SIZE)-1;
+
+	for(currentbit = 0; currentbit < TAG_SIZE; currentbit+=HTBL_NODE_SIZE)
+	{
+	
+	//	printf("search : current bit = %d \n", currentbit);
+		if (STARPU_UNLIKELY(current_htbl == NULL))
+			return NULL;
+
+		/* 0000000000001111 
+		 *     | currentbit
+		 * 0000111100000000 = offloaded_mask
+		 *         |last_currentbit
+		 * */
+
+		unsigned last_currentbit = 
+			TAG_SIZE - (currentbit + HTBL_NODE_SIZE);
+		starpu_tag_t offloaded_mask = mask << last_currentbit;
+		unsigned current_index = 
+			(tag & (offloaded_mask)) >> (last_currentbit);
+
+		current_htbl = current_htbl->children[current_index];
+	}
+
+	return current_htbl;
+}
+
/*
 * Insert "entry" for "tag"; returns the previous value stored for the
 * tag, or NULL.  Internal nodes along the path are allocated on demand
 * (the root included, hence the htbl_node_t **), and each parent's
 * nentries count is bumped whenever a new child appears under it.
 */

void *htbl_insert_tag(htbl_node_t **htbl, starpu_tag_t tag, void *entry)
{

	unsigned currentbit;
	htbl_node_t **current_htbl_ptr = htbl;
	htbl_node_t *previous_htbl_ptr = NULL;

	/* 000000000001111 with HTBL_NODE_SIZE 1's */
	starpu_tag_t mask = (1<<HTBL_NODE_SIZE)-1;

	for(currentbit = 0; currentbit < TAG_SIZE; currentbit+=HTBL_NODE_SIZE)
	{
		if (*current_htbl_ptr == NULL) {
			/* TODO pad to change that 1 into 16 ? */
			/* materialize the missing node (zero-filled) */
			*current_htbl_ptr = calloc(1, sizeof(htbl_node_t));
			assert(*current_htbl_ptr);

			/* a new child was hooked under the parent */
			if (previous_htbl_ptr)
				previous_htbl_ptr->nentries++;
		}

		/* 0000000000001111 
		 *     | currentbit
		 * 0000111100000000 = offloaded_mask
		 *         |last_currentbit
		 * */

		unsigned last_currentbit = 
			TAG_SIZE - (currentbit + HTBL_NODE_SIZE);
		starpu_tag_t offloaded_mask = mask << last_currentbit;
		unsigned current_index = 
			(tag & (offloaded_mask)) >> (last_currentbit);

		previous_htbl_ptr = *current_htbl_ptr;
		current_htbl_ptr = 
			&((*current_htbl_ptr)->children[current_index]);

	}

	/* current_htbl either contains NULL or a previous entry 
	 * we overwrite it anyway */
	void *old_entry = *current_htbl_ptr;
	*current_htbl_ptr = entry;

	/* a brand new leaf: account for it in the last internal node */
	if (!old_entry)
		previous_htbl_ptr->nentries++;

	return old_entry;
}
+
+/* returns the entry corresponding to the tag and remove it from the htbl */
+void *htbl_remove_tag(htbl_node_t *htbl, starpu_tag_t tag)
+{
+	/* NB : if the entry is "NULL", we assume this means it is not present XXX */
+	unsigned currentbit;
+	htbl_node_t *current_htbl_ptr = htbl;
+
+	/* remember the path to the tag */
+	htbl_node_t *path[(TAG_SIZE + HTBL_NODE_SIZE - 1)/(HTBL_NODE_SIZE)];
+
+	/* 000000000001111 with HTBL_NODE_SIZE 1's */
+	starpu_tag_t mask = (1<<HTBL_NODE_SIZE)-1;
+	int level, maxlevel;
+	unsigned tag_is_present = 1;
+
+	for(currentbit = 0, level = 0; currentbit < TAG_SIZE; currentbit+=HTBL_NODE_SIZE, level++)
+	{
+		path[level] = current_htbl_ptr;
+
+		if (STARPU_UNLIKELY(!current_htbl_ptr)) {
+			tag_is_present = 0;
+			break;
+		}
+
+		/* 0000000000001111 
+		 *     | currentbit
+		 * 0000111100000000 = offloaded_mask
+		 *         |last_currentbit
+		 * */
+
+		unsigned last_currentbit = 
+			TAG_SIZE - (currentbit + HTBL_NODE_SIZE);
+		starpu_tag_t offloaded_mask = mask << last_currentbit;
+		unsigned current_index = 
+			(tag & (offloaded_mask)) >> (last_currentbit);
+		
+		current_htbl_ptr = 
+			current_htbl_ptr->children[current_index];
+	}
+
+	maxlevel = level;
+	if (STARPU_UNLIKELY(!current_htbl_ptr))
+		tag_is_present = 0;
+
+	void *old_entry = current_htbl_ptr;
+
+	if (tag_is_present) {
+		/* the tag was in the htbl, so we have to unroll the search 
+ 		 * to remove possibly useless htbl (internal) nodes */
+		for (level = maxlevel - 1; level >= 0; level--)
+		{
+			path[level]->nentries--;
+
+			/* TODO use likely statements ... */
+
+			/* in case we do not remove that node, we do decrease its parents
+ 			 * number of entries */
+			if (path[level]->nentries > 0)
+				break;
+
+			/* we remove this node */
+			free(path[level]);
+		}
+	}
+
+	/* we return the entry if there was one */
+	return old_entry;
+}

+ 42 - 0
src/core/dependencies/htable.h

@@ -0,0 +1,42 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __HTABLE_H__
+#define __HTABLE_H__
+
+/*
+ *	Define a hierarchical table to do the tag matching
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <core/dependencies/tags.h>
+
/* Number of tag bits consumed per level: each internal node has
 * 2^HTBL_NODE_SIZE children. */
#define HTBL_NODE_SIZE	16

/* An internal node of the hierarchical table. */
typedef struct _htbl_node_t {
	unsigned nentries; /* live children, used to prune empty nodes */
	struct _htbl_node_t *children[1<<HTBL_NODE_SIZE];
} htbl_node_t;

/* Return the entry stored for "tag", or NULL. */
void *htbl_search_tag(htbl_node_t *htbl, starpu_tag_t tag);
/* Store "entry" for "tag"; returns the previous entry, or NULL. */
void *htbl_insert_tag(htbl_node_t **htbl, starpu_tag_t tag, void *entry);
/* Remove "tag" and return the entry it held, or NULL. */
void *htbl_remove_tag(htbl_node_t *htbl, starpu_tag_t tag);
+
+
+#endif

+ 266 - 0
src/core/dependencies/tags.c

@@ -0,0 +1,266 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdarg.h>
+#include <stdlib.h>
+#include <core/dependencies/tags.h>
+#include <core/dependencies/htable.h>
+#include <core/jobs.h>
+#include <core/policies/sched_policy.h>
+#include <core/dependencies/data-concurrency.h>
+#include <starpu.h>
+
/* Root of the tag -> struct tag_s hierarchical table. */
static htbl_node_t *tag_htbl = NULL;
/* Serializes every lookup/insertion/removal on tag_htbl. */
static starpu_mutex tag_mutex = {
	.taken = 0
};
+
+cg_t *create_cg(unsigned ntags, struct tag_s *tag)
+{
+	cg_t *cg;
+
+	cg = malloc(sizeof(cg_t));
+	STARPU_ASSERT(cg);
+	if (cg) {
+		cg->ntags = ntags;
+		cg->tag = tag;
+	}
+
+	return cg;
+}
+
+static struct tag_s *tag_init(starpu_tag_t id)
+{
+	struct tag_s *tag;
+	tag = malloc(sizeof(struct tag_s));
+	STARPU_ASSERT(tag);
+
+	tag->id = id;
+	tag->state = UNASSIGNED;
+	tag->nsuccs = 0;
+
+#ifdef DYNAMIC_DEPS_SIZE
+	/* this is a small initial default value ... may be changed */
+	tag->succ_list_size = 4;
+	tag->succ = realloc(NULL, tag->succ_list_size*sizeof(struct _cg_t *));
+#endif
+
+	init_mutex(&tag->lock);
+
+	tag->job = NULL;
+
+	return tag;
+}
+
+void starpu_tag_remove(starpu_tag_t id)
+{
+	struct tag_s *tag;
+
+	take_mutex(&tag_mutex);
+	tag = htbl_remove_tag(tag_htbl, id);
+	
+#ifdef DYNAMIC_DEPS_SIZE
+	if (tag)
+		free(tag->succ);
+#endif
+
+	release_mutex(&tag_mutex);
+
+	free(tag);
+}
+
+struct tag_s *gettag_struct(starpu_tag_t id)
+{
+	take_mutex(&tag_mutex);
+
+	/* search if the tag is already declared or not */
+	struct tag_s *tag;
+	tag = htbl_search_tag(tag_htbl, id);
+
+	if (tag == NULL) {
+		/* the tag does not exist yet : create an entry */
+		tag = tag_init(id);
+
+		void *old;
+		old = htbl_insert_tag(&tag_htbl, id, tag);
+		/* there was no such tag before */
+		STARPU_ASSERT(old == NULL);
+	}
+
+	release_mutex(&tag_mutex);
+
+	return tag;
+}
+
+void notify_cg(cg_t *cg)
+{
+
+	STARPU_ASSERT(cg);
+	unsigned ntags = STARPU_ATOMIC_ADD(&cg->ntags, -1);
+	if (ntags == 0) {
+		/* the group is now completed */
+		tag_set_ready(cg->tag);
+		free(cg);
+	}
+}
+
/* Register completion group "cg" as a successor of tag "id": when that
 * tag completes, notify_cg(cg) is called.  If the tag is already DONE,
 * the group is notified immediately instead of being recorded. */
void tag_add_succ(starpu_tag_t id, cg_t *cg)
{
	/* find out the associated structure */
	struct tag_s *tag = gettag_struct(id);
	STARPU_ASSERT(tag);

	take_mutex(&tag->lock);

	if (tag->state == DONE) {
		/* the tag was already completed sooner */
		notify_cg(cg);
	}
	else {
		/* where should that cg should be put in the array ? */
		unsigned index = STARPU_ATOMIC_ADD(&tag->nsuccs, 1) - 1;

#ifdef DYNAMIC_DEPS_SIZE
		if (index >= tag->succ_list_size)
		{
			/* the successor list is too small */
			tag->succ_list_size *= 2;

			/* NB: this is thread safe as the tag->lock is taken */
			tag->succ = realloc(tag->succ, 
				tag->succ_list_size*sizeof(struct _cg_t *));
		}
#else
		STARPU_ASSERT(index < NMAXDEPS);
#endif

		tag->succ[index] = cg;
	}

	release_mutex(&tag->lock);
}
+
+void notify_dependencies(struct job_s *j)
+{
+	struct tag_s *tag;
+	unsigned nsuccs;
+	unsigned succ;
+
+	STARPU_ASSERT(j);
+	
+	if (j->task->use_tag) {
+		/* in case there are dependencies, wake up the proper tasks */
+		tag = j->tag;
+
+		tag->state = DONE;
+		TRACE_TASK_DONE(tag->id);
+
+		nsuccs = tag->nsuccs;
+		for (succ = 0; succ < nsuccs; succ++)
+		{
+			notify_cg(tag->succ[succ]);
+		}
+	}
+}
+
+void tag_declare(starpu_tag_t id, struct job_s *job)
+{
+	TRACE_CODELET_TAG(id, job);
+	job->task->use_tag = 1;
+	
+	struct tag_s *tag= gettag_struct(id);
+	tag->job = job;
+	
+	job->tag = tag;
+}
+
+void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t *array)
+{
+	unsigned i;
+
+	/* create the associated completion group */
+	struct tag_s *tag_child = gettag_struct(id);
+	cg_t *cg = create_cg(ndeps, tag_child);
+	
+	tag_child->state = BLOCKED;
+	
+	STARPU_ASSERT(ndeps != 0);
+	
+	for (i = 0; i < ndeps; i++)
+	{
+		starpu_tag_t dep_id = array[i];
+		
+		/* id depends on dep_id
+		 * so cg should be among dep_id's successors*/
+		TRACE_CODELET_TAG_DEPS(id, dep_id);
+		tag_add_succ(dep_id, cg);
+	}
+}
+
+void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...)
+{
+	unsigned i;
+	
+	/* create the associated completion group */
+	struct tag_s *tag_child = gettag_struct(id);
+	cg_t *cg = create_cg(ndeps, tag_child);
+	
+	tag_child->state = BLOCKED;
+	
+	STARPU_ASSERT(ndeps != 0);
+	
+	va_list pa;
+	va_start(pa, ndeps);
+	for (i = 0; i < ndeps; i++)
+	{
+		starpu_tag_t dep_id;
+		dep_id = va_arg(pa, starpu_tag_t);
+		
+		/* id depends on dep_id
+		 * so cg should be among dep_id's successors*/
+		TRACE_CODELET_TAG_DEPS(id, dep_id);
+		tag_add_succ(dep_id, cg);
+	}
+	va_end(pa);
+}
+
+void tag_set_ready(struct tag_s *tag)
+{
+	/* mark this tag as ready to run */
+	tag->state = READY;
+	/* declare it to the scheduler ! */
+	struct job_s *j = tag->job;
+
+	/* perhaps the corresponding task was not declared yet */
+	if (!j)
+		return;
+
+#ifdef NO_DATA_RW_LOCK
+	/* enforce data dependencies */
+	if (submit_job_enforce_data_deps(j))
+		return;
+#endif
+
+	push_task(j);
+}
+
+/* This function is called when a new task is submitted to StarPU 
+ * it returns 1 if the task deps are not fulfilled, 0 otherwise */
+unsigned submit_job_enforce_task_deps(job_t j)
+{
+	struct tag_s *tag = j->tag;
+	return (tag->state == BLOCKED);
+}

+ 78 - 0
src/core/dependencies/tags.h

@@ -0,0 +1,78 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __TAGS_H__
+#define __TAGS_H__
+
+#include <stdint.h>
+#include <starpu-mutex.h>
+#include <core/jobs.h>
+
+/* we do not necessarily want to allocate room for 256 dependencies, but we
+   want to handle the few situation where there are a lot of dependencies as
+   well */
+#define DYNAMIC_DEPS_SIZE	1
+
+/* randomly choosen ! */
+#ifndef DYNAMIC_DEPS_SIZE
+#define NMAXDEPS	256
+#endif
+
+/* number of bits in a tag identifier */
+#define TAG_SIZE        (sizeof(starpu_tag_t)*8)
+
+/* life cycle of a tag (see tags.c):
+ *   UNASSIGNED -> BLOCKED (deps declared) -> READY -> SCHEDULED -> DONE */
+typedef enum {
+	UNASSIGNED,
+	DONE,
+	READY,
+	SCHEDULED,
+	BLOCKED
+} tag_state;
+
+struct job_s;
+
+struct tag_s {
+	starpu_mutex lock; /* do we really need that ? */
+	starpu_tag_t id; /* an identifier for the task */
+	tag_state state;
+	unsigned nsuccs; /* how many successors ? */
+#ifdef DYNAMIC_DEPS_SIZE
+	unsigned succ_list_size; /* allocated capacity of the succ array */
+	struct _cg_t **succ;
+#else
+	struct _cg_t *succ[NMAXDEPS];
+#endif
+	struct job_s *job; /* which job is associated to the tag if any ? */
+};
+
+/* completion group: the dependent tag is released once ntags drops to 0 */
+typedef struct _cg_t {
+	unsigned ntags; /* number of remaining tags */
+	struct tag_s *tag; /* which tags depends on that cg ?  */
+} cg_t;
+
+void notify_cg(cg_t *cg);
+void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...);
+
+cg_t *create_cg(unsigned ntags, struct tag_s *tag);
+/* NOTE(review): tags.c calls gettag_struct() (no underscore) while this
+ * header declares get_tag_struct() -- confirm which spelling is actually
+ * defined and make the two agree */
+struct tag_s *get_tag_struct(starpu_tag_t id);
+void tag_add_succ(starpu_tag_t id, cg_t *cg);
+
+void notify_dependencies(struct job_s *j);
+void tag_declare(starpu_tag_t id, struct job_s *job);
+void tag_set_ready(struct tag_s *tag);
+
+unsigned submit_job_enforce_task_deps(struct job_s *j);
+
+#endif // __TAGS_H__

+ 184 - 0
src/core/jobs.c

@@ -0,0 +1,184 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/jobs.h>
+#include <core/workers.h>
+#include <core/dependencies/data-concurrency.h>
+#include <common/config.h>
+
+/* Return the sum of the sizes of every data buffer accessed by the task
+ * wrapped in job `j', as reported by each data interface's get_size
+ * method. */
+size_t job_get_data_size(job_t j)
+{
+	size_t size = 0;
+
+	struct starpu_task *task = j->task;
+
+	unsigned nbuffers = task->cl->nbuffers;
+
+	unsigned buffer;
+	for (buffer = 0; buffer < nbuffers; buffer++)
+	{
+		data_state *state = task->buffers[buffer].state;
+		size += state->ops->get_size(state);
+	}
+
+	return size;
+}
+
+/* create an internal job_t structure to encapsulate the task */
+/* The job starts with no prediction, no computed footprint, and is not
+ * terminated.  Synchronous tasks get a semaphore the submitter will
+ * block on; tagged tasks are registered in the tag table. */
+job_t __attribute__((malloc)) job_create(struct starpu_task *task)
+{
+	job_t job;
+
+	job = job_new();
+
+	job->task = task;
+
+	job->predicted = 0.0;
+	job->footprint_is_computed = 0;
+	job->terminated = 0;
+
+	/* NOTE(review): sem_init's return value is ignored here -- it can
+	 * fail (e.g. EINVAL); confirm whether that is acceptable */
+	if (task->synchronous)
+		sem_init(&job->sync_sem, 0, 0);
+
+	if (task->use_tag)
+		tag_declare(task->tag_id, job);
+
+	return job;
+}
+
+/* Allocate a zero-initialised starpu_task and set the default fields
+ * (only the priority needs a non-zero default).  Aborts on allocation
+ * failure. */
+struct starpu_task * __attribute__((malloc)) starpu_task_create(void)
+{
+	struct starpu_task *task;
+
+	task = calloc(1, sizeof(struct starpu_task));
+	STARPU_ASSERT(task);
+
+	task->priority = DEFAULT_PRIO;
+
+	return task;
+}
+
+/* Called by a driver once a job has been executed: wake up the tasks
+ * that depended on it, run the user callback, then release the job
+ * structure (or defer its release to the synchronous waiter). */
+void handle_job_termination(job_t j)
+{
+	struct starpu_task *task = j->task;
+
+	/* double termination indicates a logic error elsewhere; warn but
+	 * keep going */
+	if (STARPU_UNLIKELY(j->terminated))
+		fprintf(stderr, "OOPS ... job %p was already terminated !!\n", j);
+
+	j->terminated = 1;
+
+	/* in case there are dependencies, wake up the proper tasks */
+	notify_dependencies(j);
+
+	/* the callback is executed after the dependencies so that we may remove the tag 
+ 	 * of the task itself */
+	if (task->callback_func)
+	{
+		TRACE_START_CALLBACK(j);
+		task->callback_func(task->callback_arg);
+		TRACE_END_CALLBACK(j);
+	}
+
+	if (task->synchronous)
+	{
+		if (sem_post(&j->sync_sem))
+			perror("sem_post");
+
+		/* as this is a synchronous task, we do not delete the job 
+		   structure which contains the j->sync_sem: we only liberate
+		   it once the semaphore is destroyed */
+	}
+	else
+	{
+		job_delete(j);
+	}
+
+}
+
+/* Wait until a synchronous task terminates, then release the job
+ * structure whose destruction was deferred so that sync_sem remained
+ * valid for this waiter. */
+static void block_sync_task(job_t j)
+{
+	/* mirror the sem_post error reporting done at termination time */
+	if (sem_wait(&j->sync_sem))
+		perror("sem_wait");
+
+	sem_destroy(&j->sync_sem);
+
+	/* as this is a synchronous task, the liberation of the job
+	   structure was deferred */
+	job_delete(j);
+}
+
+/* application should submit new tasks to StarPU through this function */
+/* Returns -ENODEV when no worker can execute the codelet, otherwise the
+ * push_task() status (0 when the task is merely deferred on a task or
+ * data dependency).  Synchronous tasks block until completion. */
+int starpu_submit_task(struct starpu_task *task)
+{
+	int ret;
+
+	/* validate the pointer before dereferencing it */
+	STARPU_ASSERT(task);
+	STARPU_ASSERT(task->cl);
+
+	unsigned is_sync = task->synchronous;
+
+	if (!worker_exists(task->cl->where))
+		return -ENODEV;
+
+	/* internally, StarPU manipulates a job_t which is a wrapper around a
+ 	* task structure */
+	job_t j = job_create(task);
+
+	/* enforce task dependencies */
+	if (task->use_tag)
+	{
+		if (submit_job_enforce_task_deps(j))
+		{
+			if (is_sync)
+				block_sync_task(j);
+			return 0;
+		}
+	}
+
+#ifdef NO_DATA_RW_LOCK
+	/* enforce data dependencies */
+	if (submit_job_enforce_data_deps(j))
+	{
+		if (is_sync)
+			block_sync_task(j);
+		return 0;
+	}
+#endif
+
+	ret = push_task(j);
+
+	if (is_sync)
+		block_sync_task(j);
+
+	return ret;
+}
+
+//int submit_prio_job(job_t j)
+//{
+//	j->priority = MAX_PRIO;
+//	
+//	return submit_job(j);
+//}
+
+/* This function is supplied for convenience only, it is equivalent to setting
+ * the proper flag and submitting the task with submit_task.
+ * Note that this call is blocking, and will not make StarPU progress,
+ * so it must only be called from the programmer thread, not by StarPU.
+ * NB: This also means that it cannot be submitted within a callback ! */
+int submit_sync_task(struct starpu_task *task)
+{
+	/* force synchronous semantics, then go through the normal path */
+	task->synchronous = 1;
+
+	return starpu_submit_task(task);
+}

+ 80 - 0
src/core/jobs.h

@@ -0,0 +1,80 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __JOBS_H__
+#define __JOBS_H__
+
+#include <starpu.h>
+#include <semaphore.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdarg.h>
+#include <pthread.h>
+#include <common/config.h>
+#include <common/timing.h>
+#include <common/list.h>
+#include <common/fxt.h>
+
+#include <core/dependencies/tags.h>
+
+#include <datawizard/datawizard.h>
+
+#include <core/perfmodel/perfmodel.h>
+
+#ifdef USE_CUDA
+#include <cuda.h>
+#endif
+
+/* codelet function */
+typedef void (*cl_func)(starpu_data_interface_t *, void *);
+typedef void (*callback)(void *);
+
+/* which driver types may execute job j, according to its codelet mask */
+#define CORE_MAY_PERFORM(j)	((j)->task->cl->where & CORE)
+#define CUDA_MAY_PERFORM(j)     ((j)->task->cl->where & CUDA)
+#define CUBLAS_MAY_PERFORM(j)   ((j)->task->cl->where & CUBLAS)
+#define SPU_MAY_PERFORM(j)	((j)->task->cl->where & SPU)
+#define GORDON_MAY_PERFORM(j)	((j)->task->cl->where & GORDON)
+
+/* a job is the internal representation of a task */
+/* LIST_TYPE generates job_t (pointer typedef), job_list_t and the
+ * job_list_* / job_new / job_delete helpers (see common/list.h) */
+LIST_TYPE(job,
+	struct starpu_task *task;
+
+	/* posted at termination; only used for synchronous tasks */
+	sem_t sync_sem;
+
+	struct tag_s *tag;
+
+	/* predicted execution time, filled in by the scheduling policy */
+	double predicted;
+	/* note: "penality" spelling is part of the existing field name */
+	double penality;
+
+	/* memoization of the data footprint hash */
+	unsigned footprint_is_computed;
+	uint32_t footprint;
+
+	unsigned terminated;
+);
+
+//#warning this must not be exported anymore ... 
+//job_t job_create(struct starpu_task *task);
+void handle_job_termination(job_t j);
+size_t job_get_data_size(job_t j);
+
+//int submit_job(job_t j);
+//int submit_prio_job(job_t j);
+//int submit_job_sync(job_t j);
+
+#endif // __JOBS_H__

+ 21 - 0
src/core/mechanisms/TODO

@@ -0,0 +1,21 @@
+queue design :
+	- create_central_jobq		-> trivial single list
+	- create_per_accelerator_jobq	-> one core = one queue (cilk-like)
+	- create_hierarchical_jobq	-> marcel-like
+
+remarks:
+	- a queue may be a set of queues (e.g. for priority queues)
+	- queues may have a limited size so that an extra central queue 
+	  could be needed in that case ...
+
+mechanisms :
+	- steal_job_from_queue
+	- push_job_onto_queue
+	- equilibrate_queues	-> balances 2 queues 
+	- reorder a queue
+
+policy role :
+	- implementing the push_job/fetch_job functions
+	- creating the actual queues 
+	- progression thread that regularly recompute a better schedule ?
+		- rather do that in the context of a task submission

+ 209 - 0
src/core/mechanisms/deque_queues.c

@@ -0,0 +1,209 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <core/mechanisms/deque_queues.h>
+#include <errno.h>
+
+/* keep track of the total number of jobs to be scheduled to avoid infinite 
+ * polling when there are really few jobs in the overall queue */
+static unsigned total_number_of_jobs;
+
+static pthread_cond_t *sched_cond;
+static pthread_mutex_t *sched_mutex;
+
+/* One-time setup for the deque mechanism: reset the global job counter
+ * and cache the scheduler-wide activity condition/mutex used to wake
+ * globally blocked workers. */
+void init_deque_queues_mechanisms(void)
+{
+	total_number_of_jobs = 0;
+
+	struct sched_policy_s *sched = get_sched_policy();
+
+	/* to access them more easily, we keep their address in local variables */
+	sched_cond = &sched->sched_activity_cond;
+	sched_mutex = &sched->sched_activity_mutex;
+}
+
+/* Allocate and initialise a deque-based job queue: the generic jobq_s
+ * wrapper plus its deque_jobq_s implementation.  Aborts on allocation
+ * failure (consistent with starpu_task_create). */
+struct jobq_s *create_deque(void)
+{
+	struct jobq_s *jobq;
+	jobq = malloc(sizeof(struct jobq_s));
+	STARPU_ASSERT(jobq);
+
+	pthread_mutex_init(&jobq->activity_mutex, NULL);
+	pthread_cond_init(&jobq->activity_cond, NULL);
+
+	struct deque_jobq_s *deque;
+	deque = malloc(sizeof(struct deque_jobq_s));
+	STARPU_ASSERT(deque);
+
+	/* note that not all mechanisms (eg. the semaphore) have to be used */
+	deque->jobq = job_list_new();
+	deque->njobs = 0;
+	deque->nprocessed = 0;
+
+	/* expected dates are expressed in seconds (timing_now is in us) */
+	deque->exp_start = timing_now()/1000000;
+	deque->exp_len = 0.0;
+	deque->exp_end = deque->exp_start;
+
+	jobq->queue = deque;
+
+	return jobq;
+}
+
+/* Number of jobs currently queued across all deques (read without the
+ * scheduler mutex: an approximation used to limit polling). */
+unsigned get_total_njobs_deques(void)
+{
+	return total_number_of_jobs;
+}
+
+/* Number of jobs currently sitting in this deque (unsynchronized read). */
+unsigned get_deque_njobs(struct jobq_s *q)
+{
+	STARPU_ASSERT(q);
+
+	struct deque_jobq_s *deque_queue = q->queue;
+
+	return deque_queue->njobs;
+}
+
+/* Total number of jobs ever pushed onto this deque (unsynchronized read). */
+unsigned get_deque_nprocessed(struct jobq_s *q)
+{
+	STARPU_ASSERT(q);
+
+	struct deque_jobq_s *deque_queue = q->queue;
+
+	return deque_queue->nprocessed;
+}
+
+/* Deques do not implement priorities: priority pushes degrade to plain
+ * pushes. */
+int deque_push_prio_task(struct jobq_s *q, job_t task)
+{
+	return deque_push_task(q, task);
+}
+
+/* Push a job at the front of the deque, then wake any worker blocked on
+ * the global scheduler condition and any worker blocked on this queue.
+ * Always returns 0. */
+int deque_push_task(struct jobq_s *q, job_t task)
+{
+	STARPU_ASSERT(q);
+	struct deque_jobq_s *deque_queue = q->queue;
+
+	/* if anyone is blocked on the entire machine, wake it up */
+	pthread_mutex_lock(sched_mutex);
+	total_number_of_jobs++;
+	pthread_cond_signal(sched_cond);
+	pthread_mutex_unlock(sched_mutex);
+
+	/* wake people waiting locally */
+	pthread_mutex_lock(&q->activity_mutex);
+
+	TRACE_JOB_PUSH(task, 0);
+	job_list_push_front(deque_queue->jobq, task);
+	deque_queue->njobs++;
+	deque_queue->nprocessed++;
+
+	pthread_cond_signal(&q->activity_cond);
+	pthread_mutex_unlock(&q->activity_mutex);
+
+	return 0;
+}
+
+/* Pop a job from the front of the deque, waiting (at most once) on the
+ * activity condition if it is empty.  The wait is deliberately an `if',
+ * not a `while': a signal on activity_cond with no work available makes
+ * the function return NULL, letting the caller react to other events. */
+job_t deque_pop_task(struct jobq_s *q)
+{
+	job_t j = NULL;
+
+	STARPU_ASSERT(q);
+	struct deque_jobq_s *deque_queue = q->queue;
+
+	/* block until some task is available in that queue */
+	pthread_mutex_lock(&q->activity_mutex);
+
+	if (deque_queue->njobs == 0)
+		pthread_cond_wait(&q->activity_cond, &q->activity_mutex);
+
+	if (deque_queue->njobs > 0) 
+	{
+		/* there is a task */
+		j = job_list_pop_front(deque_queue->jobq);
+	
+		STARPU_ASSERT(j);
+		deque_queue->njobs--;
+		
+		TRACE_JOB_POP(j, 0);
+
+		/* we are sure that we got it now, so at worst, some people thought 
+		 * there remained some work and will soon discover it is not true */
+		pthread_mutex_lock(sched_mutex);
+		total_number_of_jobs--;
+		pthread_mutex_unlock(sched_mutex);
+	}
+	
+	pthread_mutex_unlock(&q->activity_mutex);
+
+	return j;
+}
+
+/* Non-blocking variant of deque_pop_task: take the queue lock, grab a
+ * job from the front if one is present, and return NULL otherwise
+ * without waiting. */
+job_t deque_non_blocking_pop_task(struct jobq_s *q)
+{
+	job_t j = NULL;
+
+	STARPU_ASSERT(q);
+	struct deque_jobq_s *deque_queue = q->queue;
+
+	/* only hold the lock for the duration of the check: never wait */
+	pthread_mutex_lock(&q->activity_mutex);
+
+	if (deque_queue->njobs > 0) 
+	{
+		/* there is a task */
+		j = job_list_pop_front(deque_queue->jobq);
+	
+		STARPU_ASSERT(j);
+		deque_queue->njobs--;
+		
+		TRACE_JOB_POP(j, 0);
+
+		/* we are sure that we got it now, so at worst, some people thought 
+		 * there remained some work and will soon discover it is not true */
+		pthread_mutex_lock(sched_mutex);
+		total_number_of_jobs--;
+		pthread_mutex_unlock(sched_mutex);
+	}
+	
+	pthread_mutex_unlock(&q->activity_mutex);
+
+	return j;
+}
+
+/* Non-blocking pop; the global-sleep fallback used by the fifo variant
+ * is compiled out here (see the #if 0 block below). */
+job_t deque_non_blocking_pop_task_if_job_exists(struct jobq_s *q)
+{
+	job_t j;
+
+	j = deque_non_blocking_pop_task(q);
+
+/* XXX */
+#if 0
+	if (!j) {
+		/* there is no job at all in the entire system : go to sleep ! */
+
+		/* that wait is not an absolute sign that there is some work 
+		 * if there is some, the thread should be awoken, but if there is none 
+		 * at the moment it is awoken, it may simply poll a limited number of 
+		 * times and just get back to sleep */
+		pthread_mutex_lock(sched_mutex);
+
+		if (total_number_of_jobs == 0)
+			pthread_cond_wait(sched_cond, sched_mutex);
+
+		pthread_mutex_unlock(sched_mutex);
+	}
+#endif
+
+	return j;
+}

+ 54 - 0
src/core/mechanisms/deque_queues.h

@@ -0,0 +1,54 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DEQUE_QUEUES_H__
+#define __DEQUE_QUEUES_H__
+
+#include <core/mechanisms/queues.h>
+
+/* per-queue state of the deque-based scheduling mechanism */
+struct deque_jobq_s {
+	/* the actual list */
+	job_list_t jobq;
+
+	/* the number of tasks currently in the queue */
+	unsigned njobs;
+
+	/* the number of tasks that were processed */
+	unsigned nprocessed;
+
+	/* only meaningful if the queue is only used by a single worker */
+	double exp_start; /* expected start date of the next task */
+	double exp_end;   /* expected completion date of the queued work */
+	double exp_len;   /* expected total duration of the queued work */
+};
+
+struct jobq_s *create_deque(void);
+
+int deque_push_task(struct jobq_s *q, job_t task);
+/* priorities are not supported: falls back to deque_push_task */
+int deque_push_prio_task(struct jobq_s *q, job_t task);
+
+job_t deque_pop_task(struct jobq_s *q);
+job_t deque_non_blocking_pop_task(struct jobq_s *q);
+job_t deque_non_blocking_pop_task_if_job_exists(struct jobq_s *q);
+
+void init_deque_queues_mechanisms(void);
+
+
+unsigned get_deque_njobs(struct jobq_s *q);
+unsigned get_deque_nprocessed(struct jobq_s *q);
+
+
+#endif // __DEQUE_QUEUES_H__

+ 245 - 0
src/core/mechanisms/fifo_queues.c

@@ -0,0 +1,245 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <core/mechanisms/fifo_queues.h>
+#include <errno.h>
+
+/* keep track of the total number of jobs to be scheduled to avoid infinite 
+ * polling when there are really few jobs in the overall queue */
+static unsigned total_number_of_jobs;
+
+static pthread_cond_t *sched_cond;
+static pthread_mutex_t *sched_mutex;
+
+/* One-time setup for the fifo mechanism: reset the global job counter
+ * and cache the scheduler-wide activity condition/mutex. */
+void init_fifo_queues_mechanisms(void)
+{
+	total_number_of_jobs = 0;
+
+	struct sched_policy_s *sched = get_sched_policy();
+
+	/* to access them more easily, we keep their address in local variables */
+	sched_cond = &sched->sched_activity_cond;
+	sched_mutex = &sched->sched_activity_mutex;
+}
+
+/* Allocate and initialise a fifo-based job queue: the generic jobq_s
+ * wrapper plus its fifo_jobq_s implementation.  Aborts on allocation
+ * failure (consistent with starpu_task_create). */
+struct jobq_s *create_fifo(void)
+{
+	struct jobq_s *jobq;
+	jobq = malloc(sizeof(struct jobq_s));
+	STARPU_ASSERT(jobq);
+
+	pthread_mutex_init(&jobq->activity_mutex, NULL);
+	pthread_cond_init(&jobq->activity_cond, NULL);
+
+	struct fifo_jobq_s *fifo;
+	fifo = malloc(sizeof(struct fifo_jobq_s));
+	STARPU_ASSERT(fifo);
+
+	/* note that not all mechanisms (eg. the semaphore) have to be used */
+	fifo->jobq = job_list_new();
+	fifo->njobs = 0;
+	fifo->nprocessed = 0;
+
+	/* expected dates are expressed in seconds (timing_now is in us) */
+	fifo->exp_start = timing_now()/1000000;
+	fifo->exp_len = 0.0;
+	fifo->exp_end = fifo->exp_start;
+
+	jobq->queue = fifo;
+
+	return jobq;
+}
+
+/* Priority push: tasks are pushed at the BACK of the list, which is the
+ * end fifo_pop_task pops from, so they are retrieved before the
+ * normally pushed (front) tasks.  With NO_PRIO it degrades to a plain
+ * push. */
+int fifo_push_prio_task(struct jobq_s *q, job_t task)
+{
+#ifndef NO_PRIO
+	STARPU_ASSERT(q);
+	struct fifo_jobq_s *fifo_queue = q->queue;
+
+	/* if anyone is blocked on the entire machine, wake it up */
+	pthread_mutex_lock(sched_mutex);
+	total_number_of_jobs++;
+	pthread_cond_signal(sched_cond);
+	pthread_mutex_unlock(sched_mutex);
+	
+	/* wake people waiting locally */
+	pthread_mutex_lock(&q->activity_mutex);
+
+	TRACE_JOB_PUSH(task, 0);
+	job_list_push_back(fifo_queue->jobq, task);
+	fifo_queue->njobs++;
+	fifo_queue->nprocessed++;
+
+	pthread_cond_signal(&q->activity_cond);
+	pthread_mutex_unlock(&q->activity_mutex);
+
+	return 0;
+#else
+	return fifo_push_task(q, task);
+#endif
+}
+
+/* Push a job at the front of the list (fifo_pop_task pops from the
+ * back, giving FIFO order), waking any globally or locally blocked
+ * worker.  Always returns 0. */
+int fifo_push_task(struct jobq_s *q, job_t task)
+{
+	STARPU_ASSERT(q);
+	struct fifo_jobq_s *fifo_queue = q->queue;
+
+	/* if anyone is blocked on the entire machine, wake it up */
+	pthread_mutex_lock(sched_mutex);
+	total_number_of_jobs++;
+	pthread_cond_signal(sched_cond);
+	pthread_mutex_unlock(sched_mutex);
+	
+	/* wake people waiting locally */
+	pthread_mutex_lock(&q->activity_mutex);
+
+	TRACE_JOB_PUSH(task, 0);
+	job_list_push_front(fifo_queue->jobq, task);
+	fifo_queue->njobs++;
+	fifo_queue->nprocessed++;
+
+	pthread_cond_signal(&q->activity_cond);
+	pthread_mutex_unlock(&q->activity_mutex);
+
+	return 0;
+}
+
+/* Pop the oldest job (FIFO order), waiting at most once on the activity
+ * condition if the queue is empty.  The wait is deliberately an `if',
+ * not a `while': a signal with no work available makes the function
+ * return NULL so the caller can react to other events. */
+job_t fifo_pop_task(struct jobq_s *q)
+{
+	job_t j = NULL;
+
+	STARPU_ASSERT(q);
+	struct fifo_jobq_s *fifo_queue = q->queue;
+
+	/* block until some event happens */
+	pthread_mutex_lock(&q->activity_mutex);
+
+	if (fifo_queue->njobs == 0)
+		pthread_cond_wait(&q->activity_cond, &q->activity_mutex);
+
+	if (fifo_queue->njobs > 0) 
+	{
+		/* there is a task */
+		j = job_list_pop_back(fifo_queue->jobq);
+	
+		STARPU_ASSERT(j);
+		fifo_queue->njobs--;
+		
+		TRACE_JOB_POP(j, 0);
+
+		/* we are sure that we got it now, so at worst, some people thought 
+		 * there remained some work and will soon discover it is not true */
+		pthread_mutex_lock(sched_mutex);
+		total_number_of_jobs--;
+		pthread_mutex_unlock(sched_mutex);
+	}
+	
+	pthread_mutex_unlock(&q->activity_mutex);
+
+	return j;
+}
+
+/* Atomically steal the entire content of the fifo: the current list is
+ * handed to the caller (who becomes responsible for freeing it) and
+ * replaced by a fresh empty one.  Returns NULL when the queue is empty.
+ * Non-blocking. */
+struct job_list_s * fifo_pop_every_task(struct jobq_s *q)
+{
+	struct job_list_s *list;
+	unsigned size;
+	
+	STARPU_ASSERT(q);
+	struct fifo_jobq_s *fifo_queue = q->queue;
+
+	pthread_mutex_lock(&q->activity_mutex);
+
+	size = fifo_queue->njobs;
+
+	if (size == 0) {
+		list = NULL;
+	}
+	else {
+		/* directly use the existing list of jobs */
+		list = fifo_queue->jobq;
+
+	//	fprintf(stderr, "DEBUG, fifo_pop_every_task promised %d got %d\n",  size, job_list_size(list));
+		
+		/* the FIFO is now a new empty list */
+		fifo_queue->jobq = job_list_new();
+		fifo_queue->njobs = 0;
+
+		/* we are sure that we got it now, so at worst, some people thought
+		 * there remained some work and will soon discover it is not true */
+		pthread_mutex_lock(sched_mutex);
+		total_number_of_jobs -= size;
+		pthread_mutex_unlock(sched_mutex);
+	}
+
+	pthread_mutex_unlock(&q->activity_mutex);
+
+	return list;
+}
+
+/* for work stealing, typically */
+/* Non-blocking variant of fifo_pop_task: take the queue lock, grab the
+ * oldest job if one is present, and return NULL otherwise without
+ * waiting. */
+job_t fifo_non_blocking_pop_task(struct jobq_s *q)
+{
+	job_t j = NULL;
+
+	STARPU_ASSERT(q);
+	struct fifo_jobq_s *fifo_queue = q->queue;
+
+	/* only hold the lock for the duration of the check: never wait */
+	pthread_mutex_lock(&q->activity_mutex);
+
+	if (fifo_queue->njobs > 0) 
+	{
+		/* there is a task */
+		j = job_list_pop_back(fifo_queue->jobq);
+	
+		STARPU_ASSERT(j);
+		fifo_queue->njobs--;
+		
+		TRACE_JOB_POP(j, 0);
+
+		/* we are sure that we got it now, so at worst, some people thought 
+		 * there remained some work and will soon discover it is not true */
+		pthread_mutex_lock(sched_mutex);
+		total_number_of_jobs--;
+		pthread_mutex_unlock(sched_mutex);
+	}
+	
+	pthread_mutex_unlock(&q->activity_mutex);
+
+	return j;
+}
+
+/* Non-blocking pop with a global-sleep fallback: if the whole system is
+ * idle (total_number_of_jobs == 0), wait once on the scheduler-wide
+ * condition instead of busy-polling, then return NULL. */
+job_t fifo_non_blocking_pop_task_if_job_exists(struct jobq_s *q)
+{
+	job_t j;
+
+	j = fifo_non_blocking_pop_task(q);
+
+	if (!j) {
+		/* there is no job at all in the entire system : go to sleep ! */
+
+		/* that wait is not an absolute sign that there is some work 
+		 * if there is some, the thread should be awoken, but if there is none 
+		 * at the moment it is awoken, it may simply poll a limited number of 
+		 * times and just get back to sleep */
+		pthread_mutex_lock(sched_mutex);
+
+		if (total_number_of_jobs == 0)
+			pthread_cond_wait(sched_cond, sched_mutex);
+
+		pthread_mutex_unlock(sched_mutex);
+	}
+
+	return j;
+}

+ 50 - 0
src/core/mechanisms/fifo_queues.h

@@ -0,0 +1,50 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __FIFO_QUEUES_H__
+#define __FIFO_QUEUES_H__
+
+#include <core/mechanisms/queues.h>
+
+/* per-queue state of the fifo-based scheduling mechanism */
+struct fifo_jobq_s {
+	/* the actual list */
+	job_list_t jobq;
+
+	/* the number of tasks currently in the queue */
+	unsigned njobs;
+
+	/* the number of tasks that were processed */
+	unsigned nprocessed;
+
+	/* only meaningful if the queue is only used by a single worker */
+	double exp_start; /* expected start date of the next task */
+	double exp_end;   /* expected completion date of the queued work */
+	double exp_len;   /* expected total duration of the queued work */
+};
+
+struct jobq_s *create_fifo(void);
+
+int fifo_push_task(struct jobq_s *q, job_t task);
+/* pushes at the pop end so the task is retrieved first (unless NO_PRIO) */
+int fifo_push_prio_task(struct jobq_s *q, job_t task);
+
+job_t fifo_pop_task(struct jobq_s *q);
+/* hands over the whole list; the caller must free it */
+struct job_list_s * fifo_pop_every_task(struct jobq_s *q);
+job_t fifo_non_blocking_pop_task(struct jobq_s *q);
+job_t fifo_non_blocking_pop_task_if_job_exists(struct jobq_s *q);
+
+void init_fifo_queues_mechanisms(void);
+
+#endif // __FIFO_QUEUES_H__

+ 121 - 0
src/core/mechanisms/priority_queues.c

@@ -0,0 +1,121 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/mechanisms/priority_queues.h>
+
+/*
+ * Centralized queue with priorities 
+ */
+
+
+/* keep track of the total number of jobs to be scheduled to avoid infinite 
+ * polling when there are really few jobs in the overall queue */
+static pthread_cond_t *sched_cond;
+static pthread_mutex_t *sched_mutex;
+
+/* One-time setup for the priority mechanism: cache the scheduler-wide
+ * activity condition/mutex (no global job counter is kept here). */
+void init_priority_queues_mechanisms(void)
+{
+	struct sched_policy_s *sched = get_sched_policy();
+
+	/* to access them more easily, we keep their address in local variables */
+	sched_cond = &sched->sched_activity_cond;
+	sched_mutex = &sched->sched_activity_mutex;
+}
+
+/* Allocate and initialise the centralized priority queue: one job list
+ * per priority level.  Aborts on allocation failure (consistent with
+ * starpu_task_create). */
+struct jobq_s *create_priority_jobq(void)
+{
+	struct jobq_s *q;
+
+	q = malloc(sizeof(struct jobq_s));
+	STARPU_ASSERT(q);
+
+	struct priority_jobq_s *central_queue;
+	
+	central_queue = malloc(sizeof(struct priority_jobq_s));
+	STARPU_ASSERT(central_queue);
+	q->queue = central_queue;
+
+	pthread_mutex_init(&q->activity_mutex, NULL);
+	pthread_cond_init(&q->activity_cond, NULL);
+
+	central_queue->total_njobs = 0;
+
+	unsigned prio;
+	for (prio = 0; prio < NPRIO_LEVELS; prio++)
+	{
+		central_queue->jobq[prio] = job_list_new();
+		central_queue->njobs[prio] = 0;
+	}
+
+	return q;
+}
+
+/* Push a job into the per-priority list matching its task priority,
+ * waking both globally and locally blocked workers.  Always returns 0. */
+int priority_push_task(struct jobq_s *q, job_t j)
+{
+	STARPU_ASSERT(q);
+	struct priority_jobq_s *queue = q->queue;
+
+	/* if anyone is blocked on the entire machine, wake it up */
+	pthread_mutex_lock(sched_mutex);
+	pthread_cond_signal(sched_cond);
+	pthread_mutex_unlock(sched_mutex);
+
+	/* wake people waiting locally */
+	pthread_mutex_lock(&q->activity_mutex);
+
+	TRACE_JOB_PUSH(j, 1);
+	
+	/* a priority outside [MIN_PRIO, MAX_PRIO] would index outside the
+	 * jobq array (the unsigned subtraction wraps for lower values) */
+	STARPU_ASSERT(j->task->priority >= MIN_PRIO && j->task->priority <= MAX_PRIO);
+	unsigned priolevel = j->task->priority - MIN_PRIO;
+
+	job_list_push_front(queue->jobq[priolevel], j);
+	queue->njobs[priolevel]++;
+	queue->total_njobs++;
+
+	pthread_cond_signal(&q->activity_cond);
+	pthread_mutex_unlock(&q->activity_mutex);
+
+	return 0;
+}
+
+/* Pop the oldest job of the highest non-empty priority level, waiting
+ * at most once on the activity condition if the queue is empty.  As in
+ * the other mechanisms the wait is an `if', not a `while', so a signal
+ * with no work yields NULL. */
+job_t priority_pop_task(struct jobq_s *q)
+{
+	job_t j = NULL;
+
+	STARPU_ASSERT(q);
+	struct priority_jobq_s *queue = q->queue;
+
+	/* block until some event happens */
+	pthread_mutex_lock(&q->activity_mutex);
+
+	if (queue->total_njobs == 0)
+		 pthread_cond_wait(&q->activity_cond, &q->activity_mutex);
+
+	if (queue->total_njobs > 0)
+	{
+		/* scan from the highest priority level downwards */
+		unsigned priolevel = NPRIO_LEVELS - 1;
+		do {
+			if (queue->njobs[priolevel] > 0) {
+				/* there is some task that we can grab */
+				j = job_list_pop_back(queue->jobq[priolevel]);
+				queue->njobs[priolevel]--;
+				queue->total_njobs--;
+				TRACE_JOB_POP(j, 0);
+			}
+		} while (!j && priolevel-- > 0);
+	}
+
+	pthread_mutex_unlock(&q->activity_mutex);
+
+	return j;
+}

+ 43 - 0
src/core/mechanisms/priority_queues.h

@@ -0,0 +1,43 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __PRIORITY_QUEUES_H__
+#define __PRIORITY_QUEUES_H__
+
+/* valid task priorities span [MIN_PRIO, MAX_PRIO] inclusive */
+#define MIN_PRIO	(-4)
+#define MAX_PRIO	5
+
+#define NPRIO_LEVELS	((MAX_PRIO) - (MIN_PRIO) + 1)
+
+#include <core/mechanisms/queues.h>
+
+/* centralized queue with one job list per priority level */
+struct priority_jobq_s {
+	/* the actual lists 
+	 *	jobq[p] is for priority [p - MIN_PRIO] */
+	job_list_t jobq[NPRIO_LEVELS];
+	unsigned njobs[NPRIO_LEVELS];
+
+	unsigned total_njobs;
+};
+
+struct jobq_s *create_priority_jobq(void);
+void init_priority_queues_mechanisms(void);
+
+int priority_push_task(struct jobq_s *q, job_t task);
+
+job_t priority_pop_task(struct jobq_s *q);
+
+#endif // __PRIORITY_QUEUES_H__

+ 80 - 0
src/core/mechanisms/queues.c

@@ -0,0 +1,80 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "queues.h"
+
+/*
+ * There can be various queue designs
+ * 	- trivial single list
+ * 	- cilk-like 
+ * 	- hierarchical (marcel-like)
+ */
+
+/* Initialise the queueing mechanism (init_queue_design) then create one
+ * queue per worker with func_init_queue, filling in the per-queue
+ * capability mask, performance alpha and memory node attachment. */
+void setup_queues(void (*init_queue_design)(void),
+		  struct jobq_s *(*func_init_queue)(void), 
+		  struct machine_config_s *config) 
+{
+	unsigned worker;
+
+	init_queue_design();
+
+	for (worker = 0; worker < config->nworkers; worker++)
+	{
+		struct  worker_s *workerarg = &config->workers[worker];
+		
+		workerarg->jobq = func_init_queue();
+
+		/* warning : in case there are multiple workers on the same
+                   queue, we overwrite this value so that it is meaningless
+		 */
+		workerarg->jobq->arch = workerarg->perf_arch;
+
+		switch (workerarg->arch) {
+			case CORE_WORKER:
+				workerarg->jobq->who |= CORE;
+				workerarg->jobq->alpha = CORE_ALPHA;
+				break;
+			case CUDA_WORKER:
+				workerarg->jobq->who |= CUDA|CUBLAS;
+				workerarg->jobq->alpha = CUDA_ALPHA;
+				break;
+			case GORDON_WORKER:
+				workerarg->jobq->who |= GORDON;
+				workerarg->jobq->alpha = GORDON_ALPHA;
+				break;
+			default:
+				/* unknown worker type: configuration bug */
+				STARPU_ASSERT(0);
+		}
+		
+		memory_node_attach_queue(workerarg->jobq, workerarg->memory_node);
+	}
+}
+
+/* this may return NULL for an "anonymous thread" */
+/* this may return NULL for an "anonymous thread" */
+/* Fetch the queue bound to the calling thread via the policy's
+ * thread-specific key (see set_local_queue). */
+struct jobq_s *get_local_queue(void)
+{
+	struct sched_policy_s *policy = get_sched_policy();
+
+	return pthread_getspecific(policy->local_queue_key);
+}
+
+/* XXX how to retrieve policy ? that may be given in the machine config ? */
+/* Bind `jobq' to the calling thread via the policy's thread-specific
+ * key, so that get_local_queue can retrieve it later. */
+void set_local_queue(struct jobq_s *jobq)
+{
+	struct sched_policy_s *policy = get_sched_policy();
+
+	pthread_setspecific(policy->local_queue_key, jobq);
+}

+ 72 - 0
src/core/mechanisms/queues.h

@@ -0,0 +1,72 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#ifndef __QUEUES_H__
#define __QUEUES_H__

#include <pthread.h>

#include <core/jobs.h>
#include <core/policies/sched_policy.h>

/* NOTE(review): using a forward-declared (incomplete) enum as a struct
 * member below is not valid ISO C; this presumably relies on the full
 * definition being visible through the includes above -- confirm. */
enum starpu_perf_archtype;

/* Generic job queue: the actual container and its manipulation methods are
 * supplied by the selected queueing mechanism (fifo, deque, stack, ...). */
struct jobq_s {
	/* a pointer to some queue structure */
	void *queue; 

	/* some methods to manipulate the previous queue */
	int (*push_task)(struct jobq_s *, job_t);
	int (*push_prio_task)(struct jobq_s *, job_t);
	struct job_s* (*pop_task)(struct jobq_s *);

	/* returns the number of tasks that were retrieved 
 	 * the function is responsible for allocating the output but the driver
 	 * has to free it 
 	 *
 	 * NB : this function is non blocking
 	 * */
	struct job_list_s *(*pop_every_task)(struct jobq_s *);

	/* what are the driver that may pop job from that queue ? */
	uint32_t who;

	/* this is only relevant if there is a single worker per queue */
	uint32_t memory_node;
	enum starpu_perf_archtype arch;
	float alpha;

	/* for performance analysis purpose */
	double total_computation_time;
	double total_communication_time;

	/* in case workers are blocked on the queue, signaling on that 
	  condition must unblock them, even if there is no available task */
	pthread_cond_t activity_cond;
	pthread_mutex_t activity_mutex;
};

struct machine_config_s;

/* create a queue for every worker and attach it to its memory node */
void setup_queues(void (*init_queue_design)(void),
                  struct jobq_s *(*func_init_queue)(void),
                  struct machine_config_s *config);

/* per-thread queue accessors (may return NULL for an anonymous thread) */
struct jobq_s *get_local_queue(void);
void set_local_queue(struct jobq_s *jobq);


#endif // __QUEUES_H__

+ 228 - 0
src/core/mechanisms/stack_queues.c

@@ -0,0 +1,228 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <core/mechanisms/stack_queues.h>
+#include <errno.h>
+
+/* keep track of the total number of jobs to be scheduled to avoid infinite 
+ * polling when there are really few jobs in the overall queue */
+static unsigned total_number_of_jobs;
+
+static pthread_cond_t *sched_cond;
+static pthread_mutex_t *sched_mutex;
+
/* One-time global initialization of the stack mechanism: reset the global
 * job counter and cache the policy-wide activity condition/mutex. */
void init_stack_queues_mechanisms(void)
{
	total_number_of_jobs = 0;

	struct sched_policy_s *sched = get_sched_policy();

	/* to access them more easily, we keep their address in local variables */
	sched_cond = &sched->sched_activity_cond;
	sched_mutex = &sched->sched_activity_mutex;
}
+
+struct jobq_s *create_stack(void)
+{
+	struct jobq_s *jobq;
+	jobq = malloc(sizeof(struct jobq_s));
+
+	struct stack_jobq_s *stack;
+	stack = malloc(sizeof(struct stack_jobq_s));
+
+	pthread_mutex_init(&jobq->activity_mutex, NULL);
+	pthread_cond_init(&jobq->activity_cond, NULL);
+
+	/* note that not all mechanisms (eg. the semaphore) have to be used */
+	stack->jobq = job_list_new();
+	stack->njobs = 0;
+	stack->nprocessed = 0;
+
+	stack->exp_start = timing_now()/1000000;
+	stack->exp_len = 0.0;
+	stack->exp_end = stack->exp_start;
+
+	jobq->queue = stack;
+
+	return jobq;
+}
+
/* Number of jobs currently queued over ALL stack queues (read without the
 * scheduler mutex: the value is only a hint to avoid useless polling). */
unsigned get_total_njobs_stacks(void)
{
	return total_number_of_jobs;
}
+
/* Number of jobs currently sitting in this particular stack queue. */
unsigned get_stack_njobs(struct jobq_s *q)
{
	STARPU_ASSERT(q);

	struct stack_jobq_s *stack_queue = q->queue;

	return stack_queue->njobs;
}
+
/* Total number of jobs that were ever pushed onto this stack queue. */
unsigned get_stack_nprocessed(struct jobq_s *q)
{
	STARPU_ASSERT(q);

	struct stack_jobq_s *stack_queue = q->queue;

	return stack_queue->nprocessed;
}
+
+void stack_push_prio_task(struct jobq_s *q, job_t task)
+{
+#ifndef NO_PRIO
+	STARPU_ASSERT(q);
+	struct stack_jobq_s *stack_queue = q->queue;
+
+	/* if anyone is blocked on the entire machine, wake it up */
+	pthread_mutex_lock(sched_mutex);
+	total_number_of_jobs++;
+	pthread_cond_signal(sched_cond);
+	pthread_mutex_unlock(sched_mutex);
+
+	/* wake people waiting locally */
+	pthread_mutex_lock(&q->activity_mutex);
+
+	TRACE_JOB_PUSH(task, 0);
+	job_list_push_back(stack_queue->jobq, task);
+	deque_queue->njobs++;
+	deque_queue->nprocessed++;
+
+	pthread_cond_signal(&q->activity_cond);
+	pthread_mutex_unlock(&q->activity_mutex);
+#else
+	stack_push_task(q, task);
+#endif
+}
+
+void stack_push_task(struct jobq_s *q, job_t task)
+{
+	STARPU_ASSERT(q);
+	struct stack_jobq_s *stack_queue = q->queue;
+
+	/* if anyone is blocked on the entire machine, wake it up */
+	pthread_mutex_lock(sched_mutex);
+	total_number_of_jobs++;
+	pthread_cond_signal(sched_cond);
+	pthread_mutex_unlock(sched_mutex);
+
+	/* wake people waiting locally */
+	pthread_mutex_lock(&q->activity_mutex);
+
+	TRACE_JOB_PUSH(task, 0);
+	job_list_push_front(stack_queue->jobq, task);
+	deque_queue->njobs++;
+	deque_queue->nprocessed++;
+
+	pthread_cond_signal(&q->activity_cond);
+	pthread_mutex_unlock(&q->activity_mutex);
+}
+
/* Pop a task from the back of the list, waiting once on the activity
 * condition if the queue is empty.
 * NOTE(review): the wait is guarded by "if" rather than "while", so a
 * spurious or unrelated wakeup makes this return NULL even though the
 * function looks blocking -- callers must handle a NULL result. */
job_t stack_pop_task(struct jobq_s *q)
{
	job_t j = NULL;

	STARPU_ASSERT(q);
	struct stack_jobq_s *stack_queue = q->queue;

	/* block until some task is available in that queue */
	pthread_mutex_lock(&q->activity_mutex);

	if (stack_queue->njobs == 0)
		pthread_cond_wait(&q->activity_cond, &q->activity_mutex);

	if (stack_queue->njobs > 0) 
	{
		/* there is a task */
		j = job_list_pop_back(stack_queue->jobq);
	
		STARPU_ASSERT(j);
		stack_queue->njobs--;
		
		TRACE_JOB_POP(j, 0);

		/* we are sure that we got it now, so at worst, some people thought 
		 * there remained some work and will soon discover it is not true */
		pthread_mutex_lock(sched_mutex);
		total_number_of_jobs--;
		pthread_mutex_unlock(sched_mutex);
	}
	
	pthread_mutex_unlock(&q->activity_mutex);

	return j;

}
+
/* for work stealing, typically */
/* Non-blocking pop: take a task from the back of the list if one is
 * immediately available, otherwise return NULL without waiting. */
job_t stack_non_blocking_pop_task(struct jobq_s *q)
{
	job_t j = NULL;

	STARPU_ASSERT(q);
	struct stack_jobq_s *stack_queue = q->queue;

	/* grab the queue lock, but do NOT wait if the queue is empty */
	pthread_mutex_lock(&q->activity_mutex);

	if (stack_queue->njobs > 0) 
	{
		/* there is a task */
		j = job_list_pop_back(stack_queue->jobq);
	
		STARPU_ASSERT(j);
		stack_queue->njobs--;
		
		TRACE_JOB_POP(j, 0);

		/* we are sure that we got it now, so at worst, some people thought 
		 * there remained some work and will soon discover it is not true */
		pthread_mutex_lock(sched_mutex);
		total_number_of_jobs--;
		pthread_mutex_unlock(sched_mutex);
	}
	
	pthread_mutex_unlock(&q->activity_mutex);

	return j;
}
+
/* Non-blocking pop with a global fallback: if this queue is empty AND the
 * whole system has no job, sleep on the policy-wide condition instead of
 * busy polling.  May still return NULL after waking up. */
job_t stack_non_blocking_pop_task_if_job_exists(struct jobq_s *q)
{
	job_t j;

	j = stack_non_blocking_pop_task(q);

	if (!j) {
		/* there is no job at all in the entire system : go to sleep ! */

		/* that wait is not an absolute sign that there is some work 
		 * if there is some, the thread should be awoken, but if there is none 
		 * at the moment it is awoken, it may simply poll a limited number of 
		 * times and just get back to sleep */
		pthread_mutex_lock(sched_mutex);

		if (total_number_of_jobs == 0)
			pthread_cond_wait(sched_cond, sched_mutex);

		pthread_mutex_unlock(sched_mutex);
	}

	return j;
}

+ 55 - 0
src/core/mechanisms/stack_queues.h

@@ -0,0 +1,55 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#ifndef __STACK_QUEUES_H__
#define __STACK_QUEUES_H__

#include <core/mechanisms/queues.h>

/* Concrete container stored behind jobq_s->queue for the stack mechanism. */
struct stack_jobq_s {
	/* the actual list */
	job_list_t jobq;

	/* the number of tasks currently in the queue */
	unsigned njobs;

	/* the number of tasks that were processed */
	unsigned nprocessed;

	/* only meaningful if the queue is only used by a single worker */
	double exp_start;
	double exp_end;
	double exp_len;
};

/* allocate a new (empty) stack-backed queue */
struct jobq_s *create_stack(void);

/* push to the front of the list */
void stack_push_task(struct jobq_s *q, job_t task);

/* push to the back of the list (served first by stack_pop_task) */
void stack_push_prio_task(struct jobq_s *q, job_t task);

/* pop from the back; the non-blocking variants return NULL when empty */
job_t stack_pop_task(struct jobq_s *q);
job_t stack_non_blocking_pop_task(struct jobq_s *q);
job_t stack_non_blocking_pop_task_if_job_exists(struct jobq_s *q);

/* one-time global initialization of the mechanism */
void init_stack_queues_mechanisms(void);


unsigned get_stack_njobs(struct jobq_s *q);
unsigned get_stack_nprocessed(struct jobq_s *q);


#endif // __STACK_QUEUES_H__

+ 137 - 0
src/core/perfmodel/perfmodel.c

@@ -0,0 +1,137 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <unistd.h>
+#include <core/perfmodel/perfmodel.h>
+#include <core/jobs.h>
+#include <core/workers.h>
+#include <datawizard/datawizard.h>
+
+/*
+ * PER ARCH model
+ */
+
/* PER_ARCH model: predicted length of job j on the given architecture, or
 * -1.0 when no per-arch cost model was supplied.  On first use the model is
 * registered and the CALIBRATE environment variable selects benchmarking. */
static double per_arch_job_expected_length(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct job_s *j)
{
	double exp = -1.0;
	double (*per_arch_cost_model)(struct starpu_buffer_descr_t *);
	
	if (!model->is_loaded)
	{
		/* NOTE(review): this check-then-set is not protected by
		 * model->model_mutex (unlike load_history_based_model), so
		 * concurrent first uses could register the model twice */
		if (starpu_get_env_number("CALIBRATE") != -1)
		{
			fprintf(stderr, "CALIBRATE model %s\n", model->symbol);
			model->benchmarking = 1;
		}
		else {
			model->benchmarking = 0;
		}
		
		register_model(model);
		model->is_loaded = 1;
	}

	per_arch_cost_model = model->per_arch[arch].cost_model;

	if (per_arch_cost_model)
		exp = per_arch_cost_model(j->task->buffers);

	return exp;
}
+
+/*
+ * Common model
+ */
+
/* COMMON model: one architecture-independent cost model, scaled down by the
 * relative speed factor (alpha) of the worker kind that would run the job.
 * Returns -1.0 when no cost model was supplied. */
static double common_job_expected_length(struct starpu_perfmodel_t *model, uint32_t who, struct job_s *j)
{
	double exp;

	if (model->cost_model) {
		float alpha;
		exp = model->cost_model(j->task->buffers);
		switch (who) {
			case CORE:
				alpha = CORE_ALPHA;
				break;
			case CUDA:
				alpha = CUDA_ALPHA;
				break;
			default:
				/* perhaps there are various worker types on that queue */
				alpha = 1.0; // this value is not significant ...
				break;
		}

		STARPU_ASSERT(alpha != 0.0f);

		return (exp/alpha);
	}

	return -1.0;
}
+
/* Dispatch to the proper prediction routine for the model type attached to
 * the job's codelet.  Returns -1.0 when the chosen model cannot predict,
 * and 0.0 when the codelet has no model at all. */
double job_expected_length(uint32_t who, struct job_s *j, enum starpu_perf_archtype arch)
{
	struct starpu_perfmodel_t *model = j->task->cl->model;

	if (model) {
		switch (model->type) {
			case PER_ARCH:
				return per_arch_job_expected_length(model, arch, j);

			case COMMON:
				return common_job_expected_length(model, who, j);

			case HISTORY_BASED:
				return history_based_job_expected_length(model, arch, j);

			case REGRESSION_BASED:
				return regression_based_job_expected_length(model, arch, j);

			default:
				/* unknown model type */
				STARPU_ASSERT(0);
		};
	}

	/* no model was found */
	return 0.0;
}
+
+
/* Data transfer performance modeling */
/* Estimate the data-transfer penalty of running job j on queue q: a fixed
 * placeholder cost is charged for every buffer that is neither present nor
 * already requested on the queue's memory node. */
double data_expected_penalty(struct jobq_s *q, struct job_s *j)
{
	uint32_t memory_node = q->memory_node;
	unsigned nbuffers = j->task->cl->nbuffers;
	unsigned buffer;

	double penalty = 0.0;

	for (buffer = 0; buffer < nbuffers; buffer++)
	{
		data_state *state = j->task->buffers[buffer].state;

		if (!is_data_present_or_requested(state, memory_node))
		{
			/* TODO: replace this arbitrary constant with a real
			 * transfer-time estimate */
			penalty += 1000.0;
		}
	}

	return penalty;
}
+

+ 95 - 0
src/core/perfmodel/perfmodel.h

@@ -0,0 +1,95 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __PERFMODEL_H__
+#define __PERFMODEL_H__
+
+#include <common/config.h>
+#include <starpu-perfmodel.h>
+//#include <core/jobs.h>
+#include <common/htable32.h>
+//#include <core/workers.h>
+#include <starpu-mutex.h>
+#include <stdio.h>
+
+struct starpu_buffer_descr_t;
+struct jobq_s;
+struct job_s;
+enum archtype;
+enum starpu_perf_archtype;
+
+struct starpu_history_entry_t {
+	//double measured;
+	
+	/* mean_n = 1/n sum */
+	double mean;
+
+	/* n dev_n = sum2 - 1/n (sum)^2 */
+	double deviation;
+
+	/* sum of samples */
+	double sum;
+
+	/* sum of samples^2 */
+	double sum2;
+
+//	/* sum of ln(measured) */
+//	double sumlny;
+//
+//	/* sum of ln(size) */
+//	double sumlnx;
+//	double sumlnx2;
+//
+//	/* sum of ln(size) ln(measured) */
+//	double sumlnxlny;
+//
+	unsigned nsample;
+
+	uint32_t footprint;
+	size_t size; /* in bytes */
+};
+
+struct starpu_history_list_t {
+	struct starpu_history_list_t *next;
+	struct starpu_history_entry_t *entry;
+};
+
+struct starpu_model_list_t {
+	struct starpu_model_list_t *next;
+	struct starpu_perfmodel_t *model;
+};
+
+//
+///* File format */
+//struct model_file_format {
+//	unsigned ncore_entries;
+//	unsigned ncuda_entries;
+//	/* contains core entries, then cuda ones */
+//	struct starpu_history_entry_t entries[];
+//}
+
+double history_based_job_expected_length(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct job_s *j);
+void register_model(struct starpu_perfmodel_t *model);
+void dump_registered_models(void);
+
+double job_expected_length(uint32_t who, struct job_s *j, enum starpu_perf_archtype arch);
+double regression_based_job_expected_length(struct starpu_perfmodel_t *model,
+		uint32_t who, struct job_s *j);
+void update_perfmodel_history(struct job_s *j, enum starpu_perf_archtype arch, double measured);
+
+double data_expected_penalty(struct jobq_s *q, struct job_s *j);
+
+#endif // __PERFMODEL_H__

+ 513 - 0
src/core/perfmodel/perfmodel_history.c

@@ -0,0 +1,513 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <unistd.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <errno.h>
+#include <core/perfmodel/perfmodel.h>
+#include <core/jobs.h>
+#include <core/workers.h>
+#include <starpu-mutex.h>
+#include <datawizard/datawizard.h>
+#include <core/perfmodel/regression.h>
+#include <common/config.h>
+
+/*
+ * History based model
+ */
+
+
/* Record a new history entry both in the per-arch list (for dumping to
 * disk) and in the per-arch hash table (for fast lookup by footprint).
 * Caller must hold the model mutex. */
static void insert_history_entry(struct starpu_history_entry_t *entry, struct starpu_history_list_t **list, struct starpu_htbl32_node_s **history_ptr)
{
	struct starpu_history_list_t *link;
	struct starpu_history_entry_t *old;

	link = malloc(sizeof(struct starpu_history_list_t));
	link->next = *list;
	link->entry = entry;
	*list = link;

	old = htbl_insert_32(history_ptr, entry->footprint, entry);
	/* that may fail in case there is some concurrency issue */
	STARPU_ASSERT(old == NULL);
}
+
+
/* Serialize the linear-regression accumulators on one line of the model
 * file (must stay in sync with scan_reg_model below). */
static void dump_reg_model(FILE *f, struct starpu_regression_model_t *reg_model)
{
	fprintf(f, "%le\t%le\t%le\t%le\t%le\t%le\t%d\n", reg_model->sumlnx, reg_model->sumlnx2, reg_model->sumlny, reg_model->sumlnxlny, reg_model->alpha, reg_model->beta, reg_model->nsample);
}
+
/* Parse one regression line written by dump_reg_model.
 * NOTE(review): nsample is read with %d -- confirm its declared type in
 * regression.h is int (the history entries use unsigned). */
static void scan_reg_model(FILE *f, struct starpu_regression_model_t *reg_model)
{
	int res;

	res = fscanf(f, "%le\t%le\t%le\t%le\t%le\t%le\t%d\n", &reg_model->sumlnx, &reg_model->sumlnx2, &reg_model->sumlny, &reg_model->sumlnxlny, &reg_model->alpha, &reg_model->beta, &reg_model->nsample);
	STARPU_ASSERT(res == 7);
}
+
+
+static void dump_history_entry(FILE *f, struct starpu_history_entry_t *entry)
+{
+	fprintf(f, "%x\t%zu\t%le\t%le\t%le\t%le\t%d\n", entry->footprint, entry->size, entry->mean, entry->deviation, entry->sum, entry->sum2, entry->nsample);
+}
+
+static void scan_history_entry(FILE *f, struct starpu_history_entry_t *entry)
+{
+	int res;
+
+	res = fscanf(f, "%x\t%zu\t%le\t%le\t%le\t%le\t%d\n", &entry->footprint, &entry->size, &entry->mean, &entry->deviation, &entry->sum, &entry->sum2, &entry->nsample);
+	STARPU_ASSERT(res == 7);
+}
+
+static void parse_per_arch_model_file(FILE *f, struct starpu_per_arch_perfmodel_t *per_arch_model, unsigned scan_history)
+{
+	unsigned nentries;
+
+	int res = fscanf(f, "%d\n", &nentries);
+	STARPU_ASSERT(res == 1);
+
+	scan_reg_model(f, &per_arch_model->regression);
+
+	res = fscanf(f, "%le\t%le\t%le\n", 
+		&per_arch_model->regression.a,
+		&per_arch_model->regression.b,
+		&per_arch_model->regression.c);
+	STARPU_ASSERT(res == 3);
+
+	if (isnan(per_arch_model->regression.a)||isnan(per_arch_model->regression.b)||isnan(per_arch_model->regression.c))
+	{
+		per_arch_model->regression.valid = 0;
+	}
+	else {
+		per_arch_model->regression.valid = 1;
+	}
+
+	if (!scan_history)
+		return;
+
+	/* parse core entries */
+	unsigned i;
+	for (i = 0; i < nentries; i++) {
+		struct starpu_history_entry_t *entry = malloc(sizeof(struct starpu_history_entry_t));
+		STARPU_ASSERT(entry);
+
+		scan_history_entry(f, entry);
+		
+		/* insert the entry in the hashtable and the list structures  */
+		insert_history_entry(entry, &per_arch_model->list, &per_arch_model->history);
+	}
+}
+
/* A model file contains one section per default architecture: core first,
 * then cuda (order must match dump_model_file). */
static void parse_model_file(FILE *f, struct starpu_perfmodel_t *model, unsigned scan_history)
{
	parse_per_arch_model_file(f, &model->per_arch[STARPU_CORE_DEFAULT], scan_history);
	parse_per_arch_model_file(f, &model->per_arch[STARPU_CUDA_DEFAULT], scan_history);
}
+
+static void dump_per_arch_model_file(FILE *f, struct starpu_per_arch_perfmodel_t *per_arch_model)
+{
+	/* count the number of elements in the lists */
+	struct starpu_history_list_t *ptr;
+	unsigned nentries = 0;
+
+	ptr = per_arch_model->list;
+	while(ptr) {
+		nentries++;
+		ptr = ptr->next;
+	}
+
+	/* header */
+	fprintf(f, "%d\n", nentries);
+
+	dump_reg_model(f, &per_arch_model->regression);
+
+	double a,b,c;
+	regression_non_linear_power(per_arch_model->list, &a, &b, &c);
+	fprintf(f, "%le\t%le\t%le\n", a, b, c);
+
+	ptr = per_arch_model->list;
+	while (ptr) {
+		//memcpy(&entries_array[i++], ptr->entry, sizeof(struct starpu_history_entry_t));
+		dump_history_entry(f, ptr->entry);
+		ptr = ptr->next;
+	}
+}
+
/* Dump both default architectures, in the same order parse_model_file
 * expects them back: core first, then cuda. */
static void dump_model_file(FILE *f, struct starpu_perfmodel_t *model)
{
	dump_per_arch_model_file(f, &model->per_arch[STARPU_CORE_DEFAULT]);
	dump_per_arch_model_file(f, &model->per_arch[STARPU_CUDA_DEFAULT]);
}
+
/* Empty per-arch model: no hash table, no history list yet. */
static void initialize_per_arch_model(struct starpu_per_arch_perfmodel_t *per_arch_model)
{
	per_arch_model->history = NULL;
	per_arch_model->list = NULL;
}
+
/* Fresh model with no on-disk data: initialize both default arch slots. */
static void initialize_model(struct starpu_perfmodel_t *model)
{
	initialize_per_arch_model(&model->per_arch[STARPU_CORE_DEFAULT]);
	initialize_per_arch_model(&model->per_arch[STARPU_CUDA_DEFAULT]);
}
+
+static struct starpu_model_list_t *registered_models = NULL;
+//static unsigned debug_modelid = 0;
+
#ifdef MODEL_DEBUG
/* Build "$PERF_MODEL_DIR<symbol>.<hostname>.<arch>.debug" into path.
 * bug fix: the original strncat calls passed the TOTAL buffer size as the
 * limit instead of the remaining space, which can overflow path; snprintf
 * bounds (and NUL-terminates) the whole string at once. */
static void get_model_debug_path(struct starpu_perfmodel_t *model, const char *arch, char *path, size_t maxlen)
{
	char hostname[32];
	gethostname(hostname, sizeof(hostname));
	/* gethostname does not guarantee NUL-termination on truncation */
	hostname[sizeof(hostname) - 1] = '\0';

	snprintf(path, maxlen, "%s%s.%s.%s.debug",
		 PERF_MODEL_DIR, model->symbol, hostname, arch);
}
#endif
+
+
/* Remember a model in the global registered_models list so it can be dumped
 * back to disk at termination (dump_registered_models); with MODEL_DEBUG,
 * also open the per-arch debug trace files.
 * NOTE(review): registered_models is updated without a lock -- callers
 * currently serialize through model->model_mutex; confirm. */
void register_model(struct starpu_perfmodel_t *model)
{
	/* add the model to a linked list */
	struct starpu_model_list_t *node = malloc(sizeof(struct starpu_model_list_t));

	node->model = model;
	//model->debug_modelid = debug_modelid++;

	/* put this model at the beginning of the list */
	node->next = registered_models;
	registered_models = node;

#ifdef MODEL_DEBUG
	char debugpath[256];
	get_model_debug_path(model, "cuda", debugpath, 256);
	model->per_arch[STARPU_CUDA_DEFAULT].debug_file = fopen(debugpath, "a+");
	STARPU_ASSERT(model->per_arch[STARPU_CUDA_DEFAULT].debug_file);

	get_model_debug_path(model, "core", debugpath, 256);
	model->per_arch[STARPU_CORE_DEFAULT].debug_file = fopen(debugpath, "a+");
	STARPU_ASSERT(model->per_arch[STARPU_CORE_DEFAULT].debug_file);
#endif

	return;
}
+
+static void get_model_path(struct starpu_perfmodel_t *model, char *path, size_t maxlen)
+{
+	strncpy(path, PERF_MODEL_DIR, maxlen);
+	strncat(path, model->symbol, maxlen);
+	
+	char hostname[32];
+	gethostname(hostname, 32);
+	strncat(path, ".", maxlen);
+	strncat(path, hostname, maxlen);
+}
+
+void save_history_based_model(struct starpu_perfmodel_t *model)
+{
+	STARPU_ASSERT(model);
+	STARPU_ASSERT(model->symbol);
+
+	/* TODO checks */
+
+	/* filename = $PERF_MODEL_DIR/symbol.hostname */
+	char path[256];
+	get_model_path(model, path, 256);
+
+#ifdef VERBOSE
+	fprintf(stderr, "Opening performance model file %s for model %s\n", path, model->symbol);
+#endif
+
+	/* overwrite existing file, or create it */
+	FILE *f;
+	f = fopen(path, "w+");
+	STARPU_ASSERT(f);
+
+	dump_model_file(f, model);
+
+	fclose(f);
+
+#ifdef DEBUG_MODEL
+	fclose(model->cuda_debug_file);
+	fclose(model->core_debug_file);
+#endif
+}
+
+void dump_registered_models(void)
+{
+	struct starpu_model_list_t *node;
+	node = registered_models;
+
+#ifdef VERBOSE
+	fprintf(stderr, "DUMP MODELS !\n");
+#endif
+
+	while (node) {
+		save_history_based_model(node->model);		
+		node = node->next;
+
+		/* XXX free node */
+	}
+}
+
/* set once the sampling directory is known to exist (not thread-safe by
 * itself; guarded by model->model_mutex in load_history_based_model) */
static int directory_existence_was_tested = 0;

/* Create $PERF_MODEL_DIR (owner-only permissions) unless it already exists. */
static void create_sampling_directory_if_needed(void)
{
	/* Testing if a directory exists and creating it otherwise 
	   may not be safe: it is possible that the permission are
	   changed in between. Instead, we create it and check if
	   it already existed before */
	int ret;
	ret = mkdir(PERF_MODEL_DIR, S_IRWXU);
	if (ret == -1)
	{
		/* any failure other than "already exists" is fatal */
		STARPU_ASSERT(errno == EEXIST);

		/* make sure that it is actually a directory */
		struct stat sb;
		stat(PERF_MODEL_DIR, &sb);
		STARPU_ASSERT(S_ISDIR(sb.st_mode));
	}
}
+
/* Load a model from disk (or initialize an empty one), register it for the
 * termination dump, and decide whether we are calibrating.  Serialized on
 * model->model_mutex; safe to call concurrently, only the first caller does
 * the actual work.  scan_history selects whether the individual history
 * entries are parsed (HISTORY_BASED) or only the regression coefficients
 * (REGRESSION_BASED). */
void load_history_based_model(struct starpu_perfmodel_t *model, unsigned scan_history)
{
	STARPU_ASSERT(model);
	STARPU_ASSERT(model->symbol);

	/* XXX we assume the lock is implicitely initialized (taken = 0) */
	//init_mutex(&model->model_mutex);
	take_mutex(&model->model_mutex);

	/* perhaps some other thread got in before ... */
	if (!model->is_loaded)
	{
		/* make sure the performance model directory exists (or create it) */
		if (!directory_existence_was_tested)
		{
			create_sampling_directory_if_needed();
			directory_existence_was_tested = 1;
		}

		/*
		 * We need to keep track of all the model that were opened so that we can 
		 * possibly update them at runtime termination ...
		 */
		register_model(model);
	
		char path[256];
		get_model_path(model, path, 256);
	
#ifdef VERBOSE
		fprintf(stderr, "Opening performance model file %s for model %s\n", path, model->symbol);
#endif
	
		/* try to open an existing file and load it */
		int res;
		res = access(path, F_OK); 
		if (res == 0) {
		//	fprintf(stderr, "File exists !\n");
	
			FILE *f;
			f = fopen(path, "r");
			STARPU_ASSERT(f);
	
			parse_model_file(f, model, scan_history);
	
			fclose(f);
		}
		else {
			//fprintf(stderr, "File does not exists !\n");
			initialize_model(model);
		}
	
	
		/* CALIBRATE in the environment means we are benchmarking */
		if (starpu_get_env_number("CALIBRATE") != -1)
		{
			fprintf(stderr, "CALIBRATE model %s\n", model->symbol);
			model->benchmarking = 1;
		}
		else {
			model->benchmarking = 0;
		}
	
		model->is_loaded = 1;
	}

	release_mutex(&model->model_mutex);
}
+
/* REGRESSION_BASED model: predict the length of job j from the non-linear
 * power law a*size^b + c fitted for the given architecture; returns -1.0
 * when the coefficients are not (yet) valid. */
double regression_based_job_expected_length(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct job_s *j)
{
	double exp = -1.0;
	size_t size = job_get_data_size(j);
	struct starpu_regression_model_t *regmodel;

	/* lazy load: no need for the individual history entries here */
	if (!model->is_loaded)
		load_history_based_model(model, 0);

	regmodel = &model->per_arch[arch].regression;

	if (regmodel->valid)
		exp = regmodel->a*pow(size, regmodel->b) + regmodel->c;

	return exp;
}
+
/* HISTORY_BASED model: look up the mean measured time of past executions
 * with the same buffer footprint; returns -1.0 when no history is
 * available for that footprint yet. */
double history_based_job_expected_length(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct job_s *j)
{
	double exp;
	struct starpu_per_arch_perfmodel_t *per_arch_model;
	struct starpu_history_entry_t *entry;
	struct starpu_htbl32_node_s *history;

	/* lazy load, including the per-footprint history entries */
	if (!model->is_loaded)
		load_history_based_model(model, 1);

	if (!j->footprint_is_computed)
		compute_buffers_footprint(j);
		
	uint32_t key = j->footprint;

	per_arch_model = &model->per_arch[arch];

	history = per_arch_model->history;
	if (!history)
		return -1.0;

	/* the hash table is mutated by update_perfmodel_history: lock it */
	take_mutex(&model->model_mutex);
	entry = htbl_search_32(history, key);
	release_mutex(&model->model_mutex);

	exp = entry?entry->mean:-1.0;

	return exp;
}
+
/* Feed one measured execution time (for job j on the given architecture)
 * back into the job's model: update or create the history entry for the
 * job's footprint, and refresh the log-log linear-regression accumulators.
 * All state mutations happen under model->model_mutex. */
void update_perfmodel_history(job_t j, enum starpu_perf_archtype arch, double measured)
{
	struct starpu_perfmodel_t *model = j->task->cl->model;

	if (model)
	{
		struct starpu_per_arch_perfmodel_t *per_arch_model = &model->per_arch[arch];

		if (model->type == HISTORY_BASED || model->type == REGRESSION_BASED)
		{
			/* NOTE(review): assumes j->footprint was already computed
			 * (cf. compute_buffers_footprint) -- confirm callers */
			uint32_t key = j->footprint;
			struct starpu_history_entry_t *entry;

			struct starpu_htbl32_node_s *history;
			struct starpu_htbl32_node_s **history_ptr;
			struct starpu_regression_model_t *reg_model;

			struct starpu_history_list_t **list;


			history = per_arch_model->history;
			history_ptr = &per_arch_model->history;
			reg_model = &per_arch_model->regression;
			list = &per_arch_model->list;

			take_mutex(&model->model_mutex);
	
				entry = htbl_search_32(history, key);
	
				if (!entry)
				{
					/* this is the first entry with such a footprint */
					entry = malloc(sizeof(struct starpu_history_entry_t));
					STARPU_ASSERT(entry);
						entry->mean = measured;
						entry->sum = measured;
	
						entry->deviation = 0.0;
						entry->sum2 = measured*measured;
	
						entry->size = job_get_data_size(j);
	
						entry->footprint = key;
						entry->nsample = 1;
	
					insert_history_entry(entry, list, history_ptr);
	
				}
				else {
					/* there is already some entry with the same footprint */
					entry->sum += measured;
					entry->sum2 += measured*measured;
					entry->nsample++;
	
					/* running mean and (population) standard deviation */
					unsigned n = entry->nsample;
					entry->mean = entry->sum / n;
					entry->deviation = sqrt((entry->sum2 - (entry->sum*entry->sum)/n)/n);
				}
			
				STARPU_ASSERT(entry);
			
			/* update the regression model as well */
			double logy, logx;
			logx = logl(entry->size);
			logy = logl(measured);

			reg_model->sumlnx += logx;
			reg_model->sumlnx2 += logx*logx;
			reg_model->sumlny += logy;
			reg_model->sumlnxlny += logx*logy;
			reg_model->nsample++;

			unsigned n = reg_model->nsample;
			
			/* least-squares fit of ln y = ln alpha + beta ln x */
			double num = (n*reg_model->sumlnxlny - reg_model->sumlnx*reg_model->sumlny);
			double denom = (n*reg_model->sumlnx2 - reg_model->sumlnx*reg_model->sumlnx);

			reg_model->beta = num/denom;
			reg_model->alpha = expl((reg_model->sumlny - reg_model->beta*reg_model->sumlnx)/n);
			
			release_mutex(&model->model_mutex);
		}

#ifdef MODEL_DEBUG
		FILE * debug_file = per_arch_model->debug_file;

		take_mutex(&model->model_mutex);

		fprintf(debug_file, "%lf\t", measured);
		unsigned i;
			
		/* NOTE(review): uses j->nbuffers/j->buffers while the rest of the
		 * file goes through j->task -- confirm these fields exist */
		for (i = 0; i < j->nbuffers; i++)
		{
			data_state *state = j->buffers[i].state;

			STARPU_ASSERT(state->ops);
			STARPU_ASSERT(state->ops->display);
			state->ops->display(state, debug_file);
		}
		fprintf(debug_file, "\n");	


		release_mutex(&model->model_mutex);
#endif
	}
}

+ 225 - 0
src/core/perfmodel/regression.c

@@ -0,0 +1,225 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/perfmodel/regression.h>
+
+#define MAXREGITER	1000
+#define EPS 1.0e-10
+
+//#define MIN(a,b) ((a)<(b)?(a):(b))
+
+static double compute_b(double c, unsigned n, unsigned *x, double *y)
+{
+	double b;
+
+	/* X = log (x) , Y = log (y - c) */
+	double sumxy = 0.0;
+	double sumx = 0.0;
+	double sumx2 = 0.0;
+	double sumy = 0.0;
+
+	unsigned i;
+	for (i = 0; i < n; i++)
+	{
+		double xi = logl(x[i]);
+		double yi = logl(y[i]-c);
+
+		sumxy += xi*yi;
+		sumx += xi;
+		sumx2 += xi*xi;
+		sumy += yi;
+	}
+
+	b = (n * sumxy - sumx * sumy) / (n*sumx2 - sumx*sumx);
+
+	return b;
+}
+
+static double compute_a(double c, double b, unsigned n, unsigned *x, double *y)
+{
+	double a;
+
+	/* X = log (x) , Y = log (y - c) */
+	double sumx = 0.0;
+	double sumy = 0.0;
+
+	unsigned i;
+	for (i = 0; i < n; i++)
+	{
+		double xi = logl(x[i]);
+		double yi = logl(y[i]-c);
+
+		sumx += xi;
+		sumy += yi;
+	}
+
+	a = (sumy - b*sumx) / n;
+
+	return a;
+}
+
+
+
+/* returns r */
+static double test_r(double c, unsigned n, unsigned *x, double *y)
+{
+	double r;
+
+//	printf("test c = %e\n", c);
+
+	/* X = log (x) , Y = log (y - c) */
+	double sumxy = 0.0;
+	double sumx = 0.0;
+	double sumx2 = 0.0;
+	double sumy = 0.0;
+	double sumy2 = 0.0;
+
+	unsigned i;
+	for (i = 0; i < n; i++)
+	{
+		double xi = logl(x[i]);
+		double yi = logl(y[i]-c);
+
+	//	printf("Xi = %e, Yi = %e\n", xi, yi);
+
+		sumxy += xi*yi;
+		sumx += xi;
+		sumx2 += xi*xi;
+		sumy += yi;
+		sumy2 += yi*yi;
+	}
+
+	//printf("sumxy %e\n", sumxy);
+	//printf("sumx %e\n", sumx);
+	//printf("sumx2 %e\n", sumx2);
+	//printf("sumy %e\n", sumy);
+	//printf("sumy2 %e\n", sumy2);
+
+	r = (n * sumxy - sumx * sumy) / sqrt( (n* sumx2 - sumx*sumx) * (n*sumy2 - sumy*sumy) );
+
+	return r;
+}
+
+static unsigned find_list_size(struct starpu_history_list_t *list_history)
+{
+	unsigned cnt = 0;
+
+	struct starpu_history_list_t *ptr = list_history;
+	while (ptr) {
+		cnt++;
+		ptr = ptr->next;
+	}
+
+	return cnt;
+}
+
/* Return the smallest value in y[0..n-1]; for n == 0 the sentinel
 * 1.0e30 is returned unchanged. */
static double find_list_min(double *y, unsigned n)
{
	double best = 1.0e30;

	unsigned i;
	for (i = 0; i < n; i++)
	{
		if (y[i] < best)
			best = y[i];
	}

	return best;
}
+
+static void dump_list(unsigned *x, double *y, struct starpu_history_list_t *list_history)
+{
+	struct starpu_history_list_t *ptr = list_history;
+	unsigned i = 0;
+
+	while (ptr) {
+		x[i] = ptr->entry->size;
+		y[i] = ptr->entry->mean;
+
+		ptr = ptr->next;
+		i++;
+	}
+}
+
+
/* y = ax^b + c 
 * 	return 0 if success, -1 otherwise
 * 	if success, a, b and c are modified
 * */
/* Fit the non-linear model y = a*x^b + c to the (size, mean) samples of a
 * history list.  The constant c is found by bisection: for each candidate
 * c, test_r() measures how linear (log x, log(y-c)) is; the half-interval
 * whose probe gives the better correlation is kept.  Once c converges,
 * b and a follow from an ordinary log-log least-squares fit.
 * NOTE(review): currently always returns 0; callers cannot see a failure. */
int regression_non_linear_power(struct starpu_history_list_t *ptr, double *a, double *b, double *c)
{
	unsigned n = find_list_size(ptr);

	/* scratch arrays holding the flattened list */
	unsigned *x = malloc(n*sizeof(unsigned));
	STARPU_ASSERT(x);

	double *y = malloc(n*sizeof(double));
	STARPU_ASSERT(y);

	dump_list(x, y, ptr);

	/* c must lie in [0, min(y)): y[i] - c has to stay positive for the
	 * logarithms taken by test_r()/compute_b()/compute_a() */
	double cmin = 0.0;
	double cmax = find_list_min(y, n);
	
	unsigned iter;

	/* previous iteration's error; start far above any plausible value */
	double err = 100000.0;

	for (iter = 0; iter < MAXREGITER; iter++)
	{
		double c1, c2;
		double r1, r2;
		
		/* probe slightly below and above the interval midpoint */
		double radius = 0.01;

		c1 = cmin + (0.5-radius)*(cmax - cmin);
		c2 = cmin + (0.5+radius)*(cmax - cmin);

		r1 = test_r(c1, n, x, y);
		r2 = test_r(c2, n, x, y);

		/* distance of each correlation coefficient from the ideal 1.0 */
		double err1, err2;
		err1 = fabsl(1.0 - r1);
		err2 = fabsl(1.0 - r2);

		if (err1 < err2)
		{
			/* the lower probe fits better: keep the lower half */
			cmax = (cmin + cmax)/2;
		}
		else {
			/* 2 is better */
			cmin = (cmin + cmax)/2;
		}

		/* stop when the error no longer improves noticeably */
		if (fabsl(err - STARPU_MIN(err1, err2)) < EPS)
		{
			err = STARPU_MIN(err1, err2);
			break;
		}

		err = STARPU_MIN(err1, err2);
	}

	/* final c: midpoint of the converged interval */
	*c = (cmin + cmax)/2;

	/* with c fixed, b and a come from a linear fit in log-log space */
	*b = compute_b(*c, n, x, y); 
	*a = expl(compute_a(*c, *b, n, x, y));

	free(x);
	free(y);

	return 0;
}
+

+ 28 - 0
src/core/perfmodel/regression.h

@@ -0,0 +1,28 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#ifndef __REGRESSION_H__
#define __REGRESSION_H__

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <core/perfmodel/perfmodel.h>
#include <starpu.h>

/* Fit the model y = a*x^b + c to the (size, mean time) samples stored in
 * the history list "ptr".  On success the three output parameters are
 * updated (the current implementation always returns 0). */
int regression_non_linear_power(struct starpu_history_list_t *ptr, double *a, double *b, double *c);

#endif // __REGRESSION_H__ 

+ 226 - 0
src/core/policies/deque-modeling-policy-data-aware.c

@@ -0,0 +1,226 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/policies/deque-modeling-policy-data-aware.h>
+#include <core/perfmodel/perfmodel.h>
+
+static unsigned nworkers;
+static struct jobq_s *queue_array[NMAXWORKERS];
+
+static job_t dmda_pop_task(struct jobq_s *q)
+{
+	struct job_s *j;
+
+	j = fifo_pop_task(q);
+	if (j) {
+		struct fifo_jobq_s *fifo = q->queue;
+		double model = j->predicted;
+	
+		fifo->exp_len -= model;
+		fifo->exp_start = timing_now()/1000000 + model;
+		fifo->exp_end = fifo->exp_start + fifo->exp_len;
+	}	
+
+	return j;
+}
+
+static void update_data_requests(struct jobq_s *q, struct job_s *j)
+{
+	uint32_t memory_node = q->memory_node;
+	unsigned nbuffers = j->task->cl->nbuffers;
+	unsigned buffer;
+
+	for (buffer = 0; buffer < nbuffers; buffer++)
+	{
+		data_state *state = j->task->buffers[buffer].state;
+
+		set_data_requested_flag_if_needed(state, memory_node);
+	}
+}
+
+static int _dmda_push_task(struct jobq_s *q __attribute__ ((unused)) , job_t j, unsigned prio)
+{
+	/* find the queue */
+	struct fifo_jobq_s *fifo;
+	unsigned worker;
+	int best = -1;
+	
+	/* this flag is set if the corresponding worker is selected because
+	   there is no performance prediction available yet */
+	int forced_best = -1;
+
+	double local_task_length[nworkers];
+	double local_data_penalty[nworkers];
+	double exp_end[nworkers];
+
+	double fitness[nworkers];
+
+	double best_exp_end = 10e240;
+	double model_best = 0.0;
+	double penality_best = 0.0;
+
+	for (worker = 0; worker < nworkers; worker++)
+	{
+		fifo = queue_array[worker]->queue;
+
+		/* XXX */
+		fifo->exp_start = STARPU_MAX(fifo->exp_start, timing_now()/1000000);
+		fifo->exp_end = STARPU_MAX(fifo->exp_start, timing_now()/1000000);
+
+		if ((queue_array[worker]->who & j->task->cl->where) == 0)
+		{
+			/* no one on that queue may execute this task */
+			continue;
+		}
+
+		local_task_length[worker] = job_expected_length(queue_array[worker]->who,
+							j, queue_array[worker]->arch);
+
+		//local_data_penalty[worker] = 0;
+		local_data_penalty[worker] = data_expected_penalty(queue_array[worker], j);
+
+		if (local_task_length[worker] == -1.0)
+		{
+			forced_best = worker;
+			break;
+		}
+
+		exp_end[worker] = fifo->exp_start + fifo->exp_len + local_task_length[worker];
+
+		if (exp_end[worker] < best_exp_end)
+		{
+			/* a better solution was found */
+			best_exp_end = exp_end[worker];
+		}
+	}
+
+	double alpha = 1.0;
+	double beta = 1.0;
+
+	double best_fitness = -1;
+	
+	if (forced_best == -1)
+	{
+		for (worker = 0; worker < nworkers; worker++)
+		{
+			fifo = queue_array[worker]->queue;
+	
+			if ((queue_array[worker]->who & j->task->cl->where) == 0)
+			{
+				/* no one on that queue may execute this task */
+				continue;
+			}
+	
+			fitness[worker] = alpha*(exp_end[worker] - best_exp_end) 
+					+ beta*(local_data_penalty[worker]);
+
+			if (best == -1 || fitness[worker] < best_fitness)
+			{
+				/* we found a better solution */
+				best_fitness = fitness[worker];
+				best = worker;
+
+	//			fprintf(stderr, "best fitness (worker %d) %le = alpha*(%le) + beta(%le) \n", worker, best_fitness, exp_end[worker] - best_exp_end, local_data_penalty[worker]);
+			}
+		}
+	}
+
+	STARPU_ASSERT(forced_best != -1 || best != -1);
+	
+	if (forced_best != -1)
+	{
+		/* there is no prediction available for that task
+		 * with that arch we want to speed-up calibration time
+		 * so we force this measurement */
+		best = worker;
+		model_best = 0.0;
+		penality_best = 0.0;
+	}
+	else 
+	{
+		model_best = local_task_length[best];
+		penality_best = local_data_penalty[best];
+	}
+
+	/* we should now have the best worker in variable "best" */
+	fifo = queue_array[best]->queue;
+
+	fifo->exp_end += model_best;
+	fifo->exp_len += model_best;
+
+	j->predicted = model_best;
+	j->penality = penality_best;
+
+	update_data_requests(queue_array[best], j);
+
+	if (prio) {
+		return fifo_push_prio_task(queue_array[best], j);
+	} else {
+		return fifo_push_task(queue_array[best], j);
+	}
+}
+
+static int dmda_push_prio_task(struct jobq_s *q, job_t j)
+{
+	return _dmda_push_task(q, j, 1);
+}
+
+static int dmda_push_task(struct jobq_s *q, job_t j)
+{
+	if (j->task->priority == MAX_PRIO)
+		return _dmda_push_task(q, j, 1);
+
+	return _dmda_push_task(q, j, 0);
+}
+
+static struct jobq_s *init_dmda_fifo(void)
+{
+	struct jobq_s *q;
+
+	q = create_fifo();
+
+	q->push_task = dmda_push_task; 
+	q->push_prio_task = dmda_push_prio_task; 
+	q->pop_task = dmda_pop_task;
+	q->who = 0;
+
+	queue_array[nworkers++] = q;
+
+	return q;
+}
+
+void initialize_dmda_policy(struct machine_config_s *config, 
+ __attribute__ ((unused)) struct sched_policy_s *_policy) 
+{
+	nworkers = 0;
+
+	setup_queues(init_fifo_queues_mechanisms, init_dmda_fifo, config);
+}
+
+struct jobq_s *get_local_queue_dmda(struct sched_policy_s *policy __attribute__ ((unused)))
+{
+	struct jobq_s *queue;
+	queue = pthread_getspecific(policy->local_queue_key);
+
+	if (!queue)
+	{
+		/* take one randomly as this *must* be for a push anyway XXX */
+		queue = queue_array[0];
+	}
+
+	return queue;
+}
+

+ 29 - 0
src/core/policies/deque-modeling-policy-data-aware.h

@@ -0,0 +1,29 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#ifndef __DEQUE_MODELING_POLICY_DATA_AWARE_H__
#define __DEQUE_MODELING_POLICY_DATA_AWARE_H__

#include <core/workers.h>
#include <core/mechanisms/queues.h>
#include <core/mechanisms/fifo_queues.h>

/* Set up the "dmda" policy: performance-model driven scheduling that also
 * accounts for expected data-transfer penalties. */
void initialize_dmda_policy(struct machine_config_s *config,
 __attribute__ ((unused)) struct sched_policy_s *_policy);

/* Return the queue attached to the calling worker (queue 0 for threads
 * without one). */
struct jobq_s *get_local_queue_dmda(struct sched_policy_s *policy __attribute__ ((unused)));

#endif // __DEQUE_MODELING_POLICY_DATA_AWARE_H__

+ 161 - 0
src/core/policies/deque-modeling-policy.c

@@ -0,0 +1,161 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/policies/deque-modeling-policy.h>
+#include <core/perfmodel/perfmodel.h>
+
+static unsigned nworkers;
+static struct jobq_s *queue_array[NMAXWORKERS];
+
+static job_t dm_pop_task(struct jobq_s *q)
+{
+	struct job_s *j;
+
+	j = fifo_pop_task(q);
+	if (j) {
+		struct fifo_jobq_s *fifo = q->queue;
+		double model = j->predicted;
+	
+		fifo->exp_len -= model;
+		fifo->exp_start = timing_now()/1000000 + model;
+		fifo->exp_end = fifo->exp_start + fifo->exp_len;
+	}	
+
+	return j;
+}
+
+static int _dm_push_task(struct jobq_s *q __attribute__ ((unused)), job_t j, unsigned prio)
+{
+	/* find the queue */
+	struct fifo_jobq_s *fifo;
+	unsigned worker;
+	int best = -1;
+
+	double best_exp_end = 0.0;
+	double model_best = 0.0;
+
+	for (worker = 0; worker < nworkers; worker++)
+	{
+		double exp_end;
+		
+		fifo = queue_array[worker]->queue;
+
+		/* XXX */
+		fifo->exp_start = STARPU_MAX(fifo->exp_start, timing_now()/1000000);
+		fifo->exp_end = STARPU_MAX(fifo->exp_start, timing_now()/1000000);
+
+		if ((queue_array[worker]->who & j->task->cl->where) == 0)
+		{
+			/* no one on that queue may execute this task */
+			continue;
+		}
+
+		double local_length = job_expected_length(queue_array[worker]->who, j, queue_array[worker]->arch);
+
+		if (local_length == -1.0) 
+		{
+			/* there is no prediction available for that task
+			 * with that arch we want to speed-up calibration time 
+			 * so we force this measurement */
+			/* XXX assert we are benchmarking ! */
+			best = worker;
+			model_best = 0.0;
+			exp_end = fifo->exp_start + fifo->exp_len;
+			break;
+		}
+
+
+		exp_end = fifo->exp_start + fifo->exp_len + local_length;
+
+		if (best == -1 || exp_end < best_exp_end)
+		{
+			/* a better solution was found */
+			best_exp_end = exp_end;
+			best = worker;
+			model_best = local_length;
+		}
+	}
+
+	
+	/* make sure someone coule execute that task ! */
+	STARPU_ASSERT(best != -1);
+
+	/* we should now have the best worker in variable "best" */
+	fifo = queue_array[best]->queue;
+
+	fifo->exp_end += model_best;
+	fifo->exp_len += model_best;
+
+	j->predicted = model_best;
+
+	if (prio) {
+		return fifo_push_prio_task(queue_array[best], j);
+	} else {
+		return fifo_push_task(queue_array[best], j);
+	}
+}
+
+static int dm_push_prio_task(struct jobq_s *q, job_t j)
+{
+	return _dm_push_task(q, j, 1);
+}
+
+static int dm_push_task(struct jobq_s *q, job_t j)
+{
+	if (j->task->priority == MAX_PRIO)
+		return _dm_push_task(q, j, 1);
+
+	return _dm_push_task(q, j, 0);
+}
+
+static struct jobq_s *init_dm_fifo(void)
+{
+	struct jobq_s *q;
+
+	q = create_fifo();
+
+	q->push_task = dm_push_task; 
+	q->push_prio_task = dm_push_prio_task; 
+	q->pop_task = dm_pop_task;
+	q->who = 0;
+
+	queue_array[nworkers++] = q;
+
+	return q;
+}
+
+void initialize_dm_policy(struct machine_config_s *config, 
+ __attribute__ ((unused)) struct sched_policy_s *_policy) 
+{
+	nworkers = 0;
+
+	setup_queues(init_fifo_queues_mechanisms, init_dm_fifo, config);
+}
+
+struct jobq_s *get_local_queue_dm(struct sched_policy_s *policy __attribute__ ((unused)))
+{
+	struct jobq_s *queue;
+	queue = pthread_getspecific(policy->local_queue_key);
+
+	if (!queue)
+	{
+		/* take one randomly as this *must* be for a push anyway XXX */
+		queue = queue_array[0];
+	}
+
+	return queue;
+}
+

+ 29 - 0
src/core/policies/deque-modeling-policy.h

@@ -0,0 +1,29 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#ifndef __DEQUE_MODELING_POLICY_H__
#define __DEQUE_MODELING_POLICY_H__

#include <core/workers.h>
#include <core/mechanisms/queues.h>
#include <core/mechanisms/fifo_queues.h>

/* Set up the "dm" policy: schedule on the worker with the smallest
 * predicted completion time, using per-worker performance models. */
void initialize_dm_policy(struct machine_config_s *config,
 __attribute__ ((unused)) struct sched_policy_s *_policy);

/* Return the queue attached to the calling worker (queue 0 for threads
 * without one). */
struct jobq_s *get_local_queue_dm(struct sched_policy_s *policy __attribute__ ((unused)));

#endif // __DEQUE_MODELING_POLICY_H__

+ 58 - 0
src/core/policies/eager-central-policy.c

@@ -0,0 +1,58 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/policies/eager-central-policy.h>
+
+/*
+ *	This is just the trivial policy where every worker use the same
+ *	JOB QUEUE.
+ */
+
+/* the former is the actual queue, the latter some container */
+static struct jobq_s *jobq;
+
+static void init_central_queue_design(void)
+{
+	/* there is only a single queue in that trivial design */
+	jobq = create_fifo();
+
+	init_fifo_queues_mechanisms();
+
+	jobq->push_task = fifo_push_task;
+	jobq->push_prio_task = fifo_push_prio_task;
+	jobq->pop_task = fifo_pop_task;
+
+	jobq->pop_every_task = fifo_pop_every_task;
+}
+
+static struct jobq_s *func_init_central_queue(void)
+{
+	/* once again, this is trivial */
+	return jobq;
+}
+
+void initialize_eager_center_policy(struct machine_config_s *config, 
+	   __attribute__ ((unused)) struct sched_policy_s *_policy) 
+{
+	setup_queues(init_central_queue_design, func_init_central_queue, config);
+}
+
+struct jobq_s *get_local_queue_eager(struct sched_policy_s *policy 
+					__attribute__ ((unused)))
+{
+	/* this is trivial for that strategy :) */
+	return jobq;
+}

+ 27 - 0
src/core/policies/eager-central-policy.h

@@ -0,0 +1,27 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#ifndef __EAGER_CENTRAL_POLICY_H__
#define __EAGER_CENTRAL_POLICY_H__

#include <core/workers.h>
#include <core/mechanisms/fifo_queues.h>

/* Set up the eager policy: all workers share one central FIFO. */
void initialize_eager_center_policy(struct machine_config_s *config, struct sched_policy_s *policy);
//void set_local_queue_eager(struct jobq_s *jobq);
/* All workers share the same queue, so this always returns it. */
struct jobq_s *get_local_queue_eager(struct sched_policy_s *policy);

#endif // __EAGER_CENTRAL_POLICY_H__

+ 52 - 0
src/core/policies/eager-central-priority-policy.c

@@ -0,0 +1,52 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/policies/eager-central-priority-policy.h>
+
+/* the former is the actual queue, the latter some container */
+static struct jobq_s *jobq;
+
+static void init_priority_queue_design(void)
+{
+	/* only a single queue (even though there are several internaly) */
+	jobq = create_priority_jobq();
+
+	init_priority_queues_mechanisms();
+
+	/* we always use priorities in that policy */
+	jobq->push_task = priority_push_task;
+	jobq->push_prio_task = priority_push_task;
+	jobq->pop_task = priority_pop_task;
+}
+
+static struct jobq_s *func_init_priority_queue(void)
+{
+	return jobq;
+}
+
+void initialize_eager_center_priority_policy(struct machine_config_s *config, 
+			__attribute__ ((unused))	struct sched_policy_s *_policy) 
+{
+	setup_queues(init_priority_queue_design, func_init_priority_queue, config);
+}
+
+struct jobq_s *get_local_queue_eager_priority(struct sched_policy_s *policy __attribute__ ((unused)))
+{
+	/* this is trivial for that strategy */
+	return jobq;
+}
+
+

+ 28 - 0
src/core/policies/eager-central-priority-policy.h

@@ -0,0 +1,28 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#ifndef __EAGER_CENTRAL_PRIORITY_POLICY_H__
#define __EAGER_CENTRAL_PRIORITY_POLICY_H__

#include <core/workers.h>
#include <core/mechanisms/queues.h>
#include <core/mechanisms/priority_queues.h>

/* Set up the eager policy with priorities: one shared priority queue. */
void initialize_eager_center_priority_policy(struct machine_config_s *config, struct sched_policy_s *policy);
/* NOTE(review): no definition of this function is visible in the matching
 * .c file — confirm it exists or drop this declaration. */
void set_local_queue_eager_priority(struct jobq_s *jobq);
/* All workers share the same priority queue, so this always returns it. */
struct jobq_s *get_local_queue_eager_priority(struct sched_policy_s *policy);

#endif // __EAGER_CENTRAL_PRIORITY_POLICY_H__

+ 57 - 0
src/core/policies/no-prio-policy.c

@@ -0,0 +1,57 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/policies/no-prio-policy.h>
+
+/*
+ *	This is just the trivial policy where every worker use the same
+ *	JOB QUEUE.
+ */
+
+/* the former is the actual queue, the latter some container */
+static struct jobq_s *jobq;
+
+static void init_no_prio_design(void)
+{
+	/* there is only a single queue in that trivial design */
+	jobq = create_fifo();
+
+	init_fifo_queues_mechanisms();
+
+	jobq->push_task = fifo_push_task;
+	/* no priority in that policy, let's be stupid here */
+	jobq->push_prio_task = fifo_push_task;
+	jobq->pop_task = fifo_pop_task;
+}
+
+static struct jobq_s *func_init_central_queue(void)
+{
+	/* once again, this is trivial */
+	return jobq;
+}
+
+void initialize_no_prio_policy(struct machine_config_s *config, 
+	   __attribute__ ((unused)) struct sched_policy_s *_policy) 
+{
+	setup_queues(init_no_prio_design, func_init_central_queue, config);
+}
+
+struct jobq_s *get_local_queue_no_prio(struct sched_policy_s *policy 
+					__attribute__ ((unused)))
+{
+	/* this is trivial for that strategy :) */
+	return jobq;
+}

+ 27 - 0
src/core/policies/no-prio-policy.h

@@ -0,0 +1,27 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#ifndef __NO_PRIO_POLICY_H__
#define __NO_PRIO_POLICY_H__

#include <core/workers.h>
#include <core/mechanisms/fifo_queues.h>

/* Set up the eager policy variant that ignores task priorities. */
void initialize_no_prio_policy(struct machine_config_s *config, struct sched_policy_s *policy);
//void set_local_queue_eager(struct jobq_s *jobq);
/* All workers share the same queue, so this always returns it. */
struct jobq_s *get_local_queue_no_prio(struct sched_policy_s *policy);

#endif // __NO_PRIO_POLICY_H__

+ 121 - 0
src/core/policies/random-policy.c

@@ -0,0 +1,121 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/policies/random-policy.h>
+
+/* XXX 32 is set randomly */
+static unsigned nworkers;
+static struct jobq_s *queue_array[32];
+
+static job_t random_pop_task(struct jobq_s *q)
+{
+	struct job_s *j;
+
+	j = fifo_pop_task(q);
+
+	return j;
+}
+
+static int _random_push_task(struct jobq_s *q __attribute__ ((unused)), job_t task, unsigned prio)
+{
+	/* find the queue */
+	struct fifo_jobq_s *fifo;
+	unsigned worker;
+
+	unsigned selected = 0;
+
+	double alpha_sum = 0.0;
+
+	for (worker = 0; worker < nworkers; worker++)
+	{
+		alpha_sum += queue_array[worker]->alpha;
+	}
+
+	double rand = drand48()*alpha_sum;
+//	fprintf(stderr, "my rand is %e\n", rand);
+
+	double alpha = 0.0;
+	for (worker = 0; worker < nworkers; worker++)
+	{
+		if (alpha + queue_array[worker]->alpha > rand) {
+			/* we found the worker */
+			selected = worker;
+			break;
+		}
+
+		alpha += queue_array[worker]->alpha;
+	}
+
+	/* we should now have the best worker in variable "best" */
+	fifo = queue_array[selected]->queue;
+
+	if (prio) {
+		return fifo_push_prio_task(queue_array[selected], task);
+	} else {
+		return fifo_push_task(queue_array[selected], task);
+	}
+}
+
+static int random_push_prio_task(struct jobq_s *q, job_t task)
+{
+	return _random_push_task(q, task, 1);
+}
+
+static int random_push_task(struct jobq_s *q, job_t task)
+{
+	return _random_push_task(q, task, 0);
+}
+
+static struct jobq_s *init_random_fifo(void)
+{
+	struct jobq_s *q;
+
+	q = create_fifo();
+
+	q->push_task = random_push_task; 
+	q->push_prio_task = random_push_prio_task; 
+	q->pop_task = random_pop_task;
+	q->who = 0;
+
+	queue_array[nworkers++] = q;
+
+	return q;
+}
+
+void initialize_random_policy(struct machine_config_s *config, 
+ __attribute__ ((unused)) struct sched_policy_s *_policy) 
+{
+	nworkers = 0;
+
+	srand48(time(NULL));
+
+	setup_queues(init_fifo_queues_mechanisms, init_random_fifo, config);
+}
+
+struct jobq_s *get_local_queue_random(struct sched_policy_s *policy __attribute__ ((unused)))
+{
+	struct jobq_s *queue;
+	queue = pthread_getspecific(policy->local_queue_key);
+
+	if (!queue)
+	{
+		/* take one randomly as this *must* be for a push anyway XXX */
+		queue = queue_array[0];
+	}
+
+	return queue;
+}
+

+ 29 - 0
src/core/policies/random-policy.h

@@ -0,0 +1,29 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#ifndef __RANDOM_POLICY_H__
#define __RANDOM_POLICY_H__

#include <core/workers.h>
#include <core/mechanisms/queues.h>
#include <core/mechanisms/fifo_queues.h>

/* Set up the random policy: tasks are pushed to a worker drawn at random,
 * weighted by each queue's relative speed. */
void initialize_random_policy(struct machine_config_s *config,
 __attribute__ ((unused)) struct sched_policy_s *_policy);

/* Return the queue attached to the calling worker (queue 0 for threads
 * without one). */
struct jobq_s *get_local_queue_random(struct sched_policy_s *policy __attribute__ ((unused)));

#endif // __RANDOM_POLICY_H__

+ 159 - 0
src/core/policies/sched_policy.c

@@ -0,0 +1,159 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+
+#include <core/mechanisms/queues.h>
+#include <core/policies/sched_policy.h>
+#include <core/policies/no-prio-policy.h>
+#include <core/policies/eager-central-policy.h>
+#include <core/policies/eager-central-priority-policy.h>
+#include <core/policies/work-stealing-policy.h>
+#include <core/policies/deque-modeling-policy.h>
+#include <core/policies/random-policy.h>
+#include <core/policies/deque-modeling-policy-data-aware.h>
+
+
+static struct sched_policy_s policy;
+
+/* Expose the (unique, file-scope) scheduling policy descriptor. */
+struct sched_policy_s *get_sched_policy(void)
+{
+	struct sched_policy_s *current_policy = &policy;
+	return current_policy;
+}
+
+/* Select the scheduling policy according to the SCHED environment
+ * variable and initialize it. The eager centralized policy is used by
+ * default, i.e. when SCHED is unset or matches no known policy name.
+ * The previous seven-way if/else chain is replaced by a lookup table:
+ * adding a policy is now a one-line change. */
+void init_sched_policy(struct machine_config_s *config)
+{
+	static const struct {
+		const char *name;	/* value expected in SCHED */
+		const char *banner;	/* message displayed in VERBOSE mode */
+		void (*init_sched)(struct machine_config_s *, struct sched_policy_s *);
+		struct jobq_s *(*get_local_queue)(struct sched_policy_s *);
+	} predefined[] = {
+		{ "ws", "USE WS SCHEDULER !! \n",
+			initialize_ws_policy, get_local_queue_ws },
+		{ "prio", "USE PRIO EAGER SCHEDULER !! \n",
+			initialize_eager_center_priority_policy, get_local_queue_eager_priority },
+		{ "no-prio", "USE _NO_ PRIO EAGER SCHEDULER !! \n",
+			initialize_no_prio_policy, get_local_queue_no_prio },
+		{ "dm", "USE MODEL SCHEDULER !! \n",
+			initialize_dm_policy, get_local_queue_dm },
+		{ "dmda", "USE DATA AWARE MODEL SCHEDULER !! \n",
+			initialize_dmda_policy, get_local_queue_dmda },
+		{ "random", "USE RANDOM SCHEDULER !! \n",
+			initialize_random_policy, get_local_queue_random },
+	};
+
+	/* eager policy is taken by default */
+	const char *banner __attribute__ ((unused)) = "USE EAGER SCHEDULER !! \n";
+	policy.init_sched = initialize_eager_center_policy;
+	policy.get_local_queue = get_local_queue_eager;
+
+	char *sched_env = getenv("SCHED");
+	if (sched_env) {
+		unsigned i;
+		for (i = 0; i < sizeof(predefined)/sizeof(predefined[0]); i++)
+		{
+			if (strcmp(sched_env, predefined[i].name) == 0)
+			{
+				banner = predefined[i].banner;
+				policy.init_sched = predefined[i].init_sched;
+				policy.get_local_queue = predefined[i].get_local_queue;
+				break;
+			}
+		}
+	}
+
+#ifdef VERBOSE
+	/* never pass a non-literal string as a format string */
+	fprintf(stderr, "%s", banner);
+#endif
+
+	pthread_cond_init(&policy.sched_activity_cond, NULL);
+	pthread_mutex_init(&policy.sched_activity_mutex, NULL);
+	pthread_key_create(&policy.local_queue_key, NULL);
+
+	policy.init_sched(config, &policy);
+}
+
+/* Generic entry point: hand the job over to the queue selected by the
+ * current scheduling policy, using that queue's own push method. */
+int push_task(job_t task)
+{
+	struct jobq_s *local_queue;
+
+	local_queue = policy.get_local_queue(&policy);
+	STARPU_ASSERT(local_queue->push_task);
+
+	return local_queue->push_task(local_queue, task);
+}
+
+/* Pop one job from the given queue through its pop_task method. */
+struct job_s * pop_task_from_queue(struct jobq_s *queue)
+{
+	STARPU_ASSERT(queue->pop_task);
+	return queue->pop_task(queue);
+}
+
+/* Pop one job from the calling thread's local queue. */
+struct job_s * pop_task(void)
+{
+	struct jobq_s *local_queue = policy.get_local_queue(&policy);
+	return pop_task_from_queue(local_queue);
+}
+
+/* Drain the given queue, returning every pending job as a list. */
+struct job_list_s * pop_every_task_from_queue(struct jobq_s *queue)
+{
+	STARPU_ASSERT(queue->pop_every_task);
+	return queue->pop_every_task(queue);
+}
+
+/* Drain the calling thread's local queue. */
+struct job_list_s *pop_every_task(void)
+{
+	struct jobq_s *local_queue = policy.get_local_queue(&policy);
+	return pop_every_task_from_queue(local_queue);
+}
+
+/* Block the calling worker on its own queue's condition variable until
+ * some scheduling activity is signalled on it.
+ * NOTE(review): a single pthread_cond_wait with no predicate loop —
+ * spurious wakeups will make this return early; callers presumably just
+ * re-check for work and call again — confirm. */
+void wait_on_sched_event(void)
+{
+	struct jobq_s *q = policy.get_local_queue(&policy);
+
+	pthread_mutex_lock(&q->activity_mutex);
+	pthread_cond_wait(&q->activity_cond, &q->activity_mutex);
+	pthread_mutex_unlock(&q->activity_mutex);
+}

+ 56 - 0
src/core/policies/sched_policy.h

@@ -0,0 +1,56 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __SCHED_POLICY_H__
+#define __SCHED_POLICY_H__
+
+#include <core/mechanisms/queues.h>
+//#include <core/mechanisms/work_stealing_queues.h>
+//#include <core/mechanisms/central_queues.h>
+//#include <core/mechanisms/central_queues_priorities.h>
+
+#include <core/workers.h>
+
+struct machine_config_s;
+
+/* Description of a scheduling policy: how to create its queues, and how
+ * each thread retrieves the queue it must push to / pop from. */
+struct sched_policy_s {
+	/* create all the queues */
+	void (*init_sched)(struct machine_config_s *, struct sched_policy_s *);
+
+	/* anyone can request which queue it is associated to */
+	struct jobq_s *(*get_local_queue)(struct sched_policy_s *);
+
+	/* some worker may block until some activity happens in the machine */
+	pthread_cond_t sched_activity_cond;
+	pthread_mutex_t sched_activity_mutex;
+
+	/* thread-specific key under which each worker thread stores the
+	 * jobq_s it is attached to (read by get_local_queue) */
+	pthread_key_t local_queue_key;
+};
+
+struct sched_policy_s *get_sched_policy(void);
+
+void init_sched_policy(struct machine_config_s *config);
+//void set_local_queue(struct jobq_s *jobq);
+
+int push_task(job_t task);
+struct job_s *pop_task(void);
+struct job_s *pop_task_from_queue(struct jobq_s *queue);
+struct job_list_s *pop_every_task(void);
+struct job_list_s * pop_every_task_from_queue(struct jobq_s *queue);
+
+void wait_on_sched_event(void);
+
+#endif // __SCHED_POLICY_H__

+ 201 - 0
src/core/policies/work-stealing-policy.c

@@ -0,0 +1,201 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/policies/work-stealing-policy.h>
+
+/* save the general machine configuration */
+//static struct machine_config_s *machineconfig;
+
+/* XXX 32 is set randomly */
+static unsigned nworkers;
+static unsigned rr_worker;
+static struct jobq_s *queue_array[32];
+
+/* keep track of the work performed from the beginning of the algorithm to make
+ * better decisions about which queue to select when stealing or deferring work
+ */
+static unsigned performed_total = 0;
+//static unsigned performed_local[16];
+
+#ifdef USE_OVERLOAD
+/* Estimate how overloaded queue `id` is: the difference between its
+ * current backlog ratio (jobs pending / jobs processed by this queue)
+ * and its share of the total processed work. Positive means the queue
+ * has proportionally more work than it has been getting done.
+ * BUGFIX: both divisions were unsigned integer divisions, so the
+ * "ratios" were truncated (usually to 0) before being stored in a
+ * float; cast the numerator to float to get a real ratio. */
+static float overload_metric(unsigned id)
+{
+	unsigned nprocessed = get_deque_nprocessed(queue_array[id]);
+	unsigned njobs = get_deque_njobs(queue_array[id]);
+
+	float execution_ratio = 0.0f;
+	if (performed_total > 0) {
+		execution_ratio = (float)nprocessed/performed_total;
+	}
+
+	float current_ratio = 0.0f;
+	if (nprocessed > 0) {
+		current_ratio = (float)njobs/nprocessed;
+	}
+	
+	return (current_ratio - execution_ratio);
+}
+
+/* who to steal work to ? */
+/* Scan the workers round-robin style starting at rr_worker and return
+ * the first queue whose overload metric is positive. If none qualifies,
+ * fall back to plain round-robin so the caller always gets a queue.
+ * NOTE(review): the do/while with post-decrement probes nworkers+1
+ * queues — presumably harmless, but confirm it is intentional. */
+static struct jobq_s *select_victimq(void)
+{
+	struct jobq_s *q;
+
+	unsigned attempts = nworkers;
+
+	unsigned worker = rr_worker;
+	do {
+		if (overload_metric(worker) > 0.0f)
+		{
+			q = queue_array[worker];
+			return q;
+		}
+		else {
+			worker = (worker + 1)%nworkers;
+		}
+	} while(attempts-- > 0);
+
+	/* take one anyway ... */
+	q = queue_array[rr_worker];
+	rr_worker = (rr_worker + 1 )%nworkers;
+
+	return q;
+}
+
+/* Symmetric to select_victimq: pick a queue for depositing new work.
+ * Prefer the first *underloaded* queue (negative metric) found while
+ * scanning round-robin from rr_worker; fall back to round-robin. */
+static struct jobq_s *select_workerq(void)
+{
+	struct jobq_s *q;
+
+	unsigned attempts = nworkers;
+
+	unsigned worker = rr_worker;
+	do {
+		if (overload_metric(worker) < 0.0f)
+		{
+			q = queue_array[worker];
+			return q;
+		}
+		else {
+			worker = (worker + 1)%nworkers;
+		}
+	} while(attempts-- > 0);
+
+	/* take one anyway ... */
+	q = queue_array[rr_worker];
+	rr_worker = (rr_worker + 1 )%nworkers;
+
+	return q;
+}
+
+#else
+
+/* who to steal work to ? */
+/* No overload accounting compiled in: just rotate over the deques. */
+static struct jobq_s *select_victimq(void)
+{
+	struct jobq_s *chosen = queue_array[rr_worker];
+
+	rr_worker = (rr_worker + 1) % nworkers;
+
+	return chosen;
+}
+
+
+/* when anonymous threads submit tasks, 
+ * we need to select a queue where to dispose them */
+/* No overload accounting compiled in: just rotate over the deques. */
+static struct jobq_s *select_workerq(void)
+{
+	struct jobq_s *chosen = queue_array[rr_worker];
+
+	rr_worker = (rr_worker + 1) % nworkers;
+
+	return chosen;
+}
+
+#endif
+
+/* Pop a job for the worker owning queue q: first try the local deque;
+ * if it is empty, steal from a victim queue. Returns NULL when no job
+ * could be obtained at all.
+ * NOTE(review): performed_total is incremented without any lock or
+ * atomic while all workers run this concurrently, so counts may be
+ * lost — confirm approximate statistics are acceptable here. */
+static job_t ws_pop_task(struct jobq_s *q)
+{
+	job_t j;
+
+	j = deque_non_blocking_pop_task(q);
+	if (j) {
+		/* there was a local task */
+		performed_total++;
+		return j;
+	}
+	
+	/* we need to steal someone's job */
+	struct jobq_s *victimq;
+	victimq = select_victimq();
+
+	j = deque_non_blocking_pop_task_if_job_exists(victimq);
+
+	if (j)
+	{
+		TRACE_WORK_STEALING(q, j);
+		performed_total++;
+	}
+
+	return j;
+}
+
+/* Create the deque of one worker and register it into the global
+ * queue_array used by the work-stealing heuristics. */
+static struct jobq_s *init_ws_deque(void)
+{
+	struct jobq_s *q;
+
+	q = create_deque();
+
+	q->push_task = deque_push_task; 
+	q->push_prio_task = deque_push_prio_task; 
+	q->pop_task = ws_pop_task;
+	q->who = 0;
+
+	/* queue_array has a fixed size ("XXX 32 is set randomly" above):
+	 * guard against silently overflowing it */
+	STARPU_ASSERT(nworkers < sizeof(queue_array)/sizeof(queue_array[0]));
+	queue_array[nworkers++] = q;
+
+	return q;
+}
+
+/* Entry point of the work-stealing policy: create one deque per worker
+ * and reset the round-robin cursor used by the selection helpers. */
+void initialize_ws_policy(struct machine_config_s *config, 
+			__attribute__ ((unused))	struct sched_policy_s *_policy) 
+{
+	nworkers = 0;
+	rr_worker = 0;
+
+	//machineconfig = config;
+
+	setup_queues(init_deque_queues_mechanisms, init_ws_deque, config);
+}
+
+/* Return the deque attached to the calling thread; anonymous threads
+ * (with no thread-specific queue) get one chosen by select_workerq().
+ * Note: the parameter was tagged __attribute__((unused)) although it is
+ * dereferenced below — the bogus annotation is removed (the header
+ * prototype already declares the parameter without it). */
+struct jobq_s *get_local_queue_ws(struct sched_policy_s *policy)
+{
+	struct jobq_s *queue;
+	queue = pthread_getspecific(policy->local_queue_key);
+
+	if (!queue) {
+		queue = select_workerq();
+	}
+
+	STARPU_ASSERT(queue);
+
+	return queue;
+}

+ 26 - 0
src/core/policies/work-stealing-policy.h

@@ -0,0 +1,26 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __WORK_STEALING_POLICY_H__
+#define __WORK_STEALING_POLICY_H__
+
+#include <core/workers.h>
+#include <core/mechanisms/deque_queues.h>
+
+void initialize_ws_policy(struct machine_config_s *config, struct sched_policy_s *policy);
+struct jobq_s *get_local_queue_ws(struct sched_policy_s *policy);
+
+#endif // __WORK_STEALING_POLICY_H__

+ 392 - 0
src/core/workers.c

@@ -0,0 +1,392 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/workers.h>
+
+/* XXX quick and dirty implementation for now ... */
+pthread_key_t local_workers_key;
+
+static struct machine_config_s config;
+
+/* in case a task is submitted, we may check whether there exists a worker
+   that may execute the task or not */
+static uint32_t worker_mask = 0;
+
+/* Non-zero iff at least one detected worker matches task_mask (a
+ * bitmask of the CORE/CUDA/... capability flags).
+ * NOTE(review): `inline` without `static` at file scope relies on the
+ * GNU89 inline semantics — confirm before building with -std=c99. */
+inline uint32_t worker_exists(uint32_t task_mask)
+{
+	return (task_mask & worker_mask);
+} 
+
+/* Non-zero iff a worker able to execute CUDA (or CUBLAS) tasks exists. */
+inline uint32_t may_submit_cuda_task(void)
+{
+	return ((CUDA|CUBLAS) & worker_mask);
+}
+
+/* Non-zero iff a CPU core worker exists. */
+inline uint32_t may_submit_core_task(void)
+{
+	return (CORE & worker_mask);
+}
+
+#ifdef USE_CPUS
+static unsigned ncores;
+#endif
+#ifdef USE_CUDA
+static unsigned ncudagpus;
+#endif
+#ifdef USE_GORDON
+static unsigned ngordon_spus;
+#endif
+
+/*
+ * Runtime initialization methods
+ */
+
+#ifdef USE_CUDA
+extern unsigned get_cuda_device_count(void);
+#endif
+
+/* Detect the number of workers of each kind — from the NCUDA / NGORDON
+ * / NCPUS environment variables when set, by probing the hardware
+ * otherwise — and fill config->workers accordingly. Accelerators are
+ * registered first; when at least one is present, one CPU core is kept
+ * aside to drive it. Aborts when no worker at all is available. */
+static void init_machine_config(struct machine_config_s *config)
+{
+	int envval __attribute__((unused));
+	unsigned use_accelerator = 0;
+
+	config->nworkers = 0;
+
+#ifdef USE_CUDA
+	/* we need to initialize CUDA early to count the number of devices */
+	init_cuda();
+
+	/* a negative return means the variable is unset: probe instead */
+	envval = starpu_get_env_number("NCUDA");
+	if (envval < 0) {
+		ncudagpus = STARPU_MIN(get_cuda_device_count(), MAXCUDADEVS);
+	} else {
+		/* use the specified value */
+		ncudagpus = (unsigned)envval;
+		STARPU_ASSERT(ncudagpus <= MAXCUDADEVS);
+	}
+	STARPU_ASSERT(ncudagpus + config->nworkers <= NMAXWORKERS);
+
+	if (ncudagpus > 0)
+		use_accelerator = 1;
+
+	unsigned cudagpu;
+	for (cudagpu = 0; cudagpu < ncudagpus; cudagpu++)
+	{
+		config->workers[config->nworkers + cudagpu].arch = CUDA_WORKER;
+		config->workers[config->nworkers + cudagpu].perf_arch = STARPU_CUDA_DEFAULT;
+		config->workers[config->nworkers + cudagpu].id = cudagpu;
+		worker_mask |= (CUDA|CUBLAS);
+	}
+
+	config->nworkers += ncudagpus;
+#endif
+	
+#ifdef USE_GORDON
+	envval = starpu_get_env_number("NGORDON");
+	if (envval < 0) {
+		ngordon_spus = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1);
+	} else {
+		/* use the specified value */
+		ngordon_spus = (unsigned)envval;
+		STARPU_ASSERT(ngordon_spus <= NMAXGORDONSPUS);
+	}
+	STARPU_ASSERT(ngordon_spus + config->nworkers <= NMAXWORKERS);
+
+	if (ngordon_spus > 0)
+		use_accelerator = 1;
+
+	unsigned spu;
+	for (spu = 0; spu < ngordon_spus; spu++)
+	{
+		config->workers[config->nworkers + spu].arch = GORDON_WORKER;
+		config->workers[config->nworkers + spu].perf_arch = STARPU_GORDON_DEFAULT;
+		config->workers[config->nworkers + spu].id = spu;
+		config->workers[config->nworkers + spu].worker_is_running = 0;
+		worker_mask |= GORDON;
+	}
+
+	config->nworkers += ngordon_spus;
+#endif
+
+/* we put the CPU section after the accelerator : in case there was an
+ * accelerator found, we devote one core */
+#ifdef USE_CPUS
+	envval = starpu_get_env_number("NCPUS");
+	if (envval < 0) {
+		long avail_cores = sysconf(_SC_NPROCESSORS_ONLN) 
+						- (use_accelerator?1:0);
+		ncores = STARPU_MIN(avail_cores, NMAXCORES);
+	} else {
+		/* use the specified value */
+		ncores = (unsigned)envval;
+		STARPU_ASSERT(ncores <= NMAXCORES);
+	}
+	STARPU_ASSERT(ncores + config->nworkers <= NMAXWORKERS);
+
+	unsigned core;
+	for (core = 0; core < ncores; core++)
+	{
+		config->workers[config->nworkers + core].arch = CORE_WORKER;
+		config->workers[config->nworkers + core].perf_arch = STARPU_CORE_DEFAULT;
+		config->workers[config->nworkers + core].id = core;
+		worker_mask |= CORE;
+	}
+
+	config->nworkers += ncores;
+#endif
+
+
+	if (config->nworkers == 0)
+	{
+		fprintf(stderr, "No worker found, aborting ...\n");
+		exit(-1);
+	}
+}
+
+/* Decide, for each worker, which CPU core its driver thread is bound to
+ * and which memory node holds its data. CPU cores are assigned
+ * round-robin over the machine's cores; all accelerator drivers share a
+ * single dedicated core (accelerator_bindid, chosen on first use). */
+static void init_workers_binding(struct machine_config_s *config)
+{
+	/* launch one thread per CPU */
+	unsigned ram_memory_node;
+
+	int current_bindid = 0;
+
+	/* a single core is dedicated for the accelerators */
+	int accelerator_bindid = -1;
+
+	/* note that even if the CPU core are not used, we always have a RAM node */
+	/* TODO : support NUMA  ;) */
+	ram_memory_node = register_memory_node(RAM);
+
+	unsigned worker;
+	for (worker = 0; worker < config->nworkers; worker++)
+	{
+		unsigned memory_node = -1;
+		unsigned is_an_accelerator = 0;
+		struct worker_s *workerarg = &config->workers[worker];
+		
+		/* select the memory node that contains worker's memory */
+		switch (workerarg->arch) {
+			case CORE_WORKER:
+			/* "dedicate" a cpu core to that worker */
+				is_an_accelerator = 0;
+				memory_node = ram_memory_node;
+				break;
+#ifdef USE_GORDON
+			case GORDON_WORKER:
+				/* SPUs work on main memory through DMA */
+				is_an_accelerator = 1;
+				memory_node = ram_memory_node;
+				break;
+#endif
+#ifdef USE_CUDA
+			case CUDA_WORKER:
+				/* each GPU gets its own memory node */
+				is_an_accelerator = 1;
+				memory_node = register_memory_node(CUDA_RAM);
+				break;
+#endif
+			default:
+				STARPU_ASSERT(0);
+		}
+
+		if (is_an_accelerator) {
+			/* lazily pick the shared accelerator core once */
+			if (accelerator_bindid == -1)
+				accelerator_bindid = (current_bindid++) % (sysconf(_SC_NPROCESSORS_ONLN));
+			workerarg->bindid = accelerator_bindid;
+		}
+		else {
+			workerarg->bindid = (current_bindid++) % (sysconf(_SC_NPROCESSORS_ONLN));
+		}
+
+		workerarg->memory_node = memory_node;
+	}
+}
+
+#ifdef USE_GORDON
+unsigned gordon_inited = 0;	
+struct worker_set_s gordon_worker_set;
+#endif
+
+/* Launch the driver thread of every worker described in config. CPU and
+ * CUDA workers each get their own thread; all Gordon SPUs are driven by
+ * one shared thread (gordon_worker_set), launched only once. Each
+ * launch blocks on ready_sem so that workers are fully initialized by
+ * the time this function returns. */
+static void init_workers(struct machine_config_s *config)
+{
+	config->running = 1;
+
+	pthread_key_create(&local_workers_key, NULL);
+
+	unsigned worker;
+	for (worker = 0; worker < config->nworkers; worker++)
+	{
+		struct worker_s *workerarg = &config->workers[worker];
+
+		sem_init(&workerarg->ready_sem, 0, 0);
+
+		/* if some codelet's termination cannot be handled directly :
+		 * for instance in the Gordon driver, Gordon tasks' callbacks
+		 * may be executed by another thread than that of the Gordon
+		 * driver so that we cannot call the push_codelet_output method
+		 * directly */
+		workerarg->terminated_jobs = job_list_new();
+	
+		switch (workerarg->arch) {
+#ifdef USE_CPUS
+			case CORE_WORKER:
+				workerarg->set = NULL;
+				pthread_create(&workerarg->worker_thread, 
+						NULL, core_worker, workerarg);
+				sem_wait(&workerarg->ready_sem);
+				break;
+#endif
+#ifdef USE_CUDA
+			case CUDA_WORKER:
+				workerarg->set = NULL;
+				pthread_create(&workerarg->worker_thread, 
+						NULL, cuda_worker, workerarg);
+				sem_wait(&workerarg->ready_sem);
+				break;
+#endif
+#ifdef USE_GORDON
+			case GORDON_WORKER:
+				/* we will only launch gordon once, but it will handle 
+				 * the different SPU workers */
+				if (!gordon_inited)
+				{
+					gordon_worker_set.nworkers = ngordon_spus; 
+					gordon_worker_set.workers = &config->workers[worker];
+
+					pthread_create(&gordon_worker_set.worker_thread, NULL, 
+							gordon_worker, &gordon_worker_set);
+					sem_wait(&gordon_worker_set.ready_sem);
+
+					gordon_inited = 1;
+				}
+				
+				workerarg->set = &gordon_worker_set;
+				gordon_worker_set.joined = 0;
+				workerarg->worker_is_running = 1;
+
+				break;
+#endif
+			default:
+				STARPU_ASSERT(0);
+		}
+	}
+}
+
+/* Initialize the StarPU runtime: detect the available workers, set up
+ * the memory nodes and the scheduling policy, then launch one driver
+ * thread per worker (or worker set). Returns once all workers are up. */
+void starpu_init(void)
+{
+	/* fixed seed so that runs are reproducible */
+	srand(2008);
+
+#ifdef USE_FXT
+	start_fxt_profiling();
+#endif
+
+	timing_init();
+
+	init_machine_config(&config);
+
+	/* for the data wizard */
+	init_memory_nodes();
+
+	init_workers_binding(&config);
+
+	/* initialize the scheduler */
+
+	/* initialize the queue containing the jobs */
+	init_sched_policy(&config);
+
+	init_workers(&config);
+}
+
+/*
+ * Handle runtime termination 
+ */
+
+/* Join every worker thread (or worker-set thread) so StarPU can shut
+ * down cleanly. May be called from a worker's own callback, so we must
+ * never join the calling thread itself.
+ * Fixes: workerid is unsigned, so print it with %u; pthread_t values
+ * must be compared with pthread_equal(), not != (pthread_t is an opaque
+ * type in POSIX); `status` is tagged unused so that non-VERBOSE builds
+ * do not warn about a set-but-unused variable. */
+void terminate_workers(struct machine_config_s *config)
+{
+	int status __attribute__((unused));
+	unsigned workerid;
+
+	for (workerid = 0; workerid < config->nworkers; workerid++)
+	{
+		wake_all_blocked_workers();
+		
+#ifdef VERBOSE
+		fprintf(stderr, "wait for worker %u\n", workerid);
+#endif
+
+		struct worker_set_s *set = config->workers[workerid].set;
+
+		/* in case StarPU termination code is called from a callback,
+ 		 * we have to check if pthread_self() is the worker itself */
+		if (set){ 
+			if (!set->joined) {
+				if (!pthread_equal(pthread_self(), set->worker_thread))
+				{
+					status = pthread_join(set->worker_thread, NULL);
+#ifdef VERBOSE
+					if (status)
+						fprintf(stderr, "pthread_join -> %d\n", status);
+#endif
+				}
+
+				/* only the first caller actually joins the set */
+				set->joined = 1;
+			}
+		}
+		else {
+			struct worker_s *worker = &config->workers[workerid];
+			if (!pthread_equal(pthread_self(), worker->worker_thread))
+			{
+				status = pthread_join(worker->worker_thread, NULL);
+#ifdef VERBOSE
+				if (status)
+					fprintf(stderr, "pthread_join -> %d\n", status);
+#endif
+			}
+		}
+	}
+}
+
+/* Non-zero while the runtime is alive; cleared by kill_all_workers(). */
+unsigned machine_is_running(void)
+{
+	return config.running;
+}
+
+/* Request termination: clear the running flag (polled by the workers
+ * through machine_is_running) and wake any blocked worker so that it
+ * can observe the flag and exit. */
+void kill_all_workers(struct machine_config_s *config)
+{
+	/* set the flag which will tell workers to stop */
+	config->running = 0;
+
+	/* in case some workers are waiting on some event 
+	   wake them up ... */
+	wake_all_blocked_workers();
+}
+
+/* Shut StarPU down: display statistics, ask every worker to stop, dump
+ * calibrated performance models when requested, and join the workers. */
+void starpu_shutdown(void)
+{
+	display_msi_stats();
+	display_alloc_cache_stats();
+
+	/* tell all workers to shutdown */
+	kill_all_workers(&config);
+
+#ifdef DATA_STATS
+	display_comm_ammounts();
+#endif
+
+	/* CALIBRATE set in the environment means performance models were
+	 * being calibrated: save them to disk before exiting */
+	if (starpu_get_env_number("CALIBRATE") != -1)
+		dump_registered_models();
+
+	/* wait for their termination */
+	terminate_workers(&config);
+}

+ 110 - 0
src/core/workers.h

@@ -0,0 +1,110 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __WORKERS_H__
+#define __WORKERS_H__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <string.h>
+#include <common/config.h>
+#include <pthread.h>
+#include <common/timing.h>
+#include <common/fxt.h>
+#include <core/jobs.h>
+#include <core/perfmodel/perfmodel.h>
+#include <core/policies/sched_policy.h>
+
+#include <starpu.h>
+
+#ifdef USE_CUDA
+#include <drivers/cuda/driver_cuda.h>
+#endif
+
+#ifdef USE_GORDON
+#include <drivers/gordon/driver_gordon.h>
+#endif
+
+#include <drivers/core/driver_core.h>
+
+#include <datawizard/datawizard.h>
+
+#define CORE_ALPHA	1.0f
+#define CUDA_ALPHA	13.33f
+#define GORDON_ALPHA	6.0f /* XXX this is a random value ... */
+
+#define NMAXWORKERS	16
+
+#ifdef DATA_STATS
+#define BENCHMARK_COMM	1
+#else
+#define BENCHMARK_COMM	0
+#endif
+
+/* The kinds of processing unit a worker may drive. */
+enum archtype {
+	CORE_WORKER,
+	CUDA_WORKER,
+	GORDON_WORKER
+};
+
+/* Per-worker state: one instance per processing unit, filled in by
+ * init_machine_config()/init_workers_binding() and owned by the driver
+ * thread afterwards. */
+struct worker_s {
+	enum archtype arch; /* what is the type of worker ? */
+	enum starpu_perf_archtype perf_arch; /* in case there are different models of the same arch */
+	pthread_t worker_thread; /* the thread which runs the worker */
+	int id; /* which core/gpu/etc is controlled by the workker ? */
+        sem_t ready_sem; /* indicate when the worker is ready */
+	int bindid; /* which core is the driver bound to ? */
+	unsigned memory_node; /* which memory node is associated that worker to ? */
+	struct jobq_s *jobq; /* in which queue will that worker get/put tasks ? */
+	struct worker_set_s *set; /* in case this worker belongs to a set */
+	struct job_list_s *terminated_jobs; /* list of pending jobs which were executed */
+	unsigned worker_is_running;
+};
+
+/* in case a single CPU worker may control multiple 
+ * accelerators (eg. Gordon for n SPUs) */
+struct worker_set_s {
+	pthread_t worker_thread; /* the thread which runs the worker */
+	unsigned nworkers;
+	unsigned joined; /* only one thread may call pthread_join*/
+	void *retval;
+	struct worker_s *workers;
+        sem_t ready_sem; /* indicate when the worker is ready */
+};
+
+/* Global description of the machine: the detected workers plus the
+ * runtime liveness flag. */
+struct machine_config_s {
+	unsigned nworkers;
+
+	struct worker_s workers[NMAXWORKERS];
+
+	/* this flag is set until the runtime is stopped */
+	unsigned running;
+};
+
+void terminate_workers(struct machine_config_s *config);
+void kill_all_workers(struct machine_config_s *config);
+void display_general_stats(void);
+
+unsigned machine_is_running(void);
+
+inline uint32_t worker_exists(uint32_t task_mask);
+inline uint32_t may_submit_cuda_task(void);
+inline uint32_t may_submit_core_task(void);
+
+
+#endif // __WORKERS_H__

+ 68 - 0
src/datawizard/Makefile

@@ -0,0 +1,68 @@
+#
+# StarPU
+# Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+.PHONY: interfaces common
+
+OBJS := write_back.o coherency.o data_request.o progress.o copy-driver.o hierarchy.o memalloc.o footprint.o datastats.o
+
+DWOBJDEPS += ../common/hash.o
+DWOBJDEPS += ../common/timing.o 
+DWOBJDEPS += ../common/htable32.o 
+DWOBJDEPS += ../common/mutex.o 
+DWOBJDEPS += ../common/rwlock.o 
+DWOBJDEPS += progress.o
+DWOBJDEPS += write_back.o
+DWOBJDEPS += copy-driver.o
+DWOBJDEPS += data_request.o
+DWOBJDEPS += coherency.o 
+DWOBJDEPS += hierarchy.o 
+DWOBJDEPS += memalloc.o
+DWOBJDEPS += footprint.o
+DWOBJDEPS += interfaces/blas_filters.o
+DWOBJDEPS += interfaces/csr_filters.o
+DWOBJDEPS += interfaces/bcsr_filters.o
+DWOBJDEPS += interfaces/vector_filters.o
+DWOBJDEPS += interfaces/blas_interface.o
+DWOBJDEPS += interfaces/csr_interface.o
+DWOBJDEPS += interfaces/bcsr_interface.o
+DWOBJDEPS += interfaces/vector_interface.o
+
+
+all: datawizard.a interfaces $(SPE_TARGET) $(OBJS)
+
+# link with $(CC) rather than a hard-coded compiler
+datawizard.so: common interfaces $(SPE_TARGET) $(OBJS) 
+	$(CC) --shared -o datawizard.so $(DWOBJDEPS)
+
+datawizard.a: common interfaces $(SPE_TARGET) $(OBJS)
+	$(AR) rcs $@ $(DWOBJDEPS)
+
+# recursive invocations must use $(MAKE) so that command-line options
+# and the -j jobserver are propagated to sub-makes
+common:
+	@$(MAKE) -C ../common/
+
+interfaces:
+	@$(MAKE) -C interfaces
+
+ifeq ($(filter ${MAKECMDGOALS},clean distclean),)
+%.d: %.c
+	$(CC) $(CFLAGS) $< -MM -o $*.d
+
+-include $(OBJS:.o=.d)
+endif
+
+clean:
+	@$(MAKE) -C interfaces clean
+	@rm -f *.o *.d *.gcno *.gcda
+	@rm -f *.a *.so

+ 395 - 0
src/datawizard/coherency.c

@@ -0,0 +1,395 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <common/config.h>
+#include <datawizard/coherency.h>
+#include <datawizard/copy-driver.h>
+#include <datawizard/write_back.h>
+#include <core/dependencies/data-concurrency.h>
+
+/* this function will actually copy a valid data into the requesting node */
+static int __attribute__((warn_unused_result)) copy_data_to_node(data_state *state, uint32_t requesting_node, 
+						 unsigned donotread)
+{
+	/* first find a valid copy, either a OWNER or a SHARED */
+	int ret;
+	uint32_t node;
+	uint32_t src_node_mask = 0;
+	for (node = 0; node < MAXNODES; node++)
+	{
+		if (state->per_node[node].state != INVALID) {
+			/* we found a copy ! */
+			src_node_mask |= (1<<node);
+		}
+	}
+
+	/* we should have found at least one copy ! */
+	STARPU_ASSERT(src_node_mask != 0);
+
+	ret = driver_copy_data(state, src_node_mask, requesting_node, donotread);
+
+	return ret;
+}
+
+/* this may be called once the data is fetched with header and RW-lock hold */
+static void update_data_state(data_state *state, uint32_t requesting_node,
+				uint8_t write)
+{
+	/* the data is present now */
+	state->per_node[requesting_node].requested = 0;
+
+	if (write) {
+		/* the requesting node now has the only valid copy */
+		uint32_t node;
+		for (node = 0; node < MAXNODES; node++)
+		{
+			state->per_node[node].state = INVALID;
+		}
+		state->per_node[requesting_node].state = OWNER;
+	}
+	else { /* read only */
+		/* there was at least another copy of the data */
+		uint32_t node;
+		for (node = 0; node < MAXNODES; node++)
+		{
+			if (state->per_node[node].state != INVALID)
+				state->per_node[node].state = SHARED;
+		}
+		state->per_node[requesting_node].state = SHARED;
+	}
+}
+
+
+/*
+ * This function is called when the data is needed on the local node, this
+ * returns a pointer to the local copy 
+ *
+ *			R 	W 	RW
+ *	Owner		OK	OK	OK
+ *	Shared		OK	1	1
+ *	Invalid		2	3	4
+ *
+ * case 1 : shared + (read)write : 
+ * 	no data copy but shared->Invalid/Owner
+ * case 2 : invalid + read : 
+ * 	data copy + invalid->shared + owner->shared (STARPU_ASSERT(there is a valid))
+ * case 3 : invalid + write : 
+ * 	no data copy + invalid->owner + (owner,shared)->invalid
+ * case 4 : invalid + R/W : 
+ * 	data copy + if (W) (invalid->owner + owner->invalid) 
+ * 		    else (invalid,owner->shared)
+ */
+
+/* Fetch a valid copy of the data into requesting_node and update the MSI
+ * states.  The caller must already hold the data for itself (RW-lock or
+ * pending-request list) and have pinned the local replicate; the header
+ * lock is (busy-)taken here.
+ * Returns 0 on success, -ENOMEM if no local buffer could be allocated. */
+int _fetch_data(data_state *state, uint32_t requesting_node,
+			uint8_t read, uint8_t write)
+{
+	/* busy-wait on the header lock but keep serving asynchronous data
+	 * requests, so workers waiting on us cannot deadlock us */
+	while (take_mutex_try(&state->header_lock)) {
+		datawizard_progress(requesting_node);
+	}
+
+	cache_state local_state;
+	local_state = state->per_node[requesting_node].state;
+
+	/* we handle that case first to optimize the OWNER path */
+	if ((local_state == OWNER) || (local_state == SHARED && !write))
+	{
+		/* the local node already got its data */
+		release_mutex(&state->header_lock);
+		msi_cache_hit(requesting_node);
+		return 0;
+	}
+
+	if ((local_state == SHARED) && write) {
+		/* local node already has the data but it must invalidate 
+		 * other copies */
+		uint32_t node;
+		for (node = 0; node < MAXNODES; node++)
+		{
+			if (state->per_node[node].state == SHARED) 
+			{
+				state->per_node[node].state =
+					(node == requesting_node ? OWNER:INVALID);
+			}
+
+		}
+		
+		release_mutex(&state->header_lock);
+		msi_cache_hit(requesting_node);
+		return 0;
+	}
+
+	/* the only remaining situation is that the local copy was invalid */
+	STARPU_ASSERT(state->per_node[requesting_node].state == INVALID);
+
+	msi_cache_miss(requesting_node);
+
+	/* we need the data from either the owner or one of the sharer */
+	/* !read means the data will be completely overwritten: allocate only */
+	int ret;
+	ret = copy_data_to_node(state, requesting_node, !read);
+	if (ret != 0)
+	switch (ret) {
+		case -ENOMEM:
+			goto enomem;
+		
+		default:
+			STARPU_ASSERT(0);
+	}
+
+	update_data_state(state, requesting_node, write);
+
+	release_mutex(&state->header_lock);
+
+	return 0;
+
+enomem:
+	/* there was not enough local memory to fetch the data */
+	release_mutex(&state->header_lock);
+	return -ENOMEM;
+}
+
+/* Acquire the data on the local memory node in the given access mode:
+ * take the data lock, pin the local replicate (refcnt) and fetch a valid
+ * copy.  Returns 0 on success, -1 if the fetch failed (out of memory). */
+int fetch_data(data_state *state, starpu_access_mode mode)
+{
+	int ret;
+	uint32_t requesting_node = get_local_memory_node(); 
+
+	uint8_t read, write;
+	read = (mode != W); /* then R or RW */
+	write = (mode != R); /* then W or RW */
+
+#ifndef NO_DATA_RW_LOCK
+	/* busy-wait on the lock while keeping the driver progressing, to
+	 * avoid deadlocking with workers that are waiting for us */
+	if (write) {
+//		take_rw_lock_write(&state->data_lock);
+		while (take_rw_lock_write_try(&state->data_lock))
+			datawizard_progress(requesting_node);
+	} else {
+//		take_rw_lock_read(&state->data_lock);
+		while (take_rw_lock_read_try(&state->data_lock))
+			datawizard_progress(requesting_node);
+	}
+#endif
+
+	while (take_mutex_try(&state->header_lock))
+		datawizard_progress(requesting_node);
+
+	/* pin the local replicate so it cannot be evicted while in use */
+	state->per_node[requesting_node].refcnt++;
+	release_mutex(&state->header_lock);
+
+	ret = _fetch_data(state, requesting_node, read, write);
+	if (ret != 0)
+		goto enomem;
+
+	return 0;
+enomem:
+	/* we did not get the data so remove the lock anyway */
+	while (take_mutex_try(&state->header_lock))
+		datawizard_progress(requesting_node);
+
+	state->per_node[requesting_node].refcnt--;
+	release_mutex(&state->header_lock);
+
+#ifndef NO_DATA_RW_LOCK
+	release_rw_lock(&state->data_lock);
+#endif
+
+	return -1;
+}
+
+uint32_t get_data_refcnt(data_state *state, uint32_t node)
+{
+	return state->per_node[node].refcnt;
+}
+
+/* in case the data was accessed on a write mode, do not forget to 
+ * make it accessible again once it is possible ! */
+/* Unpin the local replicate, perform write-through if the (combined) mask
+ * requires it, then let other accessors in again. */
+static void release_data(data_state *state, uint32_t default_wb_mask)
+{
+	uint32_t wb_mask;
+
+	/* normally, the requesting node should have the data in an exclusive manner */
+	uint32_t requesting_node = get_local_memory_node();
+	STARPU_ASSERT(state->per_node[requesting_node].state != INVALID);
+
+	/* the per-data mask is merged with the caller-supplied default */
+	wb_mask = default_wb_mask | state->wb_mask;
+
+	/* are we doing write-through or just some normal write-back ? */
+	if (wb_mask & ~(1<<requesting_node)) {
+		write_through_data(state, requesting_node, wb_mask);
+	}
+
+	while (take_mutex_try(&state->header_lock))
+		datawizard_progress(requesting_node);
+
+	state->per_node[requesting_node].refcnt--;
+	release_mutex(&state->header_lock);
+
+#ifndef NO_DATA_RW_LOCK
+	/* this is intended to make data accessible again */
+	release_rw_lock(&state->data_lock);
+#else
+	notify_data_dependencies(state);
+#endif
+}
+
+int fetch_codelet_input(starpu_buffer_descr *descrs, starpu_data_interface_t *interface, unsigned nbuffers, uint32_t mask)
+{
+	TRACE_START_FETCH_INPUT(NULL);
+
+	uint32_t local_memory_node = get_local_memory_node();
+
+	unsigned index;
+	for (index = 0; index < nbuffers; index++)
+	{
+		int ret;
+		starpu_buffer_descr *descr;
+		data_state *state;
+
+		descr = &descrs[index];
+
+		state = descr->state;
+
+		ret = fetch_data(state, descr->mode);
+		if (STARPU_UNLIKELY(ret))
+			goto enomem;
+
+		memcpy(&interface[index], &state->interface[local_memory_node], 
+				sizeof(starpu_data_interface_t));
+	}
+
+	TRACE_END_FETCH_INPUT(NULL);
+
+	return 0;
+
+enomem:
+	/* try to unreference all the input that were successfully taken */
+	fprintf(stderr, "something went wrong with buffer %d\n", index);
+	push_codelet_output(descrs, index, mask);
+	return -1;
+}
+
+void push_codelet_output(starpu_buffer_descr *descrs, unsigned nbuffers, uint32_t mask)
+{
+	TRACE_START_PUSH_OUTPUT(NULL);
+
+	unsigned index;
+	for (index = 0; index < nbuffers; index++)
+	{
+		release_data(descrs[index].state, mask);
+	}
+
+	TRACE_END_PUSH_OUTPUT(NULL);
+}
+
+int request_data_allocation(data_state *state, uint32_t node)
+{
+	take_mutex(&state->header_lock);
+
+	int ret;
+	ret = allocate_per_node_buffer(state, node);
+	STARPU_ASSERT(ret == 0);
+
+	/* XXX quick and dirty hack */
+	state->per_node[node].automatically_allocated = 0;	
+
+	release_mutex(&state->header_lock);
+
+	return 0;
+}
+
+#ifdef NO_DATA_RW_LOCK
+/* put the current value of the data into RAM */
+/* Continuation run once the posted data request becomes ready: fetch the
+ * data (read mode) to the local node.  _state is the data_state pointer
+ * passed as the request's callback argument. */
+static void _starpu_sync_data_with_mem_continuation(void *_state)
+{
+	int ret;
+	data_state *state = _state;
+
+	ret = fetch_data(state, R);
+	
+	STARPU_ASSERT(!ret);
+}
+#endif
+
+/* Bring the current value of the data back into main memory so that the
+ * application can read it. */
+void starpu_sync_data_with_mem(data_state *state)
+{
+	int ret;
+
+#ifdef NO_DATA_RW_LOCK
+	/* we try to get the data, if we do not succeed immediately, we set a
+ 	* callback function that will be executed automatically when the data is
+ 	* available again, otherwise we fetch the data directly */
+	if (!attempt_to_submit_data_request_from_apps(state, R, _starpu_sync_data_with_mem_continuation, state))
+	{
+		ret = fetch_data(state, R);
+		STARPU_ASSERT(!ret);
+	}
+#else
+	ret = fetch_data(state, R);
+	STARPU_ASSERT(!ret);
+#endif
+}
+
+/* in case the application did modify the data ... invalidate all other copies  */
+void notify_data_modification(data_state *state, uint32_t modifying_node)
+{
+	/* this may block .. XXX */
+#ifndef NO_DATA_RW_LOCK
+	take_rw_lock_write(&state->data_lock);
+#else
+#warning notify_data_modification is not supported with NO_DATA_RW_LOCK yet
+#endif
+
+	take_mutex(&state->header_lock);
+
+	/* the modifying node becomes the single OWNER, everyone else INVALID */
+	unsigned node = 0;
+	for (node = 0; node < MAXNODES; node++)
+	{
+		state->per_node[node].state =
+			(node == modifying_node?OWNER:INVALID);
+	}
+
+	release_mutex(&state->header_lock);
+#ifndef NO_DATA_RW_LOCK
+	release_rw_lock(&state->data_lock);
+#endif
+}
+
+/* NB : this value can only be an indication of the status of a data
+	at some point, but there is no strong garantee ! */
+unsigned is_data_present_or_requested(data_state *state, uint32_t node)
+{
+	unsigned ret = 0;
+
+// XXX : this is just a hint, so we don't take the lock ...
+//	take_mutex(&state->header_lock);
+
+	if (state->per_node[node].state != INVALID 
+		|| state->per_node[node].requested)
+		ret = 1;
+
+//	release_mutex(&state->header_lock);
+
+	return ret;
+}
+
+inline void set_data_requested_flag_if_needed(data_state *state, uint32_t node)
+{
+// XXX : this is just a hint, so we don't take the lock ...
+//	take_mutex(&state->header_lock);
+
+	if (state->per_node[node].state == INVALID) 
+		state->per_node[node].requested = 1;
+
+//	release_mutex(&state->header_lock);
+}

+ 160 - 0
src/datawizard/coherency.h

@@ -0,0 +1,160 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __COHERENCY__H__
+#define __COHERENCY__H__
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <errno.h>
+
+#include <starpu.h>
+
+#include <starpu-mutex.h>
+#include <common/rwlock.h>
+#include <common/timing.h>
+#include <common/fxt.h>
+#include <common/list.h>
+
+#include <datawizard/data_parameters.h>
+#include <datawizard/data_request.h>
+#include <datawizard/interfaces/data_interface.h>
+#include <datawizard/progress.h>
+#include <datawizard/datastats.h>
+
+/* MSI-like coherency state of one per-node replicate of a piece of data */
+typedef enum {
+//	MODIFIED,
+	OWNER,
+	SHARED,
+	INVALID
+} cache_state;
+
+/* this should contain the information relative to a given node */
+typedef struct local_data_state_t {
+	/* describes the state of the local data in term of coherency */
+	cache_state	state; 
+
+	/* number of ongoing accesses pinning this replicate */
+	uint32_t refcnt;
+
+	/* is the data locally allocated ? */
+	uint8_t allocated; 
+	/* was it automatically allocated ? */
+	/* perhaps the allocation was perform higher in the hiearchy 
+	 * for now this is just translated into !automatically_allocated
+	 * */
+	uint8_t automatically_allocated;
+
+	/* To help the scheduling policies to make some decision, we
+	   may keep a track of the tasks that are likely to request 
+	   this data on the current node.
+	   It is the responsability of the scheduling _policy_ to set that
+	   flag when it assigns a task to a queue, policies which do not
+	   use this hint can simply ignore it.
+	 */
+	uint8_t requested;
+} local_data_state;
+
+#ifdef NO_DATA_RW_LOCK
+/* Everyone that wants to access some piece of data will post a request.
+ * Not only StarPU internals, but also the application may put such requests */
+
+LIST_TYPE(data_requester,
+	/* what kind of access is requested ? */
+	starpu_access_mode mode;
+
+	unsigned is_requested_by_codelet;
+
+	/* in case this is a codelet that will do the access */
+	struct job_s *j;
+	unsigned buffer_index;
+
+	/* if this is more complicated ... (eg. application request) 
+	 * NB: this callback is not called with the lock taken !
+	 */
+	void (*ready_data_callback)(void *argcb);
+	void *argcb;
+);
+
+#endif
+
+/* per-data descriptor: coherency states, per-node replicates and layout */
+typedef struct starpu_data_state_t {
+#ifdef NO_DATA_RW_LOCK
+	data_requester_list_t req_list;
+	/* the number of requests currently in the scheduling engine
+	 * (not in the req_list anymore) */
+	unsigned refcnt;
+	starpu_access_mode current_mode;
+#else
+	/* protect the data itself */
+	rw_lock	data_lock;
+#endif
+	/* protect meta data */
+	starpu_mutex header_lock;
+
+	uint32_t nnodes; /* the number of memory nodes that may use it */
+	struct starpu_data_state_t *children;
+	int nchildren;
+
+	/* describe the state of the data in term of coherency */
+	local_data_state per_node[MAXNODES];
+
+	/* describe the actual data layout */
+	starpu_data_interface_t interface[MAXNODES];
+
+	struct data_interface_ops_t *ops;
+
+	/* where is the data home ? -1 if none yet */
+	int data_home;
+
+	/* what is the default write-back mask for that data ? */
+	uint32_t wb_mask;
+
+	/* allows special optimization */
+	uint8_t is_readonly;
+
+	/* in some case, the application may explicitly tell StarPU that a
+ 	 * piece of data is not likely to be used soon again */
+	unsigned is_not_important;
+} data_state;
+
+void display_msi_stats(void);
+
+__attribute__((warn_unused_result))
+int fetch_data(data_state *state, starpu_access_mode mode);
+//void release_data(data_state *state, uint32_t write_through_mask);
+
+__attribute__((warn_unused_result))
+int _fetch_data(data_state *state, uint32_t requesting_node, uint8_t read, uint8_t write);
+
+uint32_t get_data_refcnt(data_state *state, uint32_t node);
+
+void push_codelet_output(starpu_buffer_descr *descrs, unsigned nbuffers, uint32_t mask);
+
+__attribute__((warn_unused_result))
+int fetch_codelet_input(starpu_buffer_descr *descrs, starpu_data_interface_t *interface, unsigned nbuffers, uint32_t mask);
+
+void notify_data_modification(data_state *state, uint32_t modifying_node);
+
+int request_data_allocation(data_state *state, uint32_t node);
+
+unsigned is_data_present_or_requested(data_state *state, uint32_t node);
+
+inline void set_data_requested_flag_if_needed(data_state *state, uint32_t node);
+
+#endif // __COHERENCY__H__

+ 230 - 0
src/datawizard/copy-driver.c

@@ -0,0 +1,230 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <core/policies/sched_policy.h>
+#include <datawizard/datastats.h>
+#include <common/fxt.h>
+#include "copy-driver.h"
+#include "memalloc.h"
+
+mem_node_descr descr;
+static pthread_key_t memory_node_key;
+
+/* Register a new memory node of the given kind and return its identifier.
+ * NOTE(review): only the id allocation is atomic; descr.nodes[] and
+ * descr.queues_count[] are written without a lock — confirm that callers
+ * (driver init) are serialized. */
+unsigned register_memory_node(node_kind kind)
+{
+	unsigned nnodes;
+	/* ATOMIC_ADD returns the new value ... */
+	nnodes = STARPU_ATOMIC_ADD(&descr.nnodes, 1);
+
+	descr.nodes[nnodes-1] = kind;
+	TRACE_NEW_MEM_NODE(nnodes-1);
+
+	/* for now, there is no queue related to that newly created node */
+	descr.queues_count[nnodes-1] = 0;
+
+	return (nnodes-1);
+}
+
+
+/* TODO move in a more appropriate file */
+/* attach a queue to a memory node */
+void memory_node_attach_queue(struct jobq_s *q, unsigned nodeid)
+{
+	unsigned nqueues;
+	nqueues = STARPU_ATOMIC_ADD(&descr.queues_count[nodeid], 1);
+
+	descr.attached_queues[nodeid][nqueues-1] = q;
+}
+
+void wake_all_blocked_workers_on_node(unsigned nodeid)
+{
+	/* wake up all queues on that node */
+	unsigned q_id;
+	unsigned nqueues = descr.queues_count[nodeid];
+	for (q_id = 0; q_id < nqueues; q_id++)
+	{
+		struct jobq_s *q;
+		q  = descr.attached_queues[nodeid][q_id];
+
+		/* wake anybody waiting on that queue */
+		pthread_mutex_lock(&q->activity_mutex);
+		pthread_cond_broadcast(&q->activity_cond);
+		pthread_mutex_unlock(&q->activity_mutex);
+	}
+}
+
+void wake_all_blocked_workers(void)
+{
+	/* workers may be blocked on the policy's global condition */
+	struct sched_policy_s *sched = get_sched_policy();
+	pthread_cond_t *sched_cond = &sched->sched_activity_cond;
+	pthread_mutex_t *sched_mutex = &sched->sched_activity_mutex;
+
+	pthread_mutex_lock(sched_mutex);
+	pthread_cond_broadcast(sched_cond);
+	pthread_mutex_unlock(sched_mutex);
+
+	/* workers may be blocked on the various queues' conditions */
+	unsigned node;
+	for (node = 0; node < descr.nnodes; node++)
+	{
+		wake_all_blocked_workers_on_node(node);
+	}
+}
+
+void init_memory_nodes()
+{
+	/* there is no node yet, subsequent nodes will be 
+	 * added using register_memory_node */
+	descr.nnodes = 0;
+
+	pthread_key_create(&memory_node_key, NULL);
+
+	unsigned i;
+	for (i = 0; i < MAXNODES; i++) 
+	{
+		descr.nodes[i] = UNUSED; 
+	}
+
+	init_mem_chunk_lists();
+	init_data_request_lists();
+}
+
+void set_local_memory_node_key(unsigned *node)
+{
+	pthread_setspecific(memory_node_key, node);
+}
+
+unsigned get_local_memory_node(void)
+{
+	unsigned *memory_node;
+	memory_node = pthread_getspecific(memory_node_key);
+	
+	/* in case this is called by the programmer, we assume the RAM node 
+	   is the appropriate memory node ... so we return 0 XXX */
+	if (STARPU_UNLIKELY(!memory_node))
+		return 0;
+
+	return *memory_node;
+}
+
+/* Kind of memory (RAM / CUDA_RAM / ...) embodied by the given node.
+ * NOTE(review): plain "inline" (no static/extern) relies on gnu89 inline
+ * semantics; under strict C99 the external definition is only provided
+ * because copy-driver.h declares the function without inline — confirm
+ * the build dialect. */
+inline node_kind get_node_kind(uint32_t node)
+{
+	return descr.nodes[node];
+}
+
+int allocate_per_node_buffer(data_state *state, uint32_t node)
+{
+	int ret;
+
+	if (!state->per_node[node].allocated) {
+		/* there is no room available for the data yet */
+		ret = allocate_memory_on_node(state, node);
+		if (STARPU_UNLIKELY(ret == -ENOMEM))
+			goto nomem;
+	}
+
+	return 0;
+nomem:
+	/* there was not enough memory to allocate the buffer */
+	return -ENOMEM;
+}
+
+#ifdef USE_FXT
+/* we need to identify each communication so that we can match the beginning
+ * and the end of a communication in the trace, so we use a unique identifier
+ * per communication */
+static unsigned communication_cnt = 0;
+#endif
+
+/* Copy the data from src_node to dst_node, making sure the destination
+ * buffer is allocated first.  When donotread is set (the content is about
+ * to be completely overwritten) only the allocation is performed.
+ * Returns 0 on success, -ENOMEM if the destination allocation failed,
+ * otherwise the interface's copy method return code. */
+int __attribute__((warn_unused_result)) driver_copy_data_1_to_1(data_state *state, uint32_t src_node, 
+				uint32_t dst_node, unsigned donotread)
+{
+	int ret_alloc, ret_copy;
+	unsigned __attribute__((unused)) com_id = 0;
+
+	/* first make sure the destination has an allocated buffer */
+	ret_alloc = allocate_per_node_buffer(state, dst_node);
+	if (ret_alloc)
+		goto nomem;
+
+	/* if there is no need to actually read the data, 
+	 * we do not perform any transfer */
+	if (!donotread) {
+		STARPU_ASSERT(state->ops);
+		STARPU_ASSERT(state->ops->copy_data_1_to_1);
+
+#ifdef DATA_STATS
+		size_t size = state->ops->get_size(state);
+		update_comm_ammount(src_node, dst_node, size);
+#endif
+		
+#ifdef USE_FXT
+		com_id = STARPU_ATOMIC_ADD(&communication_cnt, 1);
+#endif
+
+		/* for now we set the size to 0 in the FxT trace XXX */
+		TRACE_START_DRIVER_COPY(src_node, dst_node, 0, com_id);
+		ret_copy = state->ops->copy_data_1_to_1(state, src_node, dst_node);
+		TRACE_END_DRIVER_COPY(src_node, dst_node, 0, com_id);
+
+		return ret_copy;
+	}
+
+	return 0;
+
+nomem:
+	return -ENOMEM;
+}
+
+static uint32_t choose_src_node(uint32_t src_node_mask)
+{
+	unsigned src_node = 0;
+	unsigned i;
+
+	/* first find the node that will be the actual source */
+	for (i = 0; i < MAXNODES; i++)
+	{
+		if (src_node_mask & (1<<i))
+		{
+			/* this is a potential candidate */
+			src_node = i;
+
+			/* however GPU are expensive sources, really !
+			 * 	other should be ok */
+			if (descr.nodes[i] != CUDA_RAM)
+				break;
+
+			/* XXX do a better algorithm to distribute the memory copies */
+		}
+	}
+
+	return src_node;
+}
+
+__attribute__((warn_unused_result))
+int driver_copy_data(data_state *state, uint32_t src_node_mask,
+			 uint32_t dst_node, unsigned donotread)
+{
+	int ret;
+	uint32_t src_node = choose_src_node(src_node_mask);
+
+	/* possibly returns -1 if there was no memory left */
+	ret = driver_copy_data_1_to_1(state, src_node, dst_node, donotread);
+
+	return ret;
+}

+ 67 - 0
src/datawizard/copy-driver.h

@@ -0,0 +1,67 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __COPY_DRIVER_H__
+#define __COPY_DRIVER_H__
+
+#include "coherency.h"
+#include "memalloc.h"
+
+#ifdef USE_CUDA
+#include <cublas.h>
+#endif
+
+
+/* kind of memory embodied by a node */
+typedef enum {
+	UNUSED,
+	SPU_LS,
+	RAM,
+	CUDA_RAM
+} node_kind;
+
+/* global table describing every registered memory node */
+typedef struct {
+	unsigned nnodes;
+	node_kind nodes[MAXNODES];
+
+	/* the list of queues that are attached to a given node */
+	// XXX 32 is set randomly !
+	struct jobq_s *attached_queues[MAXNODES][32];
+	/* the number of queues attached to each node */
+	unsigned queues_count[MAXNODES];
+} mem_node_descr;
+
+struct starpu_data_state_t;
+
+__attribute__((warn_unused_result))
+int driver_copy_data(struct starpu_data_state_t *state, uint32_t src_node_mask, uint32_t dst_node, unsigned donotread);
+
+void init_memory_nodes(void);
+void set_local_memory_node_key(unsigned *node);
+unsigned get_local_memory_node(void);
+unsigned register_memory_node(node_kind kind);
+void memory_node_attach_queue(struct jobq_s *q, unsigned nodeid);
+void wake_all_blocked_workers(void);
+void wake_all_blocked_workers_on_node(unsigned nodeid);
+
+node_kind get_node_kind(uint32_t node);
+
+__attribute__((warn_unused_result))
+int driver_copy_data_1_to_1(struct starpu_data_state_t *state, uint32_t node, 
+				uint32_t requesting_node, unsigned donotread);
+
+int allocate_per_node_buffer(struct starpu_data_state_t *state, uint32_t node);
+
+#endif // __COPY_DRIVER_H__

+ 22 - 0
src/datawizard/data_parameters.h

@@ -0,0 +1,22 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DATA_PARAMETERS_H__
+#define __DATA_PARAMETERS_H__
+
+#define MAXNODES	6
+
+#endif // __DATA_PARAMETERS_H__

+ 111 - 0
src/datawizard/data_request.c

@@ -0,0 +1,111 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <datawizard/data_request.h>
+
+/* one pending-request list per memory node, each guarded by its own mutex */
+static data_request_list_t data_requests[MAXNODES];
+static starpu_mutex data_requests_mutex[MAXNODES];
+
+/* Create the (empty) per-node request lists; called once at startup. */
+void init_data_request_lists(void)
+{
+	unsigned i;
+	for (i = 0; i < MAXNODES; i++)
+	{
+		data_requests[i] = data_request_list_new();
+		init_mutex(&data_requests_mutex[i]);
+	}
+}
+
+/* Post a copy request (src_node -> dst_node) on src_node's list, then wait
+ * for a worker driving that node to perform the transfer, while keeping
+ * the local node progressing to avoid deadlocks.
+ * Returns the transfer's return code.
+ * NB: called with the data's header lock held (see the RW-lock-less path:
+ * it is dropped around the wait and re-taken afterwards). */
+int post_data_request(data_state *state, uint32_t src_node, uint32_t dst_node)
+{
+	int retvalue;
+
+	data_request_t r = data_request_new();
+
+	r->state = state;
+	r->src_node = src_node;
+	r->dst_node = dst_node;
+	sem_init(&r->sem, 0, 0);
+
+	/* insert the request in the proper list */
+	take_mutex(&data_requests_mutex[src_node]);
+	data_request_list_push_front(data_requests[src_node], r);
+	release_mutex(&data_requests_mutex[src_node]);
+
+	/* wake the threads that could perform that operation */
+	wake_all_blocked_workers_on_node(src_node);
+
+	/* wait for the request to be performed */
+	//sem_wait(&r->sem);
+	//while(sem_trywait(&r->sem) == -1)
+	//	wake_all_blocked_workers_on_node(src_node);
+
+#ifdef NO_DATA_RW_LOCK
+	/* XXX: since there is no concurrency on this data (we don't use the
+	 * rw-lock) we can assume that the data on the source node should not
+	 * be invalidated.
+	 * TODO: handle the situation of a possible invalidation caused by
+	 * memory eviction mechanism. This could be done by the means of a
+	 * specific state (or flag) in the MSI protocol. */
+	release_mutex(&state->header_lock);
+#endif
+
+	/* busy-wait on the semaphore, but keep making progress locally so
+	 * that cross-node requests cannot deadlock each other */
+	while(sem_trywait(&r->sem) == -1)
+	{
+		wake_all_blocked_workers_on_node(src_node);
+		datawizard_progress(dst_node);
+	}
+
+#ifdef NO_DATA_RW_LOCK
+	take_mutex(&state->header_lock);
+#endif
+
+
+	retvalue = r->retval;
+	
+	/* the request is useless now */
+	data_request_delete(r);
+
+	return retvalue;	
+}
+
+/* Serve every pending copy request whose source is src_node; meant to be
+ * called by the worker driving that node (from datawizard_progress). */
+void handle_node_data_requests(uint32_t src_node)
+{
+	take_mutex(&data_requests_mutex[src_node]);
+
+	/* for all entries of the list */
+	data_request_list_t l = data_requests[src_node];
+	data_request_t r;
+
+	while (!data_request_list_empty(l))
+	{
+		r = data_request_list_pop_back(l);		
+		/* drop the list mutex during the (possibly long) transfer so
+		 * that new requests can still be posted meanwhile */
+		release_mutex(&data_requests_mutex[src_node]);
+
+		/* TODO : accounting to see how much time was spent working for other people ... */
+
+		/* perform the transfer */
+		/* the header of the data must be locked by the worker that submitted the request */
+		r->retval = driver_copy_data_1_to_1(r->state, r->src_node, r->dst_node, 0);
+		
+		/* wake the requesting worker up */
+		sem_post(&r->sem);
+
+		take_mutex(&data_requests_mutex[src_node]);
+	}
+
+	release_mutex(&data_requests_mutex[src_node]);
+}

+ 39 - 0
src/datawizard/data_request.h

@@ -0,0 +1,39 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DATA_REQUEST_H__
+#define __DATA_REQUEST_H__
+
+#include <semaphore.h>
+#include <datawizard/coherency.h>
+#include <datawizard/copy-driver.h>
+#include <common/list.h>
+
+struct starpu_data_state_t;
+
+/* a pending asynchronous copy of 'state' from src_node to dst_node */
+LIST_TYPE(data_request,
+	struct starpu_data_state_t *state;
+	uint32_t src_node;
+	uint32_t dst_node;
+	/* posted by the worker that performed the copy, once it is done */
+	sem_t sem;
+	/* return code of the transfer (e.g. -ENOMEM) */
+	int retval;
+);
+
+void init_data_request_lists(void);
+int post_data_request(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
+void handle_node_data_requests(uint32_t src_node);
+
+#endif // __DATA_REQUEST_H__

+ 128 - 0
src/datawizard/datastats.c

@@ -0,0 +1,128 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdio.h>
+#include <datawizard/datastats.h>
+#include <common/config.h>
+#include <starpu.h>
+
+/* measure the cache hit ratio for each node */
+
+#ifdef DATA_STATS
+static unsigned hit_cnt[16];
+static unsigned miss_cnt[16];
+#endif
+
/* Account an MSI cache hit on the node (no-op unless DATA_STATS is set).
 * Not "inline": datastats.h also declares it inline, so under C99 inline
 * semantics no external definition would be emitted and callers in other
 * compilation units would fail to link. */
void msi_cache_hit(unsigned node __attribute__ ((unused)))
{
#ifdef DATA_STATS
	hit_cnt[node]++;
#endif
}
+
/* Account an MSI cache miss on the node (no-op unless DATA_STATS is set).
 * Not "inline": an inline-only C99 definition (the header declares it
 * inline too) provides no external definition for other units. */
void msi_cache_miss(unsigned node __attribute__ ((unused)))
{
#ifdef DATA_STATS
	miss_cnt[node]++;
#endif
}
+
/* Print per-node MSI hit/miss counters to stderr (DATA_STATS builds only). */
void display_msi_stats(void)
{
#ifdef DATA_STATS
	fprintf(stderr, "MSI cache stats :\n");
	unsigned node;
	/* NOTE(review): only the first 4 nodes are displayed although the
	 * counters are sized for 16 — confirm the intended bound */
	for (node = 0; node < 4; node++) 
	{
		if (hit_cnt[node]+miss_cnt[node]) 
		{
			/* "%u" matches the unsigned arguments; a literal '%'
			 * is "%%" — the former "\%%" was an invalid escape
			 * followed by a broken conversion specification */
			fprintf(stderr, "memory node %u\n", node);
			fprintf(stderr, "\thit : %u (%2.2f %%)\n", hit_cnt[node], (100.0f*hit_cnt[node])/(hit_cnt[node]+miss_cnt[node]));
			fprintf(stderr, "\tmiss : %u (%2.2f %%)\n", miss_cnt[node], (100.0f*miss_cnt[node])/(hit_cnt[node]+miss_cnt[node]));
		}
	}
#endif
}
+
+/* measure the efficiency of our allocation cache */
+
+#ifdef DATA_STATS
+static unsigned alloc_cnt[16];
+static unsigned alloc_cache_hit_cnt[16];
+#endif
+
/* Account a hit in the buffer allocation cache (no-op unless DATA_STATS).
 * Not "inline": see msi_cache_hit — C99 inline-only definitions emit no
 * external symbol for callers in other compilation units. */
void allocation_cache_hit(unsigned node __attribute__ ((unused)))
{
#ifdef DATA_STATS
	alloc_cache_hit_cnt[node]++;
#endif
}
+
/* Account one buffer allocation on the node (no-op unless DATA_STATS).
 * Not "inline": an inline-only C99 definition provides no external
 * definition, breaking the link from other compilation units. */
void data_allocation_inc_stats(unsigned node __attribute__ ((unused)))
{
#ifdef DATA_STATS
	alloc_cnt[node]++;
#endif
}
+
/* Print per-node allocation-cache efficiency to stderr (DATA_STATS only). */
void display_alloc_cache_stats(void)
{
#ifdef DATA_STATS
	fprintf(stderr, "Allocation cache stats:\n");
	unsigned node;
	for (node = 0; node < 4; node++) 
	{
		if (alloc_cnt[node]) 
		{
			/* "%u" matches the unsigned arguments; "%%" (not the
			 * former invalid "\%%") prints a literal '%' */
			fprintf(stderr, "memory node %u\n", node);
			fprintf(stderr, "\ttotal alloc : %u\n", alloc_cnt[node]);
			fprintf(stderr, "\tcached alloc: %u (%2.2f %%)\n", 
				alloc_cache_hit_cnt[node], (100.0f*alloc_cache_hit_cnt[node])/(alloc_cnt[node]));
		}
	}
#endif
}
+
/* measure the amount of data transfers between each pair of nodes */
#ifdef DATA_STATS

/* total bytes transferred from [src] to [dst]
 * NOTE(review): hard-coded 8x8 while other tables use MAXNODES — confirm */
static size_t comm_ammount[8][8];

/* Dump the full src x dst transfer matrix (in MB) to stderr. */
void display_comm_ammounts(void)
{
	unsigned src, dst;

	for (dst = 0; dst < 8; dst++)
	for (src = 0; src < 8; src++)
	{
		if (comm_ammount[src][dst])
			/* "%u" matches the unsigned node ids and the casted
			 * byte count (the former "%d" was a type mismatch) */
			fprintf(stderr, "Total comm from %u to %u \t%uMB\n", src, dst, ((unsigned)comm_ammount[src][dst])/(1024*1024));
	}
}

/* Account 'size' bytes transferred from src_node to dst_node.
 * Not "inline": the header declares it inline as well, so a C99
 * inline-only definition would emit no external symbol for callers
 * in other compilation units. */
void update_comm_ammount(uint32_t src_node, uint32_t dst_node, size_t size)
{
	comm_ammount[src_node][dst_node] += size;
}

#else

/* stub when statistics are disabled */
void display_comm_ammounts(void)
{
}

#endif
+

+ 40 - 0
src/datawizard/datastats.h

@@ -0,0 +1,40 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DATASTATS_H__
+#define __DATASTATS_H__
+
+#include <stdint.h>
+#include <stdlib.h>
+
+/* Statistics helpers for the data management layer: MSI protocol cache
+ * hits/misses, allocation-cache efficiency, and the amount of data moved
+ * between memory nodes.  The counters behind these calls are only compiled
+ * in when DATA_STATS is defined; otherwise the calls are no-ops.
+ * NOTE(review): declaring functions "inline" in a header without a visible
+ * definition is fragile under C99 inline semantics -- confirm this links
+ * correctly with the project's compiler flags. */
+
+/* MSI cache counters */
+inline void msi_cache_hit(unsigned node);
+inline void msi_cache_miss(unsigned node);
+
+void display_msi_stats(void);
+
+/* allocation cache counters */
+inline void allocation_cache_hit(unsigned node __attribute__ ((unused)));
+inline void data_allocation_inc_stats(unsigned node __attribute__ ((unused)));
+
+
+/* print the statistics gathered so far on stderr */
+void display_comm_ammounts(void);
+void display_alloc_cache_stats(void);
+
+#ifdef DATA_STATS
+/* record a "size"-byte transfer from src_node to dst_node */
+inline void update_comm_ammount(uint32_t src_node, uint32_t dst_node, size_t size);
+#endif
+
+#endif

+ 41 - 0
src/datawizard/datawizard.h

@@ -0,0 +1,41 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DATAWIZARD_H__
+#define __DATAWIZARD_H__
+
+#include <datawizard/coherency.h>
+#include <datawizard/hierarchy.h>
+#include <datawizard/copy-driver.h>
+#include <datawizard/footprint.h>
+
+#include <datawizard/progress.h>
+#include <datawizard/data_request.h>
+
+#include <datawizard/interfaces/data_interface.h>
+
+#include <datawizard/interfaces/blas_interface.h>
+#include <datawizard/interfaces/vector_interface.h>
+#include <datawizard/interfaces/csr_interface.h>
+#include <datawizard/interfaces/csc_interface.h>
+#include <datawizard/interfaces/bcsr_interface.h>
+
+#include <datawizard/interfaces/blas_filters.h>
+#include <datawizard/interfaces/vector_filters.h>
+#include <datawizard/interfaces/csr_filters.h>
+#include <datawizard/interfaces/bcsr_filters.h>
+
+#endif // __DATAWIZARD_H__

+ 45 - 0
src/datawizard/footprint.c

@@ -0,0 +1,45 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <datawizard/footprint.h>
+
+/* Fold the footprint of every buffer accessed by job "j" into one 32-bit
+ * hash and cache it on the job; footprint_is_computed is raised so the
+ * value is not recomputed later. */
+void compute_buffers_footprint(job_t j)
+{
+	struct starpu_task *task = j->task;
+	uint32_t hash = 0;
+	unsigned i;
+
+	for (i = 0; i < task->cl->nbuffers; i++)
+	{
+		data_state *state = task->buffers[i].state;
+
+		/* every registered interface must provide a footprint method */
+		STARPU_ASSERT(state->ops);
+		STARPU_ASSERT(state->ops->footprint);
+
+		hash = state->ops->footprint(state, hash);
+	}
+
+	j->footprint = hash;
+	j->footprint_is_computed = 1;
+}
+
+/* Compute the footprint of a single piece of data, seeding the hash with
+ * the id of its interface.
+ * NOTE(review): compute_buffers_footprint() seeds with 0 while this seeds
+ * with interfaceid -- confirm the asymmetry is intentional. */
+inline uint32_t compute_data_footprint(data_state *state)
+{
+	uint32_t interfaceid = state->ops->interfaceid;
+
+	return state->ops->footprint(state, interfaceid);
+}

+ 27 - 0
src/datawizard/footprint.h

@@ -0,0 +1,27 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __FOOTPRINT_H__
+#define __FOOTPRINT_H__
+
+#include <core/jobs.h>
+
+struct job_s;
+
+void compute_buffers_footprint(struct job_s *j);
+inline uint32_t compute_data_footprint(data_state *state);
+
+#endif // __FOOTPRINT_H__

+ 327 - 0
src/datawizard/hierarchy.c

@@ -0,0 +1,327 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "hierarchy.h"
+
+/* 
+ * Stop monitoring a data
+ */
+/* TODO : move in a more appropriate file */
+/* Release the resources attached to a monitored piece of data: request a
+ * lazy removal of every local copy the runtime allocated itself and, in
+ * NO_DATA_RW_LOCK mode, destroy the pending-request list.
+ * NOTE(review): the data_state structure itself is not freed here --
+ * confirm that the caller owns that memory. */
+void starpu_delete_data(data_state *state)
+{
+	unsigned node;
+
+	STARPU_ASSERT(state);
+	for (node = 0; node < MAXNODES; node++)
+	{
+		local_data_state *local = &state->per_node[node];
+
+		if (local->allocated && local->automatically_allocated){
+			/* free the data copy in a lazy fashion */
+			request_mem_chunk_removal(state, node);
+		}
+	}
+
+#ifdef NO_DATA_RW_LOCK
+	data_requester_list_delete(state->req_list);
+#endif
+}
+
+/* Start monitoring a piece of data: initialize its locks and per-node MSI
+ * state.  "home_node" is the only node that initially holds a valid copy
+ * (marked OWNER, every other node INVALID); "wb_mask" selects the nodes the
+ * data must be written back to.  The header lock is held while the fields
+ * are set up so no one can observe a half-initialized descriptor. */
+void monitor_new_data(data_state *state, uint32_t home_node, uint32_t wb_mask)
+{
+	STARPU_ASSERT(state);
+
+	/* initialize the new lock */
+#ifndef NO_DATA_RW_LOCK
+	init_rw_lock(&state->data_lock);
+#else
+	state->req_list = data_requester_list_new();
+	state->refcnt = 0;
+#endif
+	init_mutex(&state->header_lock);
+
+	/* first take care to properly lock the data */
+	take_mutex(&state->header_lock);
+
+	/* we assume that all nodes may use that data */
+	state->nnodes = MAXNODES;
+
+	/* there is no hierarchy yet */
+	state->nchildren = 0;
+
+	state->is_not_important = 0;
+
+	/* make sure we do have a valid copy */
+	STARPU_ASSERT(home_node < MAXNODES);
+
+	state->wb_mask = wb_mask;
+
+	/* that new data is invalid from all nodes' perspective except for the
+	 * home node */
+	unsigned node;
+	for (node = 0; node < MAXNODES; node++)
+	{
+		if (node == home_node) {
+			/* this is the home node with the only valid copy */
+			state->per_node[node].state = OWNER;
+			state->per_node[node].allocated = 1;
+			state->per_node[node].automatically_allocated = 0;
+			state->per_node[node].refcnt = 0;
+		}
+		else {
+			/* the value is not available here yet */
+			state->per_node[node].state = INVALID;
+			state->per_node[node].allocated = 0;
+			state->per_node[node].refcnt = 0;
+		}
+	}
+
+	/* now the data is available ! */
+	release_mutex(&state->header_lock);
+}
+
+/*
+ * This function applies a starpu_filter on all the elements of a partition
+ */
+/* Recursively walk the partition tree below "root_data" and apply the
+ * starpu_filter "f" to every leaf (i.e. every node without children). */
+static void map_filter(data_state *root_data, starpu_filter *f)
+{
+	if (root_data->nchildren == 0)
+	{
+		/* leaf: apply the filter right here */
+		starpu_partition_data(root_data, f);
+		return;
+	}
+
+	/* inner node: recurse into every child */
+	int i;
+	for (i = 0; i < root_data->nchildren; i++)
+		map_filter(&root_data->children[i], f);
+}
+
+/* Apply "nfilters" filters -- passed as variadic starpu_filter pointers --
+ * to every leaf of the partition tree rooted at "root_data", in the order
+ * they are given. */
+void starpu_map_filters(data_state *root_data, unsigned nfilters, ...)
+{
+	unsigned i;
+	va_list pa;
+	va_start(pa, nfilters);
+	for (i = 0; i < nfilters; i++)
+	{
+		starpu_filter *next_filter;
+		next_filter = va_arg(pa, starpu_filter *);
+
+		STARPU_ASSERT(next_filter);
+
+		map_filter(root_data, next_filter);
+	}
+	va_end(pa);
+}
+
+/*
+ * example get_sub_data(data_state *root_data, 3, 42, 0, 1);
+ */
+/* Descend "depth" levels into the partition tree, one variadic child index
+ * per level, and return the corresponding sub-data.  The number of variadic
+ * arguments must equal "depth"; each index is checked against nchildren. */
+data_state *get_sub_data(data_state *root_data, unsigned depth, ... )
+{
+	STARPU_ASSERT(root_data);
+	data_state *current_data = root_data;
+
+	/* the variable number of argument must correlate the depth in the tree */
+	unsigned i; 
+	va_list pa;
+	va_start(pa, depth);
+	for (i = 0; i < depth; i++)
+	{
+		unsigned next_child;
+		next_child = va_arg(pa, unsigned);
+
+		STARPU_ASSERT((int)next_child < current_data->nchildren);
+
+		current_data = &current_data->children[next_child];
+	}
+	va_end(pa);
+
+	return current_data;
+}
+
+/*
+ * For now, we assume that partitionned_data is already properly allocated;
+ * at least by the starpu_filter function !
+ */
+/* Split "initial_data" into children according to the starpu_filter "f".
+ * The filter function is expected to have allocated/populated the children
+ * array and returns the number of parts; each child then inherits the
+ * parent's per-node coherency state, write-back mask and (unless the filter
+ * overrode them) interface methods.  The parent's header lock is held for
+ * the whole operation. */
+void starpu_partition_data(data_state *initial_data, starpu_filter *f)
+{
+	int nparts;
+	int i;
+
+	/* first take care to properly lock the data header */
+	take_mutex(&initial_data->header_lock);
+
+	/* there should not be multiple filters applied on the same data */
+	STARPU_ASSERT(initial_data->nchildren == 0);
+
+	/* this should update the pointers and size of the chunk */
+	nparts = f->filter_func(f, initial_data);
+	STARPU_ASSERT(nparts > 0);
+
+	initial_data->nchildren = nparts;
+
+	for (i = 0; i < nparts; i++)
+	{
+		data_state *children = &initial_data->children[i];
+
+		STARPU_ASSERT(children);
+
+		children->nchildren = 0;
+
+		children->is_not_important = initial_data->is_not_important;
+
+		/* it is possible that the children does not use the same interface as the parent,
+		 * in that case, the starpu_filter must set the proper methods */
+		if (!children->ops)
+			children->ops = initial_data->ops;
+
+		children->wb_mask = initial_data->wb_mask;
+
+		/* initialize the chunk lock */
+#ifndef NO_DATA_RW_LOCK
+		init_rw_lock(&children->data_lock);
+#else
+		children->req_list = data_requester_list_new();
+		children->refcnt = 0;
+#endif
+		init_mutex(&children->header_lock);
+
+		/* children start with the parent's coherency state on every node */
+		unsigned node;
+		for (node = 0; node < MAXNODES; node++)
+		{
+			children->per_node[node].state = 
+				initial_data->per_node[node].state;
+			children->per_node[node].allocated = 
+				initial_data->per_node[node].allocated;
+			children->per_node[node].automatically_allocated = initial_data->per_node[node].automatically_allocated;
+			children->per_node[node].refcnt = 0;
+		}
+	}
+
+	/* partitioning is done: release the parent's header lock */
+	release_mutex(&initial_data->header_lock);
+}
+
+/* Gather a partitioned piece of data back into a single one: fetch a valid
+ * copy of every child on "gathering_node", recompute the per-node coherency
+ * state of the parent (OWNER if exactly one node still has a valid copy,
+ * SHARED otherwise), then drop the children. */
+void starpu_unpartition_data(data_state *root_data, uint32_t gathering_node)
+{
+	int child;
+	unsigned node;
+
+	take_mutex(&root_data->header_lock);
+
+#ifdef NO_DATA_RW_LOCK
+#warning starpu_unpartition_data is not supported with NO_DATA_RW_LOCK yet ...
+#endif
+
+	/* first take all the children lock (in order !) */
+	for (child = 0; child < root_data->nchildren; child++)
+	{
+		/* make sure the intermediate children is unpartitionned as well */
+		if (root_data->children[child].nchildren > 0)
+			starpu_unpartition_data(&root_data->children[child], gathering_node);
+
+		int ret;
+		ret = _fetch_data(&root_data->children[child], gathering_node, 1, 0);
+		/* for now we pretend that the RAM is almost unlimited and that gathering 
+		 * data should be possible from the node that does the unpartionning ... we
+		 * don't want to have the programming deal with memory shortage at that time,
+		 * really */
+		STARPU_ASSERT(ret == 0); 
+	}
+
+	/* the gathering_node should now have a valid copy of all the children.
+	 * For all nodes, if the node had all copies and none was locally
+	 * allocated then the data is still valid there, else, it's invalidated
+	 * for the gathering node, if we have some locally allocated data, we 
+	 * copy all the children (XXX this should not happen so we just do not
+	 * do anything since this is transparent ?) */
+	unsigned still_valid[MAXNODES];
+
+	/* we do 2 passes : the first pass determines wether the data is still
+	 * valid or not, the second pass is needed to choose between SHARED and
+	 * OWNER */
+
+	unsigned nvalids = 0;
+
+	/* still valid ? */
+	for (node = 0; node < MAXNODES; node++)
+	{
+		/* until an issue is found the data is assumed to be valid */
+		unsigned isvalid = 1;
+
+		for (child = 0; child < root_data->nchildren; child++)
+		{
+			local_data_state *local = &root_data->children[child].per_node[node];
+
+			if (local->state == INVALID) {
+				isvalid = 0; 
+			}
+	
+			if (local->allocated && local->automatically_allocated){
+				/* free the data copy in a lazy fashion */
+				/* NOTE(review): the copy being examined belongs to
+				 * the child, but the removal is requested on the
+				 * parent -- should this be
+				 * &root_data->children[child] ?  confirm. */
+				request_mem_chunk_removal(root_data, node);
+				isvalid = 0; 
+			}
+		}
+
+		/* no problem was found so the node still has a valid copy */
+		still_valid[node] = isvalid;
+
+		/* BUGFIX: only count nodes that actually kept a valid copy;
+		 * the previous unconditional increment made nvalids equal to
+		 * MAXNODES, so the state below could never become OWNER */
+		if (isvalid)
+			nvalids++;
+	}
+
+	/* either shared or owned */
+	STARPU_ASSERT(nvalids > 0);
+
+	cache_state newstate = (nvalids == 1)?OWNER:SHARED;
+
+	for (node = 0; node < MAXNODES; node++)
+	{
+		root_data->per_node[node].state = 
+			still_valid[node]?newstate:INVALID;
+	}
+
+	/* there is no child anymore */
+	root_data->nchildren = 0;
+
+	/* now the parent may be used again so we release the lock */
+	release_mutex(&root_data->header_lock);
+}
+
+/* Advise the runtime whether a piece of data is important (worth saving
+ * before eviction).  The advice is propagated recursively through the
+ * partition tree while the header lock is held.
+ * NOTE(review): only children that are themselves partitioned are visited;
+ * leaf children keep their old is_not_important flag -- confirm intended. */
+void starpu_advise_if_data_is_important(data_state *state, unsigned is_important)
+{
+	take_mutex(&state->header_lock);
+
+	/* propagate the advice into every partitioned child first */
+	int i;
+	for (i = 0; i < state->nchildren; i++)
+	{
+		data_state *child = &state->children[i];
+		if (child->nchildren > 0)
+			starpu_advise_if_data_is_important(child, is_important);
+	}
+
+	state->is_not_important = (is_important == 0);
+
+	/* the data may be used again: release the header lock */
+	release_mutex(&state->header_lock);
+}

+ 28 - 0
src/datawizard/hierarchy.h

@@ -0,0 +1,28 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __HIERARCHY_H__
+#define __HIERARCHY_H__
+
+#include <stdarg.h>
+#include <datawizard/coherency.h>
+#include <datawizard/memalloc.h>
+
+#include <starpu.h>
+
+void monitor_new_data(struct starpu_data_state_t *state, uint32_t home_node, uint32_t wb_mask);
+
+#endif

+ 29 - 0
src/datawizard/interfaces/Makefile

@@ -0,0 +1,29 @@
+#
+# StarPU
+# Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+OBJS := bcsr_interface.o csr_interface.o blas_filters.o blas_interface.o vector_interface.o bcsr_filters.o csr_filters.o vector_filters.o
+
+all: $(OBJS)
+
+ifeq ($(filter ${MAKECMDGOALS},clean distclean),)
+%.d: %.c
+	$(CC) $(CFLAGS) $< -MM -o $*.d
+
+-include $(OBJS:.o=.d)
+endif
+
+clean:
+	@rm -f *.o *.d *.gcno *.gcda

+ 77 - 0
src/datawizard/interfaces/bcsr_filters.c

@@ -0,0 +1,77 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "bcsr_filters.h"
+#include "bcsr_interface.h"
+#include "blas_filters.h"
+#include "blas_interface.h"
+
+extern struct data_interface_ops_t interface_blas_ops;
+
+/* Canonical BCSR filter: create one child per non-zero (r x c) block of the
+ * sparse matrix; each child is exposed as a small dense BLAS matrix.
+ * Returns the number of children created (== nnz). */
+unsigned starpu_canonical_block_filter_bcsr(starpu_filter *f __attribute__((unused)), data_state *root_data)
+{
+	unsigned nchunks;
+
+	uint32_t nnz = root_data->interface[0].bcsr.nnz;
+
+	size_t elemsize = root_data->interface[0].bcsr.elemsize;
+	uint32_t firstentry = root_data->interface[0].bcsr.firstentry;
+
+	/* size of the tiles */
+	uint32_t r = root_data->interface[0].bcsr.r;
+	uint32_t c = root_data->interface[0].bcsr.c;
+
+	/* we create as many subdata as there are blocks ... */
+	nchunks = nnz;
+	
+	/* first allocate the children data_state (zeroed, so any field not set
+	 * below -- e.g. ptr on nodes without a copy -- starts at 0/NULL) */
+	root_data->children = calloc(nchunks, sizeof(data_state));
+	STARPU_ASSERT(root_data->children);
+
+	/* actually create all the chunks */
+
+	/* XXX */
+	STARPU_ASSERT(root_data->per_node[0].allocated);
+
+	/* each chunk becomes a small dense matrix */
+	unsigned chunk;
+	for (chunk = 0; chunk < nchunks; chunk++)
+	{
+		/* byte offset of this block inside the nzval array */
+		uint32_t ptr_offset = c*r*chunk*elemsize;
+
+		unsigned node;
+		for (node = 0; node < MAXNODES; node++)
+		{
+			starpu_blas_interface_t *local = &root_data->children[chunk].interface[node].blas;
+
+			local->nx = c;
+			local->ny = r;
+			local->ld = c;
+			local->elemsize = elemsize;
+
+			/* the data pointer is only meaningful on nodes that
+			 * already hold a copy of the parent */
+			if (root_data->per_node[node].allocated) {
+				uint8_t *nzval = (uint8_t *)(root_data->interface[node].bcsr.nzval);
+				local->ptr = (uintptr_t)&nzval[firstentry + ptr_offset];
+			}
+		}
+
+		/* the children use the dense BLAS interface, not BCSR */
+		struct starpu_data_state_t *state = &root_data->children[chunk];
+		state->ops = &interface_blas_ops;
+	}
+
+	return nchunks;
+
+}

+ 22 - 0
src/datawizard/interfaces/bcsr_filters.h

@@ -0,0 +1,22 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __BCSR_FILTERS_H__
+#define __BCSR_FILTERS_H__
+
+#include <datawizard/hierarchy.h>
+
+#endif // __BCSR_FILTERS_H__

+ 491 - 0
src/datawizard/interfaces/bcsr_interface.c

@@ -0,0 +1,491 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <datawizard/data_parameters.h>
+#include <datawizard/coherency.h>
+#include <datawizard/copy-driver.h>
+#include <datawizard/hierarchy.h>
+#include <starpu.h>
+
+#include <common/hash.h>
+
+#ifdef USE_CUDA
+#include <cuda.h>
+#endif
+
+/*
+ * BCSR : blocked CSR, we use blocks of size (r x c)
+ */
+/* forward declarations of the BCSR interface methods */
+size_t allocate_bcsr_buffer_on_node(struct starpu_data_state_t *state, uint32_t dst_node);
+void liberate_bcsr_buffer_on_node(starpu_data_interface_t *interface, uint32_t node);
+size_t dump_bcsr_interface(starpu_data_interface_t *interface, void *_buffer);
+int do_copy_bcsr_buffer_1_to_1(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
+size_t bcsr_interface_get_size(struct starpu_data_state_t *state);
+uint32_t footprint_bcsr_interface_crc32(data_state *state, uint32_t hstate);
+
+/* method table registered for every BCSR-monitored piece of data */
+struct data_interface_ops_t interface_bcsr_ops = {
+	.allocate_data_on_node = allocate_bcsr_buffer_on_node,
+	.liberate_data_on_node = liberate_bcsr_buffer_on_node,
+	.copy_data_1_to_1 = do_copy_bcsr_buffer_1_to_1,
+	.dump_data_interface = dump_bcsr_interface,
+	.get_size = bcsr_interface_get_size,
+	.interfaceid = BCSR_INTERFACE,
+	.footprint = footprint_bcsr_interface_crc32
+};
+
+/* Register a BCSR (blocked CSR) sparse matrix with the runtime.  The matrix
+ * has "nnz" non-zero (r x c) blocks over "nrow" block rows; nzval/colind/
+ * rowptr are only valid on "home_node".  A new data_state is allocated and
+ * returned through "handle"; the scalar parameters are replicated on every
+ * node's interface, the pointers only on the home node. */
+void starpu_monitor_bcsr_data(struct starpu_data_state_t **handle, uint32_t home_node,
+		uint32_t nnz, uint32_t nrow, uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry,  uint32_t r, uint32_t c, size_t elemsize)
+{
+	struct starpu_data_state_t *state = calloc(1, sizeof(struct starpu_data_state_t));
+	STARPU_ASSERT(state);
+
+	STARPU_ASSERT(handle);
+	*handle = state;
+
+	unsigned node;
+	for (node = 0; node < MAXNODES; node++)
+	{
+		starpu_bcsr_interface_t *local_interface = &state->interface[node].bcsr;
+
+		if (node == home_node) {
+			local_interface->nzval = nzval;
+			local_interface->colind = colind;
+			local_interface->rowptr = rowptr;
+		}
+		else {
+			local_interface->nzval = 0;
+			local_interface->colind = NULL;
+			local_interface->rowptr = NULL;
+		}
+
+		local_interface->nnz = nnz;
+		local_interface->nrow = nrow;
+		local_interface->firstentry = firstentry;
+		local_interface->r = r;
+		local_interface->c = c;
+		local_interface->elemsize = elemsize;
+	}
+
+	state->ops = &interface_bcsr_ops;
+
+	/* wb_mask is 0: no write-back nodes for BCSR data */
+	monitor_new_data(state, home_node, 0);
+}
+
+/* Fold the BCSR shape parameters (nnz, c, r) into the running hash "hstate"
+ * using the supplied hash function.  Only the shape is hashed, not the
+ * matrix contents. */
+static inline uint32_t footprint_bcsr_interface_generic(uint32_t (*hash_func)(uint32_t input, uint32_t hstate), data_state *state, uint32_t hstate)
+{
+	uint32_t hash;
+
+	hash = hstate;
+	hash = hash_func(starpu_get_bcsr_nnz(state), hash);
+	hash = hash_func(starpu_get_bcsr_c(state), hash);
+	hash = hash_func(starpu_get_bcsr_r(state), hash);
+
+	return hash;
+}
+
+/* CRC32-based footprint, the variant registered in interface_bcsr_ops */
+uint32_t footprint_bcsr_interface_crc32(data_state *state, uint32_t hstate)
+{
+	return footprint_bcsr_interface_generic(crc32_be, state, hstate);
+}
+
+
+
+/* packed on-the-wire layout written by dump_bcsr_interface()
+ * NOTE(review): elemsize is stored as uint32_t while the live interface
+ * carries it as size_t -- values above 2^32 would be truncated on 64-bit
+ * hosts; confirm this is acceptable for the dump consumers. */
+struct dumped_bcsr_interface_s {
+	uint32_t nnz;
+	uint32_t nrow;
+	uintptr_t nzval;
+	uint32_t *colind;
+	uint32_t *rowptr;
+	uint32_t firstentry;
+	uint32_t r;
+	uint32_t c;
+	uint32_t elemsize;
+}  __attribute__ ((packed));
+
+/* Serialize the BCSR interface description into "_buffer", field by field,
+ * and return the number of bytes written. */
+size_t dump_bcsr_interface(starpu_data_interface_t *interface, void *_buffer)
+{
+	/* yes, that's DIRTY ... */
+	struct dumped_bcsr_interface_s *dump = _buffer;
+	starpu_bcsr_interface_t *src = &interface->bcsr;
+
+	dump->nnz = src->nnz;
+	dump->nrow = src->nrow;
+	dump->nzval = src->nzval;
+	dump->colind = src->colind;
+	dump->rowptr = src->rowptr;
+	dump->firstentry = src->firstentry;
+	dump->r = src->r;
+	dump->c = src->c;
+	dump->elemsize = src->elemsize;
+
+	return (sizeof(struct dumped_bcsr_interface_s));
+}
+
+/* offer an access to the data parameters */
+/* The scalar parameters below are replicated identically on every node's
+ * interface (see starpu_monitor_bcsr_data), so node 0 is authoritative. */
+uint32_t starpu_get_bcsr_nnz(struct starpu_data_state_t *state)
+{
+	return (state->interface[0].bcsr.nnz);
+}
+
+uint32_t starpu_get_bcsr_nrow(struct starpu_data_state_t *state)
+{
+	return (state->interface[0].bcsr.nrow);
+}
+
+uint32_t starpu_get_bcsr_firstentry(struct starpu_data_state_t *state)
+{
+	return (state->interface[0].bcsr.firstentry);
+}
+
+/* height of one block */
+uint32_t starpu_get_bcsr_r(struct starpu_data_state_t *state)
+{
+	return (state->interface[0].bcsr.r);
+}
+
+/* width of one block */
+uint32_t starpu_get_bcsr_c(struct starpu_data_state_t *state)
+{
+	return (state->interface[0].bcsr.c);
+}
+
+size_t starpu_get_bcsr_elemsize(struct starpu_data_state_t *state)
+{
+	return (state->interface[0].bcsr.elemsize);
+}
+
+/* Return the nzval pointer of the copy held on the calling worker's local
+ * memory node; the copy must already be allocated there. */
+uintptr_t starpu_get_bcsr_local_nzval(struct starpu_data_state_t *state)
+{
+	unsigned node;
+	node = get_local_memory_node();
+
+	STARPU_ASSERT(state->per_node[node].allocated);
+
+	return (state->interface[node].bcsr.nzval);
+}
+
+/* NOTE(review): unlike nzval above, the two accessors below return the
+ * node-0 pointer instead of the local node's (the per-node version is left
+ * commented out, marked XXX) -- confirm whether this shortcut is still
+ * wanted. */
+uint32_t *starpu_get_bcsr_local_colind(struct starpu_data_state_t *state)
+{
+//	unsigned node;
+//	node = get_local_memory_node();
+//
+//	STARPU_ASSERT(state->per_node[node].allocated);
+//
+//	return (state->interface[node].bcsr.colind);
+
+	/* XXX */
+	return (state->interface[0].bcsr.colind);
+}
+
+uint32_t *starpu_get_bcsr_local_rowptr(struct starpu_data_state_t *state)
+{
+//	unsigned node;
+//	node = get_local_memory_node();
+//
+//	STARPU_ASSERT(state->per_node[node].allocated);
+//
+//	return (state->interface[node].bcsr.rowptr);
+	
+	/* XXX */
+	return (state->interface[0].bcsr.rowptr);
+}
+
+
+/* Total memory needed for one copy of the BCSR matrix: the dense (r x c)
+ * non-zero blocks plus the colind and rowptr index arrays. */
+size_t bcsr_interface_get_size(struct starpu_data_state_t *state)
+{
+	uint32_t nnz = starpu_get_bcsr_nnz(state);
+	uint32_t nrow = starpu_get_bcsr_nrow(state);
+	uint32_t r = starpu_get_bcsr_r(state);
+	uint32_t c = starpu_get_bcsr_c(state);
+	size_t elemsize = starpu_get_bcsr_elemsize(state);
+
+	size_t nzval_size = nnz*r*c*elemsize;
+	size_t index_size = nnz*sizeof(uint32_t) + (nrow+1)*sizeof(uint32_t);
+
+	return nzval_size + index_size;
+}
+
+
+/* memory allocation/deallocation primitives for the BLAS interface */
+
+/* returns the size of the allocated area */
+/* Allocate the three BCSR arrays (nzval, colind, rowptr) on "dst_node" with
+ * the allocator matching that node's kind, and record the new addresses in
+ * the node-local interface.  Returns the number of bytes allocated, or 0 on
+ * failure (in which case every partially allocated array is released). */
+size_t allocate_bcsr_buffer_on_node(struct starpu_data_state_t *state, uint32_t dst_node)
+{
+	uintptr_t addr_nzval;
+	uint32_t *addr_colind, *addr_rowptr;
+	size_t allocated_memory;
+
+	/* we need the 3 arrays to be allocated */
+
+	uint32_t nnz = state->interface[dst_node].bcsr.nnz;
+	uint32_t nrow = state->interface[dst_node].bcsr.nrow;
+	size_t elemsize = state->interface[dst_node].bcsr.elemsize;
+
+	uint32_t r = state->interface[dst_node].bcsr.r;
+	uint32_t c = state->interface[dst_node].bcsr.c;
+
+	node_kind kind = get_node_kind(dst_node);
+
+	switch(kind) {
+		case RAM:
+			addr_nzval = (uintptr_t)malloc(nnz*r*c*elemsize);
+			if (!addr_nzval)
+				goto fail_nzval;
+
+			addr_colind = malloc(nnz*sizeof(uint32_t));
+			if (!addr_colind)
+				goto fail_colind;
+
+			addr_rowptr = malloc((nrow+1)*sizeof(uint32_t));
+			if (!addr_rowptr)
+				goto fail_rowptr;
+
+			break;
+#ifdef USE_CUDA
+		case CUDA_RAM:
+			/* NOTE(review): cublasAlloc's status code is ignored
+			 * and the output pointer is tested instead -- the
+			 * pointer is not guaranteed to be NULL on failure;
+			 * consider checking the returned status instead. */
+			cublasAlloc(nnz*r*c, elemsize, (void **)&addr_nzval);
+			if (!addr_nzval)
+				goto fail_nzval;
+
+			cublasAlloc(nnz, sizeof(uint32_t), (void **)&addr_colind);
+			if (!addr_colind)
+				goto fail_colind;
+
+			cublasAlloc((nrow+1), sizeof(uint32_t), (void **)&addr_rowptr);
+			if (!addr_rowptr)
+				goto fail_rowptr;
+
+			break;
+#endif
+		default:
+			assert(0);
+	}
+
+	/* allocation succeeded */
+	allocated_memory = 
+		nnz*r*c*elemsize + nnz*sizeof(uint32_t) + (nrow+1)*sizeof(uint32_t);
+
+	/* update the data properly in consequence */
+	state->interface[dst_node].bcsr.nzval = addr_nzval;
+	state->interface[dst_node].bcsr.colind = addr_colind;
+	state->interface[dst_node].bcsr.rowptr = addr_rowptr;
+	
+	return allocated_memory;
+
+fail_rowptr:
+	/* unwind: release colind with the allocator that created it */
+	switch(kind) {
+		case RAM:
+			free((void *)addr_colind);
+			/* BUGFIX: missing break -- the RAM case used to fall
+			 * through into cublasFree() (or into assert(0) when
+			 * USE_CUDA is not defined) */
+			break;
+#ifdef USE_CUDA
+		case CUDA_RAM:
+			cublasFree((void*)addr_colind);
+			break;
+#endif
+		default:
+			assert(0);
+	}
+
+fail_colind:
+	/* unwind: release nzval with the allocator that created it */
+	switch(kind) {
+		case RAM:
+			free((void *)addr_nzval);
+			/* BUGFIX: missing break (same fall-through as above) */
+			break;
+#ifdef USE_CUDA
+		case CUDA_RAM:
+			cublasFree((void*)addr_nzval);
+			break;
+#endif
+		default:
+			assert(0);
+	}
+
+fail_nzval:
+
+	/* allocation failed */
+	allocated_memory = 0;
+
+	return allocated_memory;
+}
+
+/* Free the three BCSR arrays of the copy held on "node", using the
+ * allocator that matches the node kind (malloc/free for RAM, cublas for
+ * CUDA memory). */
+void liberate_bcsr_buffer_on_node(starpu_data_interface_t *interface, uint32_t node)
+{
+	node_kind kind = get_node_kind(node);
+	switch(kind) {
+		case RAM:
+			free((void*)interface->bcsr.nzval);
+			free((void*)interface->bcsr.colind);
+			free((void*)interface->bcsr.rowptr);
+			break;
+#ifdef USE_CUDA
+		case CUDA_RAM:
+			cublasFree((void*)interface->bcsr.nzval);
+			cublasFree((void*)interface->bcsr.colind);
+			cublasFree((void*)interface->bcsr.rowptr);
+			break;
+#endif
+		default:
+			assert(0);
+	}
+}
+
+#ifdef USE_CUDA
+/* Transfer the three BCSR arrays from CUDA memory (src_node) to host RAM
+ * (dst_node) with cublasGetVector, and trace the amount of data moved.
+ * NOTE(review): per do_copy_bcsr_buffer_1_to_1, this must run on the CUDA
+ * worker thread attached to src_node. */
+static void copy_cublas_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_bcsr_interface_t *src_bcsr;
+	starpu_bcsr_interface_t *dst_bcsr;
+
+	src_bcsr = &state->interface[src_node].bcsr;
+	dst_bcsr = &state->interface[dst_node].bcsr;
+
+	uint32_t nnz = src_bcsr->nnz;
+	uint32_t nrow = src_bcsr->nrow;
+	size_t elemsize = src_bcsr->elemsize;
+
+	uint32_t r = src_bcsr->r;
+	uint32_t c = src_bcsr->c;
+
+	cublasGetVector(nnz*r*c, elemsize, (uint8_t *)src_bcsr->nzval, 1, 
+			 		   (uint8_t *)dst_bcsr->nzval, 1);
+
+	cublasGetVector(nnz, sizeof(uint32_t), (uint8_t *)src_bcsr->colind, 1, 
+						(uint8_t *)dst_bcsr->colind, 1);
+
+	cublasGetVector((nrow+1), sizeof(uint32_t), (uint8_t *)src_bcsr->rowptr, 1, 
+						(uint8_t *)dst_bcsr->rowptr, 1);
+	
+	TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+
+}
+
+/* Transfer the three BCSR arrays from host RAM (src_node) to CUDA memory
+ * (dst_node) with cublasSetVector, and trace the amount of data moved.
+ * NOTE(review): per do_copy_bcsr_buffer_1_to_1, this must run on the CUDA
+ * worker thread attached to dst_node. */
+static void copy_ram_to_cublas(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_bcsr_interface_t *src_bcsr;
+	starpu_bcsr_interface_t *dst_bcsr;
+
+	src_bcsr = &state->interface[src_node].bcsr;
+	dst_bcsr = &state->interface[dst_node].bcsr;
+
+	uint32_t nnz = src_bcsr->nnz;
+	uint32_t nrow = src_bcsr->nrow;
+	size_t elemsize = src_bcsr->elemsize;
+
+	uint32_t r = src_bcsr->r;
+	uint32_t c = src_bcsr->c;
+
+	cublasSetVector(nnz*r*c, elemsize, (uint8_t *)src_bcsr->nzval, 1, 
+					(uint8_t *)dst_bcsr->nzval, 1);
+
+	cublasSetVector(nnz, sizeof(uint32_t), (uint8_t *)src_bcsr->colind, 1, 
+						(uint8_t *)dst_bcsr->colind, 1);
+
+	cublasSetVector((nrow+1), sizeof(uint32_t), (uint8_t *)src_bcsr->rowptr, 1, 
+						(uint8_t *)dst_bcsr->rowptr, 1);
+	
+	TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+}
+#endif // USE_CUDA
+
+/* as not all platform easily have a BLAS lib installed ... */
+/* Plain memcpy fallback for RAM-to-RAM transfers of a BCSR matrix (so that
+ * no BLAS library is required): copy the three arrays and trace the total
+ * number of bytes moved. */
+static void dummy_copy_ram_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_bcsr_interface_t *src = &state->interface[src_node].bcsr;
+	starpu_bcsr_interface_t *dst = &state->interface[dst_node].bcsr;
+
+	uint32_t nnz = src->nnz;
+	uint32_t nrow = src->nrow;
+	size_t elemsize = src->elemsize;
+	uint32_t r = src->r;
+	uint32_t c = src->c;
+
+	size_t nzval_size = nnz*elemsize*r*c;
+	size_t colind_size = nnz*sizeof(uint32_t);
+	size_t rowptr_size = (nrow+1)*sizeof(uint32_t);
+
+	memcpy((void *)dst->nzval, (void *)src->nzval, nzval_size);
+	memcpy((void *)dst->colind, (void *)src->colind, colind_size);
+	memcpy((void *)dst->rowptr, (void *)src->rowptr, rowptr_size);
+
+	TRACE_DATA_COPY(src_node, dst_node, nzval_size + colind_size + rowptr_size);
+}
+
+
+/* Dispatch a one-to-one BCSR copy according to the (src, dst) node kinds.
+ * CUDA transfers may only be initiated by the CUDA worker owning the GPU
+ * node; otherwise the request is posted for that worker to pick up.
+ * Always returns 0 (unsupported combinations assert). */
+int do_copy_bcsr_buffer_1_to_1(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
+{
+	node_kind src_kind = get_node_kind(src_node);
+	node_kind dst_kind = get_node_kind(dst_node);
+
+	switch (dst_kind) {
+	case RAM:
+		switch (src_kind) {
+			case RAM:
+				/* RAM -> RAM */
+				 dummy_copy_ram_to_ram(state, src_node, dst_node);
+				 break;
+#ifdef USE_CUDA
+			case CUDA_RAM:
+				/* CUBLAS_RAM -> RAM */
+				/* only the proper CUBLAS thread can initiate this ! */
+				if (get_local_memory_node() == src_node)
+				{
+					copy_cublas_to_ram(state, src_node, dst_node);
+				}
+				else
+				{
+					/* ask the owning CUDA worker to do it */
+					post_data_request(state, src_node, dst_node);
+				}
+				break;
+#endif
+			case SPU_LS:
+				STARPU_ASSERT(0); // TODO
+				break;
+			case UNUSED:
+				printf("error node %d UNUSED\n", src_node);
+				/* fall through to the assert below */
+			default:
+				assert(0);
+				break;
+		}
+		break;
+#ifdef USE_CUDA
+	case CUDA_RAM:
+		switch (src_kind) {
+			case RAM:
+				/* RAM -> CUBLAS_RAM */
+				/* only the proper CUBLAS thread can initiate this ! */
+				STARPU_ASSERT(get_local_memory_node() == dst_node);
+				copy_ram_to_cublas(state, src_node, dst_node);
+				break;
+			case CUDA_RAM:
+			case SPU_LS:
+				STARPU_ASSERT(0); // TODO 
+				break;
+			case UNUSED:
+			default:
+				STARPU_ASSERT(0);
+				break;
+		}
+		break;
+#endif
+	case SPU_LS:
+		STARPU_ASSERT(0); // TODO
+		break;
+	case UNUSED:
+	default:
+		assert(0);
+		break;
+	}
+
+	return 0;
+}

+ 26 - 0
src/datawizard/interfaces/bcsr_interface.h

@@ -0,0 +1,26 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __BCSR_INTERFACE_H__
+#define __BCSR_INTERFACE_H__
+
+#include <stdint.h>
+
+/* this interface is used for Sparse matrices */
+
+#define BCSR_INTERFACE	0x118504
+
+#endif // __BCSR_INTERFACE_H__

+ 113 - 0
src/datawizard/interfaces/blas_filters.c

@@ -0,0 +1,113 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "blas_filters.h"
+#include "blas_interface.h"
+
+/*
+ * an example of a dummy partition function : blocks ...
+ */
+/* Partition a dense (BLAS) matrix into at most f->filter_arg blocks along
+ * the nx dimension. Child interfaces are filled on every memory node; ptr/ld
+ * are only valid on nodes where the parent replicate is already allocated.
+ * Returns the number of children actually created. */
+unsigned starpu_block_filter_func(starpu_filter *f, data_state *root_data)
+{
+	unsigned nchunks;
+	uint32_t arg = f->filter_arg;
+
+	starpu_blas_interface_t *blas_root = &root_data->interface[0].blas;
+	uint32_t nx = blas_root->nx;
+	uint32_t ny = blas_root->ny;
+	size_t elemsize = blas_root->elemsize;
+
+	/* we will have arg chunks */
+	nchunks = STARPU_MIN(nx, arg);
+
+	/* first allocate the children data_state */
+	root_data->children = calloc(nchunks, sizeof(data_state));
+	STARPU_ASSERT(root_data->children);
+
+	/* actually create all the chunks */
+	unsigned chunk;
+	for (chunk = 0; chunk < nchunks; chunk++)
+	{
+		uint32_t chunk_size = (nx + nchunks - 1)/nchunks;
+		/* NOTE(review): offset advances along x only, so children reuse the
+		 * parent's leading dimension unchanged — confirm layout assumption */
+		size_t offset = chunk*chunk_size*elemsize;
+
+		/* last chunk may be smaller than chunk_size */
+		uint32_t child_nx = 
+			STARPU_MIN(chunk_size, nx - chunk*chunk_size);
+
+		unsigned node;
+		for (node = 0; node < MAXNODES; node++)
+		{
+			starpu_blas_interface_t *local = &root_data->children[chunk].interface[node].blas;
+
+			local->nx = child_nx;
+			local->ny = ny;
+			local->elemsize = elemsize;
+
+			if (root_data->per_node[node].allocated) {
+				local->ptr = root_data->interface[node].blas.ptr + offset;
+				local->ld = root_data->interface[node].blas.ld;
+			}
+		}
+	}
+
+	return nchunks;
+}
+
+/* Partition a dense (BLAS) matrix into at most f->filter_arg blocks along
+ * the ny dimension. The per-node offset is ld-dependent, so it is computed
+ * only where the parent replicate is allocated. Returns the child count. */
+unsigned starpu_vertical_block_filter_func(starpu_filter *f, data_state *root_data)
+{
+	unsigned nchunks;
+	uint32_t arg = f->filter_arg;
+
+	uint32_t nx = root_data->interface[0].blas.nx;
+	uint32_t ny = root_data->interface[0].blas.ny;
+	size_t elemsize = root_data->interface[0].blas.elemsize;
+
+	/* we will have arg chunks */
+	nchunks = STARPU_MIN(ny, arg);
+	
+	/* first allocate the children data_state */
+	root_data->children = calloc(nchunks, sizeof(data_state));
+	STARPU_ASSERT(root_data->children);
+
+	/* actually create all the chunks */
+	unsigned chunk;
+	for (chunk = 0; chunk < nchunks; chunk++)
+	{
+		uint32_t chunk_size = (ny + nchunks - 1)/nchunks;
+
+		/* last chunk may be smaller than chunk_size */
+		uint32_t child_ny = 
+			STARPU_MIN(chunk_size, ny - chunk*chunk_size);
+
+		unsigned node;
+		for (node = 0; node < MAXNODES; node++)
+		{
+			starpu_blas_interface_t *local = &root_data->children[chunk].interface[node].blas;
+
+			local->nx = nx;
+			local->ny = child_ny;
+			local->elemsize = elemsize;
+
+			if (root_data->per_node[node].allocated) {
+				/* skip chunk_size rows of ld elements per chunk */
+				size_t offset = 
+					chunk*chunk_size*root_data->interface[node].blas.ld*elemsize;
+				local->ptr = root_data->interface[node].blas.ptr + offset;
+				local->ld = root_data->interface[node].blas.ld;
+			}
+		}
+	}
+
+	return nchunks;
+}

+ 22 - 0
src/datawizard/interfaces/blas_filters.h

@@ -0,0 +1,22 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __BLAS_FILTERS_H__
+#define __BLAS_FILTERS_H__
+
+#include <datawizard/hierarchy.h>
+
+#endif // __BLAS_FILTERS_H__

+ 413 - 0
src/datawizard/interfaces/blas_interface.c

@@ -0,0 +1,413 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <datawizard/data_parameters.h>
+#include <datawizard/coherency.h>
+#include <datawizard/copy-driver.h>
+#include <datawizard/hierarchy.h>
+
+#include <common/hash.h>
+
+#include <starpu.h>
+
+#ifdef USE_CUDA
+#include <cuda.h>
+#endif
+
+size_t allocate_blas_buffer_on_node(data_state *state, uint32_t dst_node);
+void liberate_blas_buffer_on_node(starpu_data_interface_t *interface, uint32_t node);
+int do_copy_blas_buffer_1_to_1(data_state *state, uint32_t src_node, uint32_t dst_node);
+size_t dump_blas_interface(starpu_data_interface_t *interface, void *buffer);
+size_t blas_interface_get_size(struct starpu_data_state_t *state);
+uint32_t footprint_blas_interface_crc32(data_state *state, uint32_t hstate);
+void display_blas_interface(data_state *state, FILE *f);
+#ifdef USE_GORDON
+int convert_blas_to_gordon(starpu_data_interface_t *interface, uint64_t *ptr, gordon_strideSize_t *ss); 
+#endif
+
+struct data_interface_ops_t interface_blas_ops = {
+	.allocate_data_on_node = allocate_blas_buffer_on_node,
+	.liberate_data_on_node = liberate_blas_buffer_on_node,
+	.copy_data_1_to_1 = do_copy_blas_buffer_1_to_1,
+	.dump_data_interface = dump_blas_interface,
+	.get_size = blas_interface_get_size,
+	.footprint = footprint_blas_interface_crc32,
+#ifdef USE_GORDON
+	.convert_to_gordon = convert_blas_to_gordon,
+#endif
+	.interfaceid = BLAS_INTERFACE, 
+	.display = display_blas_interface
+};
+
+#ifdef USE_GORDON
+/* Translate a BLAS interface into the (ptr, stride descriptor) pair expected
+ * by the Gordon/Cell runtime. NOTE(review): assumes gordon_stride_init takes
+ * (nblocks, block size, stride) in bytes — confirm against Gordon's API. */
+int convert_blas_to_gordon(starpu_data_interface_t *interface, uint64_t *ptr, gordon_strideSize_t *ss) 
+{
+	STARPU_ASSERT(gordon_interface);
+
+	size_t elemsize = (*interface).blas.elemsize;
+	uint32_t nx = (*interface).blas.nx;
+	uint32_t ny = (*interface).blas.ny;
+	uint32_t ld = (*interface).blas.ld;
+
+	*ptr = (*interface).blas.ptr;
+
+	/* The gordon_stride_init function may use a contiguous buffer
+ 	 * in case nx = ld (in that case, (*ss).size = elemsize*nx*ny */
+	*ss = gordon_stride_init(ny, nx*elemsize, ld*elemsize);
+
+	return 0;
+}
+
+/* declare a new data with the BLAS interface */
+/* Register a dense matrix (nx x ny elements of elemsize bytes, leading
+ * dimension ld) as a new StarPU data. Only the home_node replicate gets the
+ * user's ptr/ld; all other nodes start unallocated (ptr = 0). The allocated
+ * state handle is returned through *handle; ownership stays with StarPU. */
+void starpu_monitor_blas_data(struct starpu_data_state_t **handle, uint32_t home_node,
+			uintptr_t ptr, uint32_t ld, uint32_t nx,
+			uint32_t ny, size_t elemsize)
+{
+	struct starpu_data_state_t *state = calloc(1, sizeof(struct starpu_data_state_t));
+	STARPU_ASSERT(state);
+
+	STARPU_ASSERT(handle);
+	*handle = state;
+
+	unsigned node;
+	for (node = 0; node < MAXNODES; node++)
+	{
+		starpu_blas_interface_t *local_interface = &state->interface[node].blas;
+
+		if (node == home_node) {
+			local_interface->ptr = ptr;
+			local_interface->ld  = ld;
+		}
+		else {
+			local_interface->ptr = 0;
+			local_interface->ld  = 0;
+		}
+
+		/* geometry is shared by every replicate */
+		local_interface->nx = nx;
+		local_interface->ny = ny;
+		local_interface->elemsize = elemsize;
+	}
+
+	state->ops = &interface_blas_ops;
+
+	monitor_new_data(state, home_node, 0);
+}
+
+/* Fold the matrix dimensions (nx, ny) into hash state hstate using the
+ * supplied hash function; elemsize and ld do not contribute. */
+static inline uint32_t footprint_blas_interface_generic(uint32_t (*hash_func)(uint32_t input, uint32_t hstate), data_state *state, uint32_t hstate)
+{
+	uint32_t hash;
+
+	hash = hstate;
+	hash = hash_func(starpu_get_blas_nx(state), hash);
+	hash = hash_func(starpu_get_blas_ny(state), hash);
+
+	return hash;
+}
+
+/* CRC32-based footprint of a BLAS data (used as perfmodel hash key). */
+uint32_t footprint_blas_interface_crc32(data_state *state, uint32_t hstate)
+{
+	return footprint_blas_interface_generic(crc32_be, state, hstate);
+}
+
+/* Packed on-the-wire layout written by dump_blas_interface().
+ * NOTE(review): elemsize is not part of the dump — confirm the consumer
+ * does not need it. */
+struct dumped_blas_interface_s {
+	uintptr_t ptr;
+	uint32_t nx;
+	uint32_t ny;
+	uint32_t ld;
+} __attribute__ ((packed));
+
+/* Print the matrix dimensions (tab-separated, trailing tab) to stream f.
+ * NOTE(review): nx/ny are uint32_t but printed with %d — "%u" would match. */
+void display_blas_interface(data_state *state, FILE *f)
+{
+	starpu_blas_interface_t *interface;
+
+	interface = &state->interface[0].blas;
+
+	fprintf(f, "%d\t%d\t", interface->nx, interface->ny);
+}
+
+/* Serialize the BLAS interface into _buffer using the packed
+ * dumped_blas_interface_s layout; returns the number of bytes written. */
+size_t dump_blas_interface(starpu_data_interface_t *interface, void *_buffer)
+{
+	/* yes, that's DIRTY ... */
+	struct dumped_blas_interface_s *buffer = _buffer;
+
+	buffer->ptr = (*interface).blas.ptr;
+	buffer->nx = (*interface).blas.nx;
+	buffer->ny = (*interface).blas.ny;
+	buffer->ld = (*interface).blas.ld;
+
+	return (sizeof(struct dumped_blas_interface_s));
+}
+
+/* Size in bytes of the matrix payload (nx*ny*elemsize, ld excluded). */
+size_t blas_interface_get_size(struct starpu_data_state_t *state)
+{
+	size_t size;
+	starpu_blas_interface_t *interface;
+
+	interface = &state->interface[0].blas;
+
+	size = interface->nx*interface->ny*interface->elemsize; 
+
+	return size;
+}
+
+/* offer an access to the data parameters */
+/* Number of elements along x (identical on every node; node 0 is read). */
+uint32_t starpu_get_blas_nx(data_state *state)
+{
+	return (state->interface[0].blas.nx);
+}
+
+/* Number of elements along y (identical on every node; node 0 is read). */
+uint32_t starpu_get_blas_ny(data_state *state)
+{
+	return (state->interface[0].blas.ny);
+}
+
+/* Leading dimension of the replicate on the calling worker's memory node;
+ * that replicate must already be allocated. */
+uint32_t starpu_get_blas_local_ld(data_state *state)
+{
+	unsigned node;
+	node = get_local_memory_node();
+
+	STARPU_ASSERT(state->per_node[node].allocated);
+
+	return (state->interface[node].blas.ld);
+}
+
+/* Base pointer of the replicate on the calling worker's memory node;
+ * that replicate must already be allocated. */
+uintptr_t starpu_get_blas_local_ptr(data_state *state)
+{
+	unsigned node;
+	node = get_local_memory_node();
+
+	STARPU_ASSERT(state->per_node[node].allocated);
+
+	return (state->interface[node].blas.ptr);
+}
+
+/* memory allocation/deallocation primitives for the BLAS interface */
+
+/* returns the size of the allocated area */
+/* Allocate a contiguous nx*ny buffer for the dst_node replicate (malloc for
+ * RAM, cublasAlloc for CUDA). On success the replicate's ptr is set and ld
+ * becomes nx (buffer is dense); returns the number of bytes allocated, or
+ * 0 if the allocation failed. */
+size_t allocate_blas_buffer_on_node(data_state *state, uint32_t dst_node)
+{
+	uintptr_t addr = 0;
+	unsigned fail = 0;
+	size_t allocated_memory;
+
+#ifdef USE_CUDA
+	cublasStatus status;
+#endif
+	uint32_t nx = state->interface[dst_node].blas.nx;
+	uint32_t ny = state->interface[dst_node].blas.ny;
+	size_t elemsize = state->interface[dst_node].blas.elemsize;
+
+	node_kind kind = get_node_kind(dst_node);
+
+	switch(kind) {
+		case RAM:
+			addr = (uintptr_t)malloc(nx*ny*elemsize);
+			if (!addr) 
+				fail = 1;
+
+			break;
+#ifdef USE_CUDA
+		case CUDA_RAM:
+			status = cublasAlloc(nx*ny, elemsize, (void **)&addr);
+
+			if (!addr || status != CUBLAS_STATUS_SUCCESS)
+			{
+				/* only an out-of-memory condition is tolerated here */
+				STARPU_ASSERT(status != CUBLAS_STATUS_INTERNAL_ERROR);
+				STARPU_ASSERT(status != CUBLAS_STATUS_NOT_INITIALIZED);
+				STARPU_ASSERT(status != CUBLAS_STATUS_INVALID_VALUE);
+				STARPU_ASSERT(status == CUBLAS_STATUS_ALLOC_FAILED);
+				fail = 1;
+			}
+
+			break;
+#endif
+		default:
+			assert(0);
+	}
+
+	if (!fail) {
+		/* allocation succeeded */
+		allocated_memory = nx*ny*elemsize;
+
+		/* update the data properly in consequence */
+		state->interface[dst_node].blas.ptr = addr;
+		state->interface[dst_node].blas.ld = nx;
+	} else {
+		/* allocation failed */
+		allocated_memory = 0;
+	}
+	
+	return allocated_memory;
+}
+
+/* Release the buffer backing a BLAS replicate on the given node, using the
+ * deallocator matching the node kind (free / cublasFree). */
+void liberate_blas_buffer_on_node(starpu_data_interface_t *interface, uint32_t node)
+{
+#ifdef USE_CUDA
+	cublasStatus status;
+#endif
+
+	node_kind kind = get_node_kind(node);
+	switch(kind) {
+		case RAM:
+			free((void*)interface->blas.ptr);
+			break;
+#ifdef USE_CUDA
+		case CUDA_RAM:
+			status = cublasFree((void*)interface->blas.ptr);
+			
+			STARPU_ASSERT(status != CUBLAS_STATUS_INTERNAL_ERROR);
+			STARPU_ASSERT(status == CUBLAS_STATUS_SUCCESS);
+
+			break;
+#endif
+		default:
+			assert(0);
+	}
+}
+
+#ifdef USE_CUDA
+/* Download a matrix replicate from GPU to host with cublasGetMatrix,
+ * honouring both leading dimensions. NOTE(review): the return status of
+ * cublasGetMatrix is not checked — a failed transfer goes unnoticed. */
+static void copy_cublas_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_blas_interface_t *src_blas;
+	starpu_blas_interface_t *dst_blas;
+
+	src_blas = &state->interface[src_node].blas;
+	dst_blas = &state->interface[dst_node].blas;
+
+	cublasGetMatrix(src_blas->nx, src_blas->ny, src_blas->elemsize,
+		(uint8_t *)src_blas->ptr, src_blas->ld,
+		(uint8_t *)dst_blas->ptr, dst_blas->ld);
+
+	TRACE_DATA_COPY(src_node, dst_node, src_blas->nx*src_blas->ny*src_blas->elemsize);
+}
+
+/* Upload a matrix replicate from host to GPU with cublasSetMatrix,
+ * honouring both leading dimensions. NOTE(review): the return status of
+ * cublasSetMatrix is not checked — a failed transfer goes unnoticed. */
+static void copy_ram_to_cublas(data_state *state, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_blas_interface_t *src_blas;
+	starpu_blas_interface_t *dst_blas;
+
+	src_blas = &state->interface[src_node].blas;
+	dst_blas = &state->interface[dst_node].blas;
+
+
+	cublasSetMatrix(src_blas->nx, src_blas->ny, src_blas->elemsize,
+		(uint8_t *)src_blas->ptr, src_blas->ld,
+		(uint8_t *)dst_blas->ptr, dst_blas->ld);
+
+	TRACE_DATA_COPY(src_node, dst_node, src_blas->nx*src_blas->ny*src_blas->elemsize);
+}
+
+/* as not all platform easily have a BLAS lib installed ... */
+/* Row-by-row memcpy fallback for RAM -> RAM matrix copies (used so that no
+ * BLAS library is required). Copies the nx*ny replicate from src_node to
+ * dst_node, honouring the leading dimension of each replicate. */
+static void dummy_copy_ram_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
+{
+	unsigned y;
+	uint32_t nx = state->interface[dst_node].blas.nx;
+	uint32_t ny = state->interface[dst_node].blas.ny;
+	size_t elemsize = state->interface[dst_node].blas.elemsize;
+
+	uint32_t ld_src = state->interface[src_node].blas.ld;
+	uint32_t ld_dst = state->interface[dst_node].blas.ld;
+
+	uintptr_t ptr_src = state->interface[src_node].blas.ptr;
+	uintptr_t ptr_dst = state->interface[dst_node].blas.ptr;
+
+	for (y = 0; y < ny; y++)
+	{
+		/* byte offsets must be size_t: y*ld*elemsize overflows uint32_t
+		 * for large matrices (was previously truncated to 32 bits) */
+		size_t src_offset = y*ld_src*elemsize;
+		size_t dst_offset = y*ld_dst*elemsize;
+
+		memcpy((void *)(ptr_dst + dst_offset), 
+			(void *)(ptr_src + src_offset), nx*elemsize);
+	}
+
+	TRACE_DATA_COPY(src_node, dst_node, nx*ny*elemsize);
+}
+
+
+/* Copy one replicate of a BLAS (dense matrix) data from src_node to
+ * dst_node. The (src kind, dst kind) pair selects the routine; transfers
+ * involving a CUDA node must run on that node's own thread, otherwise the
+ * copy is delegated via post_data_request(). Always returns 0. */
+int do_copy_blas_buffer_1_to_1(data_state *state, uint32_t src_node, uint32_t dst_node)
+{
+	node_kind src_kind = get_node_kind(src_node);
+	node_kind dst_kind = get_node_kind(dst_node);
+
+	switch (dst_kind) {
+	case RAM:
+		switch (src_kind) {
+			case RAM:
+				/* RAM -> RAM */
+				 dummy_copy_ram_to_ram(state, src_node, dst_node);
+				 break;
+#ifdef USE_CUDA
+			case CUDA_RAM:
+				/* CUBLAS_RAM -> RAM */
+				if (get_local_memory_node() == src_node)
+				{
+					/* only the proper CUBLAS thread can initiate this directly ! */
+					copy_cublas_to_ram(state, src_node, dst_node);
+				}
+				else
+				{
+					/* put a request to the corresponding GPU */
+		//			fprintf(stderr, "post_data_request state %p src %d dst %d\n", state, src_node, dst_node);
+					post_data_request(state, src_node, dst_node);
+		//			fprintf(stderr, "post %p OK\n", state);
+				}
+				break;
+#endif
+			case SPU_LS:
+				STARPU_ASSERT(0); // TODO
+				break;
+			case UNUSED:
+				printf("error node %d UNUSED\n", src_node);
+				/* fall through: report the bogus node, then abort */
+			default:
+				assert(0);
+				break;
+		}
+		break;
+#ifdef USE_CUDA
+	case CUDA_RAM:
+		switch (src_kind) {
+			case RAM:
+				/* RAM -> CUBLAS_RAM */
+				/* only the proper CUBLAS thread can initiate this ! */
+				STARPU_ASSERT(get_local_memory_node() == dst_node);
+				copy_ram_to_cublas(state, src_node, dst_node);
+				break;
+			case CUDA_RAM:
+			case SPU_LS:
+				STARPU_ASSERT(0); // TODO 
+				break;
+			case UNUSED:
+			default:
+				STARPU_ASSERT(0);
+				break;
+		}
+		break;
+#endif
+	case SPU_LS:
+		STARPU_ASSERT(0); // TODO
+		break;
+	case UNUSED:
+	default:
+		assert(0);
+		break;
+	}
+
+	return 0;
+}
+

+ 24 - 0
src/datawizard/interfaces/blas_interface.h

@@ -0,0 +1,24 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __BLAS_INTERFACE_H__
+#define __BLAS_INTERFACE_H__
+
+#include <stdint.h>
+
+/* this interface is used for dense (BLAS-style) matrices */
+
+#define BLAS_INTERFACE   0x118501
+
+#endif // __BLAS_INTERFACE_H__

+ 24 - 0
src/datawizard/interfaces/csc_interface.h

@@ -0,0 +1,24 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __CSC_INTERFACE_H__
+#define __CSC_INTERFACE_H__
+
+/* this interface is used for Sparse matrices */
+
+#define CSC_INTERFACE	0x118505
+
+#endif // __CSC_INTERFACE_H__

+ 74 - 0
src/datawizard/interfaces/csr_filters.c

@@ -0,0 +1,74 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "csr_filters.h"
+#include "csr_interface.h"
+
+/* Partition a CSR matrix into at most f->filter_arg row blocks. Each child
+ * keeps pointers into the parent's rowptr/colind/nzval arrays (no copy) and
+ * records its own firstentry so row indices stay consistent.
+ * Returns the number of children created. */
+unsigned starpu_vertical_block_filter_func_csr(starpu_filter *f, data_state *root_data)
+{
+	unsigned nchunks;
+	uint32_t arg = f->filter_arg;
+
+	uint32_t nrow = root_data->interface[0].csr.nrow;
+	size_t elemsize = root_data->interface[0].csr.elemsize;
+	uint32_t firstentry = root_data->interface[0].csr.firstentry;
+
+	/* we will have arg chunks */
+	nchunks = STARPU_MIN(nrow, arg);
+	
+	/* first allocate the children data_state */
+	root_data->children = calloc(nchunks, sizeof(data_state));
+	STARPU_ASSERT(root_data->children);
+
+	/* actually create all the chunks */
+	uint32_t chunk_size = (nrow + nchunks - 1)/nchunks;
+
+	/* XXX */
+	STARPU_ASSERT(root_data->per_node[0].allocated);
+	uint32_t *rowptr = root_data->interface[0].csr.rowptr;
+
+	unsigned chunk;
+	for (chunk = 0; chunk < nchunks; chunk++)
+	{
+		/* NOTE(review): first_index subtracts firstentry from a row index —
+		 * verify this offsetting is correct when firstentry != 0 */
+		uint32_t first_index = chunk*chunk_size - firstentry;
+		uint32_t local_firstentry = rowptr[first_index];
+
+		uint32_t child_nrow = 
+			STARPU_MIN(chunk_size, nrow - chunk*chunk_size);
+
+		uint32_t local_nnz = rowptr[first_index + child_nrow] - rowptr[first_index]; 
+
+		unsigned node;
+		for (node = 0; node < MAXNODES; node++)
+		{
+			starpu_csr_interface_t *local = &root_data->children[chunk].interface[node].csr;
+
+			local->nnz = local_nnz;
+			local->nrow = child_nrow;
+			local->firstentry = local_firstentry;
+			local->elemsize = elemsize;
+
+			if (root_data->per_node[node].allocated) {
+				local->rowptr = &root_data->interface[node].csr.rowptr[first_index];
+				local->colind = &root_data->interface[node].csr.colind[local_firstentry];
+				/* NOTE(review): nzval is indexed as float* regardless of
+				 * elemsize — only correct when elemsize == sizeof(float) */
+				float *nzval = (float *)(root_data->interface[node].csr.nzval);
+				local->nzval = (uintptr_t)&nzval[local_firstentry];
+			}
+		}
+	}
+
+	return nchunks;
+}

+ 22 - 0
src/datawizard/interfaces/csr_filters.h

@@ -0,0 +1,22 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __CSR_FILTERS_H__
+#define __CSR_FILTERS_H__
+
+#include <datawizard/hierarchy.h>
+
+#endif // __CSR_FILTERS_H__

+ 451 - 0
src/datawizard/interfaces/csr_interface.c

@@ -0,0 +1,451 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <datawizard/data_parameters.h>
+#include <datawizard/coherency.h>
+#include <datawizard/copy-driver.h>
+#include <datawizard/hierarchy.h>
+
+#include <common/hash.h>
+
+#include <starpu.h>
+
+#ifdef USE_CUDA
+#include <cuda.h>
+#endif
+
+size_t allocate_csr_buffer_on_node(struct starpu_data_state_t *state, uint32_t dst_node);
+void liberate_csr_buffer_on_node(starpu_data_interface_t *interface, uint32_t node);
+size_t dump_csr_interface(starpu_data_interface_t *interface, void *_buffer);
+int do_copy_csr_buffer_1_to_1(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
+size_t csr_interface_get_size(struct starpu_data_state_t *state);
+uint32_t footprint_csr_interface_crc32(data_state *state, uint32_t hstate);
+
+struct data_interface_ops_t interface_csr_ops = {
+	.allocate_data_on_node = allocate_csr_buffer_on_node,
+	.liberate_data_on_node = liberate_csr_buffer_on_node,
+	.copy_data_1_to_1 = do_copy_csr_buffer_1_to_1,
+	.dump_data_interface = dump_csr_interface,
+	.get_size = csr_interface_get_size,
+	.interfaceid = CSR_INTERFACE,
+	.footprint = footprint_csr_interface_crc32
+};
+
+/* declare a new data with the BLAS interface */
+/* Register a CSR sparse matrix (nnz non-zeros, nrow rows, entry numbering
+ * starting at firstentry) as a new StarPU data. Only the home_node replicate
+ * gets the user's nzval/colind/rowptr arrays; other nodes start empty.
+ * The new state handle is returned through *handle. */
+void starpu_monitor_csr_data(struct starpu_data_state_t **handle, uint32_t home_node,
+		uint32_t nnz, uint32_t nrow, uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, size_t elemsize)
+{
+	struct starpu_data_state_t *state = calloc(1, sizeof(struct starpu_data_state_t));
+	STARPU_ASSERT(state);
+
+	STARPU_ASSERT(handle);
+	*handle = state;
+
+	unsigned node;
+	for (node = 0; node < MAXNODES; node++)
+	{
+		starpu_csr_interface_t *local_interface = &state->interface[node].csr;
+
+		if (node == home_node) {
+			local_interface->nzval = nzval;
+			local_interface->colind = colind;
+			local_interface->rowptr = rowptr;
+		}
+		else {
+			local_interface->nzval = 0;
+			local_interface->colind = NULL;
+			local_interface->rowptr = NULL;
+		}
+
+		/* structure parameters are shared by every replicate */
+		local_interface->nnz = nnz;
+		local_interface->nrow = nrow;
+		local_interface->firstentry = firstentry;
+		local_interface->elemsize = elemsize;
+
+	}
+
+	state->ops = &interface_csr_ops;
+
+	monitor_new_data(state, home_node, 0);
+}
+
+/* Fold the non-zero count into hash state hstate using the supplied hash
+ * function; nrow and elemsize do not contribute. */
+static inline uint32_t footprint_csr_interface_generic(uint32_t (*hash_func)(uint32_t input, uint32_t hstate), data_state *state, uint32_t hstate)
+{
+	uint32_t hash;
+
+	hash = hstate;
+	hash = hash_func(starpu_get_csr_nnz(state), hash);
+
+	return hash;
+}
+
+/* CRC32-based footprint of a CSR data (used as perfmodel hash key). */
+uint32_t footprint_csr_interface_crc32(data_state *state, uint32_t hstate)
+{
+	return footprint_csr_interface_generic(crc32_be, state, hstate);
+}
+
+
+
+/* Packed on-the-wire layout written by dump_csr_interface().
+ * NOTE(review): elemsize is declared uint32_t here while the interface
+ * stores a size_t — silently truncated on 64-bit; confirm intentional. */
+struct dumped_csr_interface_s {
+	uint32_t nnz;
+	uint32_t nrow;
+	uintptr_t nzval;
+	uint32_t *colind;
+	uint32_t *rowptr;
+	uint32_t firstentry;
+	uint32_t elemsize;
+}  __attribute__ ((packed));
+
+/* Serialize the CSR interface into _buffer using the packed
+ * dumped_csr_interface_s layout; returns the number of bytes written. */
+size_t dump_csr_interface(starpu_data_interface_t *interface, void *_buffer)
+{
+	/* yes, that's DIRTY ... */
+	struct dumped_csr_interface_s *buffer = _buffer;
+
+	buffer->nnz = (*interface).csr.nnz;
+	buffer->nrow = (*interface).csr.nrow;
+	buffer->nzval = (*interface).csr.nzval;
+	buffer->colind = (*interface).csr.colind;
+	buffer->rowptr = (*interface).csr.rowptr;
+	buffer->firstentry = (*interface).csr.firstentry;
+	/* size_t narrowed to the struct's uint32_t field */
+	buffer->elemsize = (*interface).csr.elemsize;
+
+	return (sizeof(struct dumped_csr_interface_s));
+}
+
+/* offer an access to the data parameters */
+/* Number of non-zero entries (identical on every node; node 0 is read). */
+uint32_t starpu_get_csr_nnz(struct starpu_data_state_t *state)
+{
+	return (state->interface[0].csr.nnz);
+}
+
+/* Number of rows (identical on every node; node 0 is read). */
+uint32_t starpu_get_csr_nrow(struct starpu_data_state_t *state)
+{
+	return (state->interface[0].csr.nrow);
+}
+
+/* Index of the first entry (non-zero for partitioned children). */
+uint32_t starpu_get_csr_firstentry(struct starpu_data_state_t *state)
+{
+	return (state->interface[0].csr.firstentry);
+}
+
+/* Size in bytes of one non-zero element. */
+size_t starpu_get_csr_elemsize(struct starpu_data_state_t *state)
+{
+	return (state->interface[0].csr.elemsize);
+}
+
+/* nzval array of the replicate on the calling worker's memory node;
+ * that replicate must already be allocated. */
+uintptr_t starpu_get_csr_local_nzval(struct starpu_data_state_t *state)
+{
+	unsigned node;
+	node = get_local_memory_node();
+
+	STARPU_ASSERT(state->per_node[node].allocated);
+
+	return (state->interface[node].csr.nzval);
+}
+
+/* colind array of the replicate on the calling worker's memory node;
+ * that replicate must already be allocated. */
+uint32_t *starpu_get_csr_local_colind(struct starpu_data_state_t *state)
+{
+	unsigned node;
+	node = get_local_memory_node();
+
+	STARPU_ASSERT(state->per_node[node].allocated);
+
+	return (state->interface[node].csr.colind);
+}
+
+/* rowptr array of the replicate on the calling worker's memory node;
+ * that replicate must already be allocated. */
+uint32_t *starpu_get_csr_local_rowptr(struct starpu_data_state_t *state)
+{
+	unsigned node;
+	node = get_local_memory_node();
+
+	STARPU_ASSERT(state->per_node[node].allocated);
+
+	return (state->interface[node].csr.rowptr);
+}
+
+/* Total byte size of a CSR data: nzval (nnz*elemsize) + colind (nnz words)
+ * + rowptr (nrow+1 words). */
+size_t csr_interface_get_size(struct starpu_data_state_t *state)
+{
+	size_t size;
+
+	uint32_t nnz = starpu_get_csr_nnz(state);
+	uint32_t nrow = starpu_get_csr_nrow(state);
+	size_t elemsize = starpu_get_csr_elemsize(state);
+
+	size = nnz*elemsize + nnz*sizeof(uint32_t) + (nrow+1)*sizeof(uint32_t);
+
+	return size;
+}
+
+/* memory allocation/deallocation primitives for the BLAS interface */
+
+/* returns the size of the allocated area */
+/* Allocate the three CSR arrays (nzval, colind, rowptr) for the dst_node
+ * replicate, using malloc for RAM nodes and cublasAlloc for CUDA nodes.
+ * On success the replicate's pointers are updated and the total number of
+ * allocated bytes is returned; on failure everything already allocated is
+ * released and 0 is returned. */
+size_t allocate_csr_buffer_on_node(struct starpu_data_state_t *state, uint32_t dst_node)
+{
+	/* initialize so that a failing allocator which leaves its output
+	 * untouched (e.g. cublasAlloc on error) is still detected below */
+	uintptr_t addr_nzval = 0;
+	uint32_t *addr_colind = NULL, *addr_rowptr = NULL;
+	size_t allocated_memory;
+
+	/* we need the 3 arrays to be allocated */
+
+	uint32_t nnz = state->interface[dst_node].csr.nnz;
+	uint32_t nrow = state->interface[dst_node].csr.nrow;
+	size_t elemsize = state->interface[dst_node].csr.elemsize;
+
+	node_kind kind = get_node_kind(dst_node);
+
+	switch(kind) {
+		case RAM:
+			addr_nzval = (uintptr_t)malloc(nnz*elemsize);
+			if (!addr_nzval)
+				goto fail_nzval;
+
+			addr_colind = malloc(nnz*sizeof(uint32_t));
+			if (!addr_colind)
+				goto fail_colind;
+
+			addr_rowptr = malloc((nrow+1)*sizeof(uint32_t));
+			if (!addr_rowptr)
+				goto fail_rowptr;
+
+			break;
+#ifdef USE_CUDA
+		case CUDA_RAM:
+			cublasAlloc(nnz, elemsize, (void **)&addr_nzval);
+			if (!addr_nzval)
+				goto fail_nzval;
+
+			cublasAlloc(nnz, sizeof(uint32_t), (void **)&addr_colind);
+			if (!addr_colind)
+				goto fail_colind;
+
+			cublasAlloc((nrow+1), sizeof(uint32_t), (void **)&addr_rowptr);
+			if (!addr_rowptr)
+				goto fail_rowptr;
+
+			break;
+#endif
+		default:
+			assert(0);
+	}
+
+	/* allocation succeeded */
+	allocated_memory = 
+		nnz*elemsize + nnz*sizeof(uint32_t) + (nrow+1)*sizeof(uint32_t);
+
+	/* update the data properly in consequence */
+	state->interface[dst_node].csr.nzval = addr_nzval;
+	state->interface[dst_node].csr.colind = addr_colind;
+	state->interface[dst_node].csr.rowptr = addr_rowptr;
+	
+	return allocated_memory;
+
+fail_rowptr:
+	switch(kind) {
+		case RAM:
+			free((void *)addr_colind);
+			/* fixed: missing break used to fall through into the
+			 * CUDA_RAM arm (or into assert(0) on non-CUDA builds) */
+			break;
+#ifdef USE_CUDA
+		case CUDA_RAM:
+			cublasFree((void*)addr_colind);
+			break;
+#endif
+		default:
+			assert(0);
+	}
+
+fail_colind:
+	switch(kind) {
+		case RAM:
+			free((void *)addr_nzval);
+			/* fixed: missing break, same fallthrough bug as above */
+			break;
+#ifdef USE_CUDA
+		case CUDA_RAM:
+			cublasFree((void*)addr_nzval);
+			break;
+#endif
+		default:
+			assert(0);
+	}
+
+fail_nzval:
+
+	/* allocation failed */
+	allocated_memory = 0;
+
+	return allocated_memory;
+}
+
+/* Release the three CSR arrays of a replicate on the given node, using the
+ * deallocator matching the node kind (free / cublasFree). */
+void liberate_csr_buffer_on_node(starpu_data_interface_t *interface, uint32_t node)
+{
+	node_kind kind = get_node_kind(node);
+	switch(kind) {
+		case RAM:
+			free((void*)interface->csr.nzval);
+			free((void*)interface->csr.colind);
+			free((void*)interface->csr.rowptr);
+			break;
+#ifdef USE_CUDA
+		case CUDA_RAM:
+			cublasFree((void*)interface->csr.nzval);
+			cublasFree((void*)interface->csr.colind);
+			cublasFree((void*)interface->csr.rowptr);
+			break;
+#endif
+		default:
+			assert(0);
+	}
+}
+
+#ifdef USE_CUDA
+/* Download the three CSR arrays from GPU to host with cublasGetVector.
+ * NOTE(review): return statuses of cublasGetVector are not checked. */
+static void copy_cublas_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_csr_interface_t *src_csr;
+	starpu_csr_interface_t *dst_csr;
+
+	src_csr = &state->interface[src_node].csr;
+	dst_csr = &state->interface[dst_node].csr;
+
+	uint32_t nnz = src_csr->nnz;
+	uint32_t nrow = src_csr->nrow;
+	size_t elemsize = src_csr->elemsize;
+
+	cublasGetVector(nnz, elemsize, (uint8_t *)src_csr->nzval, 1, 
+					(uint8_t *)dst_csr->nzval, 1);
+
+	cublasGetVector(nnz, sizeof(uint32_t), (uint8_t *)src_csr->colind, 1, 
+						(uint8_t *)dst_csr->colind, 1);
+
+	cublasGetVector((nrow+1), sizeof(uint32_t), (uint8_t *)src_csr->rowptr, 1, 
+						(uint8_t *)dst_csr->rowptr, 1);
+	
+	TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+
+}
+
+/* Upload the three CSR arrays from host to GPU with cublasSetVector.
+ * NOTE(review): return statuses of cublasSetVector are not checked. */
+static void copy_ram_to_cublas(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_csr_interface_t *src_csr;
+	starpu_csr_interface_t *dst_csr;
+
+	src_csr = &state->interface[src_node].csr;
+	dst_csr = &state->interface[dst_node].csr;
+
+	uint32_t nnz = src_csr->nnz;
+	uint32_t nrow = src_csr->nrow;
+	size_t elemsize = src_csr->elemsize;
+
+	cublasSetVector(nnz, elemsize, (uint8_t *)src_csr->nzval, 1, 
+					(uint8_t *)dst_csr->nzval, 1);
+
+	cublasSetVector(nnz, sizeof(uint32_t), (uint8_t *)src_csr->colind, 1, 
+						(uint8_t *)dst_csr->colind, 1);
+
+	cublasSetVector((nrow+1), sizeof(uint32_t), (uint8_t *)src_csr->rowptr, 1, 
+						(uint8_t *)dst_csr->rowptr, 1);
+	
+	TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+}
+#endif // USE_CUDA
+
+/* as not all platform easily have a BLAS lib installed ... */
+/* memcpy fallback for RAM -> RAM CSR copies: copies nzval, colind and
+ * rowptr in three contiguous transfers. */
+static void dummy_copy_ram_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
+{
+
+	starpu_csr_interface_t *src_csr;
+	starpu_csr_interface_t *dst_csr;
+
+	src_csr = &state->interface[src_node].csr;
+	dst_csr = &state->interface[dst_node].csr;
+
+	uint32_t nnz = src_csr->nnz;
+	uint32_t nrow = src_csr->nrow;
+	size_t elemsize = src_csr->elemsize;
+
+	memcpy((void *)dst_csr->nzval, (void *)src_csr->nzval, nnz*elemsize);
+
+	memcpy((void *)dst_csr->colind, (void *)src_csr->colind, nnz*sizeof(uint32_t));
+
+	memcpy((void *)dst_csr->rowptr, (void *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t));
+
+	TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+}
+
+
+/* Copy one replicate of a CSR data from src_node to dst_node.
+ * The (src kind, dst kind) pair selects the transfer routine; a transfer
+ * involving a CUDA node must be initiated by the thread local to that node,
+ * otherwise a request is posted via post_data_request(). Always returns 0. */
+int do_copy_csr_buffer_1_to_1(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
+{
+	node_kind src_kind = get_node_kind(src_node);
+	node_kind dst_kind = get_node_kind(dst_node);
+
+	switch (dst_kind) {
+	case RAM:
+		switch (src_kind) {
+			case RAM:
+				/* RAM -> RAM */
+				 dummy_copy_ram_to_ram(state, src_node, dst_node);
+				 break;
+#ifdef USE_CUDA
+			case CUDA_RAM:
+				/* CUBLAS_RAM -> RAM */
+				/* only the proper CUBLAS thread can initiate this ! */
+				if (get_local_memory_node() == src_node)
+				{
+					copy_cublas_to_ram(state, src_node, dst_node);
+				}
+				else
+				{
+					post_data_request(state, src_node, dst_node);
+				}
+				break;
+#endif
+			case SPU_LS:
+				STARPU_ASSERT(0); // TODO
+				break;
+			case UNUSED:
+				printf("error node %d UNUSED\n", src_node);
+				/* fall through: report the bogus node, then abort */
+			default:
+				assert(0);
+				break;
+		}
+		break;
+#ifdef USE_CUDA
+	case CUDA_RAM:
+		switch (src_kind) {
+			case RAM:
+				/* RAM -> CUBLAS_RAM */
+				/* only the proper CUBLAS thread can initiate this ! */
+				STARPU_ASSERT(get_local_memory_node() == dst_node);
+				copy_ram_to_cublas(state, src_node, dst_node);
+				break;
+			case CUDA_RAM:
+			case SPU_LS:
+				STARPU_ASSERT(0); // TODO 
+				break;
+			case UNUSED:
+			default:
+				STARPU_ASSERT(0);
+				break;
+		}
+		break;
+#endif
+	case SPU_LS:
+		STARPU_ASSERT(0); // TODO
+		break;
+	case UNUSED:
+	default:
+		assert(0);
+		break;
+	}
+
+	return 0;
+}

+ 26 - 0
src/datawizard/interfaces/csr_interface.h

@@ -0,0 +1,26 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __CSR_INTERFACE_H__
+#define __CSR_INTERFACE_H__
+
+#include <stdint.h>
+
+/* this interface is used for Sparse matrices */
+
+#define CSR_INTERFACE	0x118502
+
+#endif // __CSR_INTERFACE_H__

+ 57 - 0
src/datawizard/interfaces/data_interface.h

@@ -0,0 +1,57 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DATA_INTERFACE_H__
+#define __DATA_INTERFACE_H__
+
+#include <stdio.h>
+
+#include <datawizard/data_parameters.h>
+#include "blas_interface.h"
+#include "vector_interface.h"
+#include "csr_interface.h"
+#include "csc_interface.h"
+#include "bcsr_interface.h"
+
+#ifdef USE_GORDON
+/* to get the gordon_strideSize_t data structure from gordon */
+#include <cell/gordon/gordon.h>
+#endif
+
+#include <starpu.h>
+
+struct starpu_data_state_t;
+
+/* Per-interface method table: each data interface (vector, blas, csr, ...)
+ * provides one of these so the DSM core can manage a piece of data without
+ * knowing its actual memory layout. */
+struct data_interface_ops_t {
+	/* allocate the buffer for the data on "node"; returns the number of
+	 * bytes actually allocated (0 on failure) */
+	size_t (*allocate_data_on_node)(struct starpu_data_state_t *state,
+					uint32_t node);
+	/* release a buffer previously allocated on "node" */
+	void (*liberate_data_on_node)(starpu_data_interface_t *interface,
+					uint32_t node);
+	/* copy the data content from node "src" to node "dst" */
+	int (*copy_data_1_to_1)(struct starpu_data_state_t *state, 
+					uint32_t src, uint32_t dst);
+	/* serialize the interface description into "buffer"; returns the
+	 * number of bytes written */
+	size_t (*dump_data_interface)(starpu_data_interface_t *interface, 
+					void *buffer);
+	/* size of the data content, in bytes */
+	size_t (*get_size)(struct starpu_data_state_t *state);
+	/* fold the data layout into the running hash "hstate" */
+	uint32_t (*footprint)(struct starpu_data_state_t *state, uint32_t hstate);
+	/* write a short human-readable summary of the data to f */
+	void (*display)(struct starpu_data_state_t *state, FILE *f);
+#ifdef USE_GORDON
+	/* translate the interface into a Gordon (Cell SPU) descriptor */
+	int (*convert_to_gordon)(starpu_data_interface_t *interface, uint64_t *ptr, gordon_strideSize_t *ss); 
+#endif
+	/* an identifier that is unique to each interface */
+	uint32_t interfaceid;
+};
+
+#endif // __DATA_INTERFACE_H__

+ 146 - 0
src/datawizard/interfaces/vector_filters.c

@@ -0,0 +1,146 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "vector_filters.h"
+#include "vector_interface.h"
+
+/* Partition a vector into f->filter_arg contiguous chunks of (roughly) equal
+ * size.  Returns the number of children actually created, which is at most
+ * min(nx, filter_arg).
+ *
+ * Fix: with chunk_size = ceil(nx/nchunks), the start offset chunk*chunk_size
+ * of the last chunk may lie past nx (e.g. nx=5, arg=4 => chunk_size=2 and
+ * chunk 3 starts at 6); the original "nx - chunk*chunk_size" then underflowed
+ * as uint32_t and produced an out-of-bounds child.  We now clamp such a
+ * child's length to 0. */
+unsigned starpu_block_filter_func_vector(starpu_filter *f, data_state *root_data)
+{
+	unsigned nchunks;
+	uint32_t arg = f->filter_arg;
+
+	starpu_vector_interface_t *vector_root = &root_data->interface[0].vector;
+	uint32_t nx = vector_root->nx;
+	size_t elemsize = vector_root->elemsize;
+
+	/* we will have arg chunks, but never more than there are elements */
+	nchunks = STARPU_MIN(nx, arg);
+
+	/* first allocate the children data_state */
+	root_data->children = calloc(nchunks, sizeof(data_state));
+	STARPU_ASSERT(root_data->children);
+
+	/* nominal chunk length, identical for every chunk (loop invariant) */
+	uint32_t chunk_size = (nx + nchunks - 1)/nchunks;
+
+	/* actually create all the chunks */
+	unsigned chunk;
+	for (chunk = 0; chunk < nchunks; chunk++)
+	{
+		size_t offset = chunk*chunk_size*elemsize;
+
+		/* clamp to the actual number of remaining elements, guarding
+		 * against unsigned underflow when the chunk would start beyond
+		 * the end of the vector */
+		uint32_t child_nx = (chunk*chunk_size < nx) ?
+			STARPU_MIN(chunk_size, nx - chunk*chunk_size) : 0;
+
+		unsigned node;
+		for (node = 0; node < MAXNODES; node++)
+		{
+			starpu_vector_interface_t *local = &root_data->children[chunk].interface[node].vector;
+
+			local->nx = child_nx;
+			local->elemsize = elemsize;
+
+			/* the child only gets a valid pointer on the nodes where
+			 * the parent is actually allocated */
+			if (root_data->per_node[node].allocated) {
+				local->ptr = root_data->interface[node].vector.ptr + offset;
+			}
+		}
+	}
+
+	return nchunks;
+}
+
+
+/* Split a vector into exactly two children: the first one holds the first
+ * f->filter_arg elements, the second one the remaining nx - filter_arg
+ * elements.  Requires filter_arg < nx (checked by assertion) so that the
+ * second child is non-empty.  Always returns 2. */
+unsigned starpu_divide_in_2_filter_func_vector(starpu_filter *f, data_state *root_data)
+{
+	uint32_t length_first = f->filter_arg;
+
+	starpu_vector_interface_t *vector_root = &root_data->interface[0].vector;
+	uint32_t nx = vector_root->nx;
+	size_t elemsize = vector_root->elemsize;
+
+	/* first allocate the children data_state */
+	root_data->children = calloc(2, sizeof(data_state));
+	STARPU_ASSERT(root_data->children);
+
+	STARPU_ASSERT(length_first < nx);
+
+	/* first child: elements [0, length_first) */
+	unsigned node;
+	for (node = 0; node < MAXNODES; node++)
+	{
+		starpu_vector_interface_t *local = &root_data->children[0].interface[node].vector;
+
+		local->nx = length_first;
+		local->elemsize = elemsize;
+
+		/* only set a pointer on nodes where the parent is allocated */
+		if (root_data->per_node[node].allocated) {
+			local->ptr = root_data->interface[node].vector.ptr;
+		}
+	}
+
+	/* second child: elements [length_first, nx) */
+	for (node = 0; node < MAXNODES; node++)
+	{
+		starpu_vector_interface_t *local = &root_data->children[1].interface[node].vector;
+
+		local->nx = nx - length_first;
+		local->elemsize = elemsize;
+
+		if (root_data->per_node[node].allocated) {
+			local->ptr = root_data->interface[node].vector.ptr + length_first*elemsize;
+		}
+	}
+
+	return 2;
+}
+
+/* Split a vector into f->filter_arg chunks whose individual lengths are
+ * taken from the array f->filter_arg_ptr; the lengths must sum up to the
+ * parent's nx (checked by assertion).  Returns the number of chunks. */
+unsigned starpu_list_filter_func_vector(starpu_filter *f, data_state *root_data)
+{
+	uint32_t nchunks = f->filter_arg;
+	uint32_t *length_tab = f->filter_arg_ptr;
+
+	starpu_vector_interface_t *vector_root = &root_data->interface[0].vector;
+	uint32_t nx = vector_root->nx;
+	size_t elemsize = vector_root->elemsize;
+
+	/* first allocate the children data_state */
+	root_data->children = calloc(nchunks, sizeof(data_state));
+	STARPU_ASSERT(root_data->children);
+
+	/* running start index (in elements) of the current chunk */
+	unsigned current_pos = 0;
+
+	/* actually create all the chunks */
+	unsigned chunk;
+	for (chunk = 0; chunk < nchunks; chunk++)
+	{
+		uint32_t chunk_size = length_tab[chunk];
+
+		unsigned node;
+		for (node = 0; node < MAXNODES; node++)
+		{
+			starpu_vector_interface_t *local = &root_data->children[chunk].interface[node].vector;
+
+			local->nx = chunk_size;
+			local->elemsize = elemsize;
+
+			/* only set a pointer on nodes where the parent is allocated */
+			if (root_data->per_node[node].allocated) {
+				local->ptr = root_data->interface[node].vector.ptr + current_pos*elemsize;
+			}
+		}
+
+		current_pos += chunk_size;
+	}
+
+	/* the chunk lengths must cover the whole vector, no more no less */
+	STARPU_ASSERT(current_pos == nx);
+
+	return nchunks;
+}

+ 22 - 0
src/datawizard/interfaces/vector_filters.h

@@ -0,0 +1,22 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __VECTOR_FILTERS_H__
+#define __VECTOR_FILTERS_H__
+
+#include <datawizard/hierarchy.h>
+
+#endif // __VECTOR_FILTERS_H__

+ 340 - 0
src/datawizard/interfaces/vector_interface.c

@@ -0,0 +1,340 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <datawizard/data_parameters.h>
+#include <datawizard/coherency.h>
+#include <datawizard/copy-driver.h>
+#include <datawizard/hierarchy.h>
+
+#include <common/hash.h>
+
+#include <starpu.h>
+
+#ifdef USE_CUDA
+#include <cuda.h>
+#endif
+
+size_t allocate_vector_buffer_on_node(data_state *state, uint32_t dst_node);
+void liberate_vector_buffer_on_node(starpu_data_interface_t *interface, uint32_t node);
+int do_copy_vector_buffer_1_to_1(data_state *state, uint32_t src_node, uint32_t dst_node);
+size_t dump_vector_interface(starpu_data_interface_t *interface, void *buffer);
+size_t vector_interface_get_size(struct starpu_data_state_t *state);
+uint32_t footprint_vector_interface_crc32(data_state *state, uint32_t hstate);
+void display_vector_interface(data_state *state, FILE *f);
+#ifdef USE_GORDON
+int convert_vector_to_gordon(starpu_data_interface_t *interface, uint64_t *ptr, gordon_strideSize_t *ss); 
+#endif
+
+/* method table registered for the vector interface (see data_interface.h) */
+struct data_interface_ops_t interface_vector_ops = {
+	.allocate_data_on_node = allocate_vector_buffer_on_node,
+	.liberate_data_on_node = liberate_vector_buffer_on_node,
+	.copy_data_1_to_1 = do_copy_vector_buffer_1_to_1,
+	.dump_data_interface = dump_vector_interface,
+	.get_size = vector_interface_get_size,
+	.footprint = footprint_vector_interface_crc32,
+#ifdef USE_GORDON
+	.convert_to_gordon = convert_vector_to_gordon,
+#endif
+	.interfaceid = VECTOR_INTERFACE,
+	.display = display_vector_interface
+};
+
+#ifdef USE_GORDON
+/* Fill in the Gordon (Cell SPU) descriptor for a vector: base pointer and
+ * total size in bytes (nx * elemsize).  Always returns 0. */
+int convert_vector_to_gordon(starpu_data_interface_t *interface, uint64_t *ptr, gordon_strideSize_t *ss) 
+{
+	STARPU_ASSERT(gordon_interface);
+
+	*ptr = (*interface).vector.ptr;
+	(*ss).size = (*interface).vector.nx * (*interface).vector.elemsize;
+
+	return 0;
+}
+#endif
+
+/* declare a new piece of data with the vector interface; the buffer at "ptr"
+ * (nx elements of elemsize bytes) is initially valid on home_node only */
+void starpu_monitor_vector_data(struct starpu_data_state_t **handle, uint32_t home_node,
+                        uintptr_t ptr, uint32_t nx, size_t elemsize)
+{
+	struct starpu_data_state_t *state = calloc(1, sizeof(struct starpu_data_state_t));
+	STARPU_ASSERT(state);
+
+	STARPU_ASSERT(handle);
+	*handle = state;
+
+	/* the geometry (nx, elemsize) is replicated on every node, but the
+	 * pointer is only valid on the home node */
+	unsigned node;
+	for (node = 0; node < MAXNODES; node++)
+	{
+		starpu_vector_interface_t *local_interface = &state->interface[node].vector;
+
+		if (node == home_node) {
+			local_interface->ptr = ptr;
+		}
+		else {
+			local_interface->ptr = 0;
+		}
+
+		local_interface->nx = nx;
+		local_interface->elemsize = elemsize;
+	}
+
+	state->ops = &interface_vector_ops;
+
+	/* hand the data over to the DSM */
+	monitor_new_data(state, home_node, 0);
+}
+
+
+/* Fold the layout of a vector (only its length nx; the element pointer is
+ * irrelevant to the layout) into the running hash "hstate", using the
+ * caller-supplied hash function. */
+static inline uint32_t footprint_vector_interface_generic(uint32_t (*hash_func)(uint32_t input, uint32_t hstate), data_state *state, uint32_t hstate)
+{
+	uint32_t hash;
+
+	hash = hstate;
+	hash = hash_func(starpu_get_vector_nx(state), hash);
+
+	return hash;
+}
+
+/* CRC32 instantiation of the generic vector footprint */
+uint32_t footprint_vector_interface_crc32(data_state *state, uint32_t hstate)
+{
+	return footprint_vector_interface_generic(crc32_be, state, hstate);
+}
+
+struct dumped_vector_interface_s {
+	uintptr_t ptr;
+	uint32_t nx;
+	uint32_t elemsize;
+} __attribute__ ((packed));
+
+/* Write a one-column summary of a vector data (its element count) to f.
+ * Fix: nx is uint32_t, so it must be printed with %u rather than %d
+ * (mismatched conversion specifiers are undefined behavior). */
+void display_vector_interface(data_state *state, FILE *f)
+{
+	starpu_vector_interface_t *interface;
+	interface =  &state->interface[0].vector;
+
+	fprintf(f, "%u\t", interface->nx);
+}
+
+
+/* Serialize a vector interface into the caller-provided buffer; returns the
+ * number of bytes written.  NOTE(review): elemsize is size_t in the live
+ * interface but uint32_t in struct dumped_vector_interface_s — this silently
+ * truncates on LP64; confirm elemsize always fits in 32 bits. */
+size_t dump_vector_interface(starpu_data_interface_t *interface, void *_buffer)
+{
+	/* yes, that's DIRTY ... */
+	struct dumped_vector_interface_s *buffer = _buffer;
+
+	buffer->ptr = (*interface).vector.ptr;
+	buffer->nx = (*interface).vector.nx;
+	buffer->elemsize = (*interface).vector.elemsize;
+
+	return (sizeof(struct dumped_vector_interface_s));
+}
+
+/* Size in bytes of the vector's content (nx * elemsize); the geometry is
+ * replicated on all nodes so node 0's copy is authoritative. */
+size_t vector_interface_get_size(struct starpu_data_state_t *state)
+{
+	size_t size;
+	starpu_vector_interface_t *interface;
+
+	interface =  &state->interface[0].vector;
+
+	size = interface->nx*interface->elemsize;
+
+	return size;
+}
+
+/* offer an access to the data parameters */
+
+/* number of elements in the vector (identical on every node) */
+uint32_t starpu_get_vector_nx(data_state *state)
+{
+	return (state->interface[0].vector.nx);
+}
+
+/* pointer to the vector's buffer on the calling worker's memory node;
+ * the data must already be allocated there (checked by assertion) */
+uintptr_t starpu_get_vector_local_ptr(data_state *state)
+{
+	unsigned node;
+	node = get_local_memory_node();
+
+	STARPU_ASSERT(state->per_node[node].allocated);
+
+	return (state->interface[node].vector.ptr);
+}
+
+/* memory allocation/deallocation primitives for the vector interface */
+
+/* returns the size of the allocated area */
+/* Allocate the vector's buffer on dst_node using the allocator matching the
+ * node kind (malloc for RAM, cublasAlloc for CUDA).  Returns the number of
+ * bytes allocated, or 0 on failure: both allocators leave addr at 0 when
+ * they fail, which is how failure is detected below. */
+size_t allocate_vector_buffer_on_node(data_state *state, uint32_t dst_node)
+{
+	uintptr_t addr = 0;
+	size_t allocated_memory;
+
+	uint32_t nx = state->interface[dst_node].vector.nx;
+	size_t elemsize = state->interface[dst_node].vector.elemsize;
+
+	node_kind kind = get_node_kind(dst_node);
+
+	switch(kind) {
+		case RAM:
+			addr = (uintptr_t)malloc(nx*elemsize);
+			break;
+#ifdef USE_CUDA
+		case CUDA_RAM:
+			/* on failure cublasAlloc leaves addr untouched (0) */
+			cublasAlloc(nx, elemsize, (void **)&addr);
+			break;
+#endif
+		default:
+			assert(0);
+	}
+
+	if (addr) {
+		/* allocation succeeded */
+		allocated_memory = nx*elemsize;
+
+		/* update the data properly in consequence */
+		state->interface[dst_node].vector.ptr = addr;
+	} else {
+		/* allocation failed */
+		allocated_memory = 0;
+	}
+	
+	return allocated_memory;
+}
+
+/* Release the vector buffer previously allocated on "node", using the
+ * deallocator matching the node kind. */
+void liberate_vector_buffer_on_node(starpu_data_interface_t *interface, uint32_t node)
+{
+	node_kind kind = get_node_kind(node);
+	switch(kind) {
+		case RAM:
+			free((void*)interface->vector.ptr);
+			break;
+#ifdef USE_CUDA
+		case CUDA_RAM:
+			cublasFree((void*)interface->vector.ptr);
+			break;
+#endif
+		default:
+			assert(0);
+	}
+}
+
+#ifdef USE_CUDA
+/* Download a vector from GPU memory (src_node) into host RAM (dst_node);
+ * must run on the thread bound to the GPU node. */
+static void copy_cublas_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_vector_interface_t *src_vector;
+	starpu_vector_interface_t *dst_vector;
+
+	src_vector = &state->interface[src_node].vector;
+	dst_vector = &state->interface[dst_node].vector;
+
+	/* unit stride on both sides: the vectors are contiguous */
+	cublasGetVector(src_vector->nx, src_vector->elemsize,
+		(uint8_t *)src_vector->ptr, 1,
+		(uint8_t *)dst_vector->ptr, 1);
+
+	TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
+}
+
+/* Upload a vector from host RAM (src_node) into GPU memory (dst_node);
+ * must run on the thread bound to the GPU node. */
+static void copy_ram_to_cublas(data_state *state, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_vector_interface_t *src_vector;
+	starpu_vector_interface_t *dst_vector;
+
+	src_vector = &state->interface[src_node].vector;
+	dst_vector = &state->interface[dst_node].vector;
+
+	cublasSetVector(src_vector->nx, src_vector->elemsize,
+		(uint8_t *)src_vector->ptr, 1,
+		(uint8_t *)dst_vector->ptr, 1);
+
+	TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
+}
+#endif // USE_CUDA
+
+/* RAM-to-RAM vector copy: a plain memcpy of nx*elemsize bytes between the
+ * two nodes' buffers (both live in host memory). */
+static void dummy_copy_ram_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
+{
+	uint32_t nx = state->interface[dst_node].vector.nx;
+	size_t elemsize = state->interface[dst_node].vector.elemsize;
+
+	uintptr_t ptr_src = state->interface[src_node].vector.ptr;
+	uintptr_t ptr_dst = state->interface[dst_node].vector.ptr;
+
+	memcpy((void *)ptr_dst, (void *)ptr_src, nx*elemsize);
+
+	TRACE_DATA_COPY(src_node, dst_node, nx*elemsize);
+}
+
+/* Copy the vector buffer of "state" from src_node to dst_node, dispatching
+ * on the kinds (RAM / CUDA_RAM / SPU_LS) of both memory nodes.  A GPU->RAM
+ * transfer may only be initiated by the thread bound to the GPU node; any
+ * other caller posts an asynchronous request instead.  Always returns 0
+ * (unsupported combinations are fatal assertions). */
+int do_copy_vector_buffer_1_to_1(data_state *state, uint32_t src_node, uint32_t dst_node)
+{
+	node_kind src_kind = get_node_kind(src_node);
+	node_kind dst_kind = get_node_kind(dst_node);
+
+	switch (dst_kind) {
+	case RAM:
+		switch (src_kind) {
+			case RAM:
+				/* RAM -> RAM */
+				 dummy_copy_ram_to_ram(state, src_node, dst_node);
+				 break;
+#ifdef USE_CUDA
+			case CUDA_RAM:
+				/* CUBLAS_RAM -> RAM */
+				/* only the proper CUBLAS thread can initiate this ! */
+				if (get_local_memory_node() == src_node)
+				{
+					/* only the proper CUBLAS thread can initiate this directly ! */
+					copy_cublas_to_ram(state, src_node, dst_node);
+				}
+				else
+				{
+					/* put a request to the corresponding GPU */
+					post_data_request(state, src_node, dst_node);
+				}
+				break;
+#endif
+			case SPU_LS:
+				STARPU_ASSERT(0); // TODO
+				break;
+			case UNUSED:
+				/* node numbers are unsigned: use %u, not %d */
+				printf("error node %u UNUSED\n", src_node);
+				/* fallthrough: an UNUSED source is fatal too */
+			default:
+				assert(0);
+				break;
+		}
+		break;
+#ifdef USE_CUDA
+	case CUDA_RAM:
+		switch (src_kind) {
+			case RAM:
+				/* RAM -> CUBLAS_RAM */
+				/* only the proper CUBLAS thread can initiate this ! */
+				STARPU_ASSERT(get_local_memory_node() == dst_node);
+				copy_ram_to_cublas(state, src_node, dst_node);
+				break;
+			case CUDA_RAM:
+			case SPU_LS:
+				STARPU_ASSERT(0); // TODO 
+				break;
+			case UNUSED:
+			default:
+				STARPU_ASSERT(0);
+				break;
+		}
+		break;
+#endif
+	case SPU_LS:
+		STARPU_ASSERT(0); // TODO
+		break;
+	case UNUSED:
+	default:
+		assert(0);
+		break;
+	}
+
+	return 0;
+}
+

+ 24 - 0
src/datawizard/interfaces/vector_interface.h

@@ -0,0 +1,24 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __VECTOR_INTERFACE_H__
+#define __VECTOR_INTERFACE_H__
+
+#include <stdint.h>
+
+#define VECTOR_INTERFACE   0x118503
+
+#endif // __VECTOR_INTERFACE_H__

+ 537 - 0
src/datawizard/memalloc.c

@@ -0,0 +1,537 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "memalloc.h"
+#include <datawizard/footprint.h>
+
+extern mem_node_descr descr;
+static starpu_mutex mc_mutex[MAXNODES]; 
+static mem_chunk_list_t mc_list[MAXNODES];
+static mem_chunk_list_t mc_list_to_free[MAXNODES];
+
+/* Initialize, for every memory node, the mutex and the two chunk lists:
+ * mc_list (chunks currently backing live data) and mc_list_to_free (chunks
+ * whose removal was requested).  Must run before any allocation. */
+void init_mem_chunk_lists(void)
+{
+	unsigned i;
+	for (i = 0; i < MAXNODES; i++)
+	{
+		init_mutex(&mc_mutex[i]);
+		mc_list[i] = mem_chunk_list_new();
+		mc_list_to_free[i] = mem_chunk_list_new();
+	}
+}
+
+/* Take the header lock of every leaf under "data".  While spinning on a
+ * leaf's lock we keep servicing datawizard requests for the local node, so
+ * the thread holding the lock can make progress and release it. */
+static void lock_all_subtree(data_state *data)
+{
+	if (data->nchildren == 0)
+	{
+		/* this is a leaf */	
+		while (take_mutex_try(&data->header_lock))
+			datawizard_progress(get_local_memory_node());
+	}
+	else {
+		/* lock all sub-subtrees children */
+		int child;
+		for (child = 0; child < data->nchildren; child++)
+		{
+			lock_all_subtree(&data->children[child]);
+		}
+	}
+}
+
+/* Release the header lock of every leaf under "data", walking the children
+ * in reverse order (the opposite of lock_all_subtree). */
+static void unlock_all_subtree(data_state *data)
+{
+	if (data->nchildren == 0)
+	{
+		/* this is a leaf */	
+		release_mutex(&data->header_lock);
+	}
+	else {
+		/* unlock the children in reverse of the locking order */
+		int child;
+		for (child = data->nchildren - 1; child >= 0; child--)
+		{
+			unlock_all_subtree(&data->children[child]);
+		}
+	}
+}
+
+/* Return 1 if every leaf of the subtree has a zero reference count on
+ * "node" (i.e. nobody is using the buffers there), 0 otherwise.  The caller
+ * must hold the leaves' header locks (see lock_all_subtree). */
+unsigned may_free_subtree(data_state *data, unsigned node)
+{
+	if (data->nchildren == 0)
+	{
+		/* we only free if no one refers to the leaf */
+		uint32_t refcnt = get_data_refcnt(data, node);
+		return (refcnt == 0);
+	}
+	else {
+		/* every child must itself be freeable */
+		int child;
+		for (child = 0; child < data->nchildren; child++)
+		{
+			unsigned res;
+			res = may_free_subtree(&data->children[child], node);
+			if (!res) return 0;
+		}
+
+		/* no problem was found */
+		return 1;
+	}
+}
+
+/* Release the buffer behind "mc" on "node", unlink the chunk from the
+ * node's active list and destroy the chunk descriptor.  Returns the number
+ * of bytes actually liberated.  Caller must hold mc_mutex[node]. */
+size_t do_free_mem_chunk(mem_chunk_t mc, unsigned node)
+{
+	size_t size;
+
+	/* free the actual buffer */
+	size = liberate_memory_on_node(mc, node);
+
+	/* remove the mem_chunk from the list */
+	mem_chunk_list_erase(mc_list[node], mc);
+	mem_chunk_delete(mc);
+
+	return size; 
+}
+
+/* Make sure no leaf of the subtree keeps its only valid copy on src_node:
+ * an OWNER copy is physically copied to dst_node (which becomes the new
+ * owner), a SHARED copy is simply invalidated (some other node still has
+ * one; if a single sharer remains it is promoted to OWNER).  The caller
+ * must hold the leaves' header locks. */
+void transfer_subtree_to_node(data_state *data, unsigned src_node, 
+						unsigned dst_node)
+{
+	unsigned i;
+	unsigned last = 0;
+	unsigned cnt;
+	int ret;
+
+	if (data->nchildren == 0)
+	{
+		/* this is a leaf */
+		switch(data->per_node[src_node].state) {
+		case OWNER:
+			/* the local node has the only copy */
+			/* the owner is now the destination_node */
+			data->per_node[src_node].state = INVALID;
+			data->per_node[dst_node].state = OWNER;
+
+			ret = driver_copy_data_1_to_1(data, src_node, dst_node, 0);
+			STARPU_ASSERT(ret == 0);
+
+			break;
+		case SHARED:
+			/* some other node may have the copy */
+			data->per_node[src_node].state = INVALID;
+
+			/* count the number of copies */
+			cnt = 0;
+			for (i = 0; i < MAXNODES; i++)
+			{
+				if (data->per_node[i].state == SHARED) {
+					cnt++; 
+					last = i;
+				}
+			}
+
+			/* a single remaining sharer becomes the owner */
+			if (cnt == 1)
+				data->per_node[last].state = OWNER;
+
+			break;
+		case INVALID:
+			/* nothing to be done */
+			break;
+		default:
+			STARPU_ASSERT(0);
+			break;
+		}
+	}
+	else {
+		/* recursively transfer every child's subtree */
+		int child;
+		for (child = 0; child < data->nchildren; child++)
+		{
+			transfer_subtree_to_node(&data->children[child],
+							src_node, dst_node);
+		}
+	}
+}
+
+
+/* Try to liberate the buffer behind "mc" on "node": lock the data's
+ * subtree, and if no leaf is referenced, push the content back to node 0
+ * (main memory) and free the buffer.  Returns the number of bytes
+ * liberated (0 if the data was busy). */
+static size_t try_to_free_mem_chunk(mem_chunk_t mc, unsigned node, unsigned attempts)
+{
+	size_t liberated = 0;
+
+	data_state *data;
+
+	data = mc->data;
+
+	STARPU_ASSERT(data);
+
+	if (attempts == 0)
+	{
+		/* this is the first attempt to free memory
+		   so we avoid to drop requested memory */
+		/* TODO */
+	}
+
+	/* try to lock all the leafs of the subtree */
+	lock_all_subtree(data);
+
+	/* check if they are all "free" */
+	if (may_free_subtree(data, node))
+	{
+		/* in case there was nobody using that buffer, throw it 
+		 * away after writing it back to main memory */
+		transfer_subtree_to_node(data, node, 0);
+
+		/* now the actual buffer may be liberated */
+		liberated = do_free_mem_chunk(mc, node);
+	}
+
+	/* unlock the leafs */
+	unlock_all_subtree(data);
+
+	return liberated;
+}
+
+#ifdef USE_ALLOCATION_CACHE
+/* Reassign the buffer behind "mc" from its old data to "new_data".
+ * "is_already_in_mc_list" tells whether mc currently sits in the active
+ * list mc_list[node] (1) or in mc_list_to_free[node] (0): in the latter
+ * case it is moved back into the active list.  The buffer geometry
+ * (mc->ops, mc->size, mc->footprint, mc->interface) is reused unchanged.
+ * we assume that mc_mutex[node] is taken */
+static void reuse_mem_chunk(unsigned node, data_state *new_data, mem_chunk_t mc, unsigned is_already_in_mc_list)
+{
+	data_state *old_data;
+	old_data = mc->data;
+
+	/* we found an appropriate mem chunk: so we get it out
+	 * of the "to free" list, and reassign it to the new
+	 * piece of data */
+
+	if (!is_already_in_mc_list)
+	{
+		mem_chunk_list_erase(mc_list_to_free[node], mc);
+	}
+
+	/* the old data loses its buffer on this node (unless it was already
+	 * deleted, in which case its state must not be touched) */
+	if (!mc->data_was_deleted)
+	{
+		old_data->per_node[node].allocated = 0;
+		old_data->per_node[node].automatically_allocated = 0;
+	}
+
+	new_data->per_node[node].allocated = 1;
+	new_data->per_node[node].automatically_allocated = 1;
+
+	memcpy(&new_data->interface[node], &mc->interface, sizeof(starpu_data_interface_t));
+
+	mc->data = new_data;
+	mc->data_was_deleted = 0;
+	/* mc->ops, mc->size, mc->footprint and mc->interface should be
+ 	 * unchanged ! */
+	
+	/* reinsert the mem chunk in the list of active memory chunks */
+	if (!is_already_in_mc_list)
+	{
+		mem_chunk_list_push_front(mc_list[node], mc);
+	}
+}
+
+
+
+/* Try to steal the buffer of a still-live data for "new_data": lock the old
+ * data's subtree, and if none of its leaves are referenced on "node", push
+ * its content back to main memory and hand the buffer over.  Returns 1 on
+ * success, 0 if the old data was busy. */
+static unsigned try_to_reuse_mem_chunk(mem_chunk_t mc, unsigned node, data_state *new_data, unsigned is_already_in_mc_list)
+{
+	unsigned success = 0;
+
+	data_state *old_data;
+
+	old_data = mc->data;
+
+	STARPU_ASSERT(old_data);
+
+	/* try to lock all the leafs of the subtree */
+	lock_all_subtree(old_data);
+
+	/* check if they are all "free" */
+	if (may_free_subtree(old_data, node))
+	{
+		success = 1;
+
+		/* in case there was nobody using that buffer, throw it 
+		 * away after writing it back to main memory */
+		transfer_subtree_to_node(old_data, node, 0);
+
+		/* now replace the previous data */
+		reuse_mem_chunk(node, new_data, mc, is_already_in_mc_list);
+	}
+
+	/* unlock the leafs */
+	unlock_all_subtree(old_data);
+
+	return success;
+}
+
+/* this function looks for a memory chunk that matches a given footprint in the
+ * list of mem chunk that need to be liberated; failing that, it tries the
+ * "not important" data in the active list.  Returns 1 when a chunk was
+ * reused for "data", 0 otherwise. */
+static unsigned try_to_find_reusable_mem_chunk(unsigned node, data_state *data, uint32_t footprint)
+{
+	take_mutex(&mc_mutex[node]);
+
+	/* go through all buffers for which there was a removal request */
+	mem_chunk_t mc, next_mc;
+	for (mc = mem_chunk_list_begin(mc_list_to_free[node]);
+	     mc != mem_chunk_list_end(mc_list_to_free[node]);
+	     mc = next_mc)
+	{
+		next_mc = mem_chunk_list_next(mc);
+
+		if (mc->footprint == footprint)
+		{
+
+			data_state *old_data;
+			old_data = mc->data;
+
+			if (old_data->per_node[node].allocated &&
+					old_data->per_node[node].automatically_allocated)
+			{
+				reuse_mem_chunk(node, data, mc, 0);
+
+				release_mutex(&mc_mutex[node]);
+				return 1;
+			}
+		}
+
+	}
+
+	/* now look for some non essential data in the active list */
+	for (mc = mem_chunk_list_begin(mc_list[node]);
+	     mc != mem_chunk_list_end(mc_list[node]);
+	     mc = next_mc)
+	{
+		/* there is a risk that the memory chunk is liberated 
+		   before next iteration starts: so we compute the next
+		   element of the list now */
+		next_mc = mem_chunk_list_next(mc);
+
+		if (mc->data->is_not_important && (mc->footprint == footprint))
+		{
+//			fprintf(stderr, "found a candidate ...\n");
+			if (try_to_reuse_mem_chunk(mc, node, data, 1))
+			{
+				release_mutex(&mc_mutex[node]);
+				return 1;
+			}
+		}
+	}
+
+	release_mutex(&mc_mutex[node]);
+
+	return 0;
+}
+#endif
+
+/* 
+ * Try to free some memory on the specified node: first drop every buffer
+ * whose removal was explicitly requested, then try to liberate allocated
+ * but currently unreferenced chunks.
+ * 	returns the number of bytes liberated (0 if nothing was released)
+ */
+static size_t reclaim_memory(uint32_t node, size_t toreclaim __attribute__ ((unused)), unsigned attempts)
+{
+//	fprintf(stderr, "reclaim memory...\n");
+
+	size_t liberated = 0;
+
+	take_mutex(&mc_mutex[node]);
+
+	/* remove all buffers for which there was a removal request */
+	mem_chunk_t mc, next_mc;
+	for (mc = mem_chunk_list_begin(mc_list_to_free[node]);
+	     mc != mem_chunk_list_end(mc_list_to_free[node]);
+	     mc = next_mc)
+	{
+		next_mc = mem_chunk_list_next(mc);
+
+		liberated += liberate_memory_on_node(mc, node);
+
+		mem_chunk_list_erase(mc_list_to_free[node], mc);
+
+		mem_chunk_delete(mc);
+	}
+
+	/* try to free all allocated data potentially in use .. XXX */
+	for (mc = mem_chunk_list_begin(mc_list[node]);
+	     mc != mem_chunk_list_end(mc_list[node]);
+	     mc = next_mc)
+	{
+		/* there is a risk that the memory chunk is liberated 
+		   before next iteration starts: so we compute the next
+		   element of the list now */
+		next_mc = mem_chunk_list_next(mc);
+
+		liberated += try_to_free_mem_chunk(mc, node, attempts);
+		#if 0
+		if (liberated > toreclaim)
+			break;
+		#endif
+	}
+
+//	fprintf(stderr, "got %d MB back\n", (int)liberated/(1024*1024));
+
+	release_mutex(&mc_mutex[node]);
+
+	return liberated;
+}
+
+/* Record a freshly allocated buffer of "state" on dst_node as a mem_chunk
+ * in the node's active list, snapshotting the layout (footprint, ops and a
+ * copy of the interface) so the buffer can later be reused even if the data
+ * itself is deleted. */
+static void register_mem_chunk(data_state *state, uint32_t dst_node, size_t size, unsigned automatically_allocated)
+{
+	mem_chunk_t mc = mem_chunk_new();
+
+	STARPU_ASSERT(state);
+	STARPU_ASSERT(state->ops);
+
+	mc->data = state;
+	mc->size = size; 
+	mc->footprint = compute_data_footprint(state);
+	mc->ops = state->ops;
+	mc->data_was_deleted = 0;
+	mc->automatically_allocated = automatically_allocated;
+
+	/* the interface was already filled by ops->allocate_data_on_node */
+	memcpy(&mc->interface, &state->interface[dst_node], sizeof(starpu_data_interface_t));
+
+	take_mutex(&mc_mutex[dst_node]);
+	mem_chunk_list_push_front(mc_list[dst_node], mc);
+	release_mutex(&mc_mutex[dst_node]);
+}
+
+/* Mark the mem chunk backing "state" on "node" for removal: it is moved
+ * from the active list into the "to free" list, where reclaim_memory or
+ * the allocation cache will eventually dispose of it.  Doing nothing when
+ * no matching chunk exists is deliberate (the data may not have been
+ * allocated on this node). */
+void request_mem_chunk_removal(data_state *state, unsigned node)
+{
+	take_mutex(&mc_mutex[node]);
+
+	/* iterate over the list of memory chunks and remove the entry */
+	mem_chunk_t mc, next_mc;
+	for (mc = mem_chunk_list_begin(mc_list[node]);
+	     mc != mem_chunk_list_end(mc_list[node]);
+	     mc = next_mc)
+	{
+		next_mc = mem_chunk_list_next(mc);
+
+		if (mc->data == state) {
+			/* we found the data */
+			mc->data_was_deleted = 1;
+
+			/* remove it from the main list */
+			mem_chunk_list_erase(mc_list[node], mc);
+
+			/* put it in the list of buffers to be removed */
+			mem_chunk_list_push_front(mc_list_to_free[node], mc);
+
+			release_mutex(&mc_mutex[node]);
+
+			return;
+		}
+	}
+
+	/* there was no corresponding buffer ... */
+
+	release_mutex(&mc_mutex[node]);
+}
+
+/* Release the buffer behind "mc" on "node" through the interface's
+ * liberate method, but only if it was automatically allocated by the DSM
+ * (user-provided buffers are never freed here).  Returns the number of
+ * bytes actually liberated. */
+size_t liberate_memory_on_node(mem_chunk_t mc, uint32_t node)
+{
+	size_t liberated = 0;
+
+	STARPU_ASSERT(mc->ops);
+	STARPU_ASSERT(mc->ops->liberate_data_on_node);
+
+	if (mc->automatically_allocated)
+	{
+		mc->ops->liberate_data_on_node(&mc->interface, node);
+
+		/* if the data was deleted, its state no longer exists and
+		 * must not be written to */
+		if (!mc->data_was_deleted)
+		{
+			data_state *state = mc->data;
+
+			state->per_node[node].allocated = 0;
+
+			/* XXX why do we need that ? */
+			state->per_node[node].automatically_allocated = 0;
+		}
+
+		liberated = mc->size;
+	}
+
+	return liberated;
+}
+
+/*
+ * In order to allocate a piece of data, we try to reuse existing buffers if
+ * its possible.
+ *	1 - we try to reuse a memchunk that is explicitely unused.
+ *	2 - we go through the list of memory chunks and find one that is not
+ *	referenced and that has the same footprint to reuse it.
+ *	3 - we call the usual driver's alloc method
+ *	4 - we go through the list of memory chunks and release those that are
+ *	not referenced (or part of those).
+ *
+ * Returns 0 on success, -ENOMEM when no memory could be obtained.
+ *
+ * Fix: TRACE_END_ALLOC_REUSE was previously only emitted on the cache-miss
+ * path, leaving an unmatched START event in the trace whenever the
+ * allocation cache hit; the END event is now emitted on both paths.
+ */
+int allocate_memory_on_node(data_state *state, uint32_t dst_node)
+{
+	unsigned attempts = 0;
+	size_t allocated_memory;
+
+	STARPU_ASSERT(state);
+
+	data_allocation_inc_stats(dst_node);
+
+#ifdef USE_ALLOCATION_CACHE
+	/* perhaps we can directly reuse a buffer in the free-list */
+	uint32_t footprint = compute_data_footprint(state);
+
+	TRACE_START_ALLOC_REUSE(dst_node);
+	if (try_to_find_reusable_mem_chunk(dst_node, state, footprint))
+	{
+		/* close the trace interval before the early return */
+		TRACE_END_ALLOC_REUSE(dst_node);
+		allocation_cache_hit(dst_node);
+		return 0;
+	}
+	TRACE_END_ALLOC_REUSE(dst_node);
+#endif
+
+	/* fall back to the driver's allocator, reclaiming memory and
+	 * retrying a couple of times on failure */
+	do {
+		STARPU_ASSERT(state->ops);
+		STARPU_ASSERT(state->ops->allocate_data_on_node);
+
+		TRACE_START_ALLOC(dst_node);
+		allocated_memory = state->ops->allocate_data_on_node(state, dst_node);
+		TRACE_END_ALLOC(dst_node);
+
+		if (!allocated_memory) {
+			/* XXX perhaps we should find the proper granularity 
+			 * not to waste our cache all the time */
+			STARPU_ASSERT(state->ops->get_size);
+			size_t data_size = state->ops->get_size(state);
+
+			TRACE_START_MEMRECLAIM(dst_node);
+			reclaim_memory(dst_node, 2*data_size, attempts);
+			TRACE_END_MEMRECLAIM(dst_node);
+		}
+		
+	} while(!allocated_memory && attempts++ < 2);
+
+	/* perhaps we could really not handle that capacity misses */
+	if (!allocated_memory)
+		goto nomem;
+
+	register_mem_chunk(state, dst_node, allocated_memory, 1);
+
+	state->per_node[dst_node].allocated = 1;
+	state->per_node[dst_node].automatically_allocated = 1;
+
+	return 0;
+nomem:
+	STARPU_ASSERT(!allocated_memory);
+	return -ENOMEM;
+}

+ 51 - 0
src/datawizard/memalloc.h

@@ -0,0 +1,51 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __MEMALLOC_H__
+#define __MEMALLOC_H__
+
+#include <common/list.h>
+#include <datawizard/interfaces/data_interface.h>
+#include <datawizard/coherency.h>
+#include <datawizard/copy-driver.h>
+#include <datawizard/progress.h>
+
+struct starpu_data_state_t;
+
+/* A memory chunk: one buffer allocated on a memory node, tracked so that
+ * it can later be reused (allocation cache) or reclaimed. */
+LIST_TYPE(mem_chunk,
+	struct starpu_data_state_t *data;
+	size_t size;
+
+	/* hash of the data layout, used for fast reuse lookups */
+	uint32_t footprint;
+	
+	/* The footprint of the data is not sufficient to determine whether two
+	 * pieces of data have the same layout (there could be collisions in the
+	 * hash function ...) so we still keep a copy of the actual layout (ie.
+	 * the starpu_data_interface_t) to stay on the safe side. We make a copy
+	 * because when a data is deleted, the memory chunk remains.
+	 */
+	struct data_interface_ops_t *ops;
+	starpu_data_interface_t interface;
+	unsigned automatically_allocated;
+	unsigned data_was_deleted;
+);
+
+/* Initialise the per-node memory chunk lists. */
+void init_mem_chunk_lists(void);
+/* Schedule the chunk backing `state' on `node' for removal. */
+void request_mem_chunk_removal(struct starpu_data_state_t *state, unsigned node);
+/* Allocate a buffer for `state' on `dst_node'; 0 on success, -ENOMEM otherwise. */
+int allocate_memory_on_node(struct starpu_data_state_t *state, uint32_t dst_node);
+/* Free chunk `mc' on `node'; presumably returns the amount of memory
+ * released -- confirm against the implementation. */
+size_t liberate_memory_on_node(mem_chunk_t mc, uint32_t node);
+
+#endif

+ 49 - 0
src/datawizard/progress.c

@@ -0,0 +1,49 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <core/workers.h>
+#include <datawizard/progress.h>
+#include <datawizard/data_request.h>
+
+extern pthread_key_t local_workers_key;
+
+#ifdef USE_GORDON
+extern void handle_terminated_job_per_worker(struct worker_s *worker);
+extern struct starpu_mutex_t terminated_list_mutexes[32]; 
+#endif
+
+/* Make progress on pending data movement for `memory_node'.  Called by
+ * drivers while they wait (e.g. spinning on a lock) so that data requests
+ * issued by other workers are eventually serviced. */
+void datawizard_progress(uint32_t memory_node)
+{
+	/* in case some other driver requested data */
+	handle_node_data_requests(memory_node);
+
+#ifdef USE_GORDON
+	/* XXX quick and dirty !! */
+	struct worker_set_s *set;
+	set = pthread_getspecific(local_workers_key);
+	if (set) {
+		/* make the corresponding workers progress */
+		unsigned worker;
+		for (worker = 0; worker < set->nworkers; worker++)
+		{
+			/* NOTE(review): always takes mutex [0] regardless of
+			 * `worker' -- looks like coarse locking on purpose,
+			 * but confirm it should not be [worker]. */
+			take_mutex(&terminated_list_mutexes[0]);
+			handle_terminated_job_per_worker(&set->workers[worker]);
+			release_mutex(&terminated_list_mutexes[0]);
+		}
+	}
+#endif
+}

+ 24 - 0
src/datawizard/progress.h

@@ -0,0 +1,24 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DW_PROGRESS_H__
+#define __DW_PROGRESS_H__
+
+#include <stdint.h>
+
+void datawizard_progress(uint32_t memory_node);
+
+#endif

+ 76 - 0
src/datawizard/write_back.c

@@ -0,0 +1,76 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <datawizard/write_back.h>
+#include <datawizard/coherency.h>
+
+/* Commit the copy of `state' held by `requesting_node' onto every node
+ * selected in `write_through_mask'.  Afterwards all nodes in the mask,
+ * requesting node included, hold the data in the SHARED state.  No-op
+ * when the mask selects nothing but the requesting node itself. */
+void write_through_data(data_state *state, uint32_t requesting_node, 
+					   uint32_t write_through_mask)
+{
+	if ((write_through_mask & ~(1<<requesting_node)) == 0) {
+		/* nothing will be done ... */
+		return;
+	}
+
+	/* spin for the header lock, servicing data requests meanwhile so
+	 * that we do not stall progress on this node */
+	while (take_mutex_try(&state->header_lock))
+		datawizard_progress(requesting_node);
+
+	/* first commit all changes onto the nodes specified by the mask */
+	uint32_t node;
+	for (node = 0; node < MAXNODES; node++)
+	{
+		if (write_through_mask & (1<<node)) {
+			/* we need to commit the buffer on that node */
+			if (node != requesting_node) 
+			{
+				/* the requesting node already has the data by
+				 * definition */
+				int ret;
+				ret = driver_copy_data_1_to_1(state, 
+						requesting_node, node, 0);
+
+				/* there must remain memory on the write-through mask to honor the request */
+				if (ret)
+					STARPU_ASSERT(0);
+			}
+				
+			/* now the data is shared among the nodes on the
+			 * write_through_mask */
+			state->per_node[node].state = SHARED;
+		}
+	}
+
+	/* the requesting node is now one sharer */
+	if (write_through_mask & ~(1<<requesting_node))
+	{
+		state->per_node[requesting_node].state = SHARED;
+	}
+
+	release_mutex(&state->header_lock);
+}
+
+/* Set the write-back mask of `data': each bit selects a memory node onto
+ * which modifications must be written through.  The mask is propagated
+ * recursively to all children of the data. */
+void data_set_wb_mask(data_state *data, uint32_t wb_mask)
+{
+	data->wb_mask = wb_mask;
+
+	/* in case the data has some children, set their wb_mask as well */
+	if (data->nchildren > 0) 
+	{
+		int child;
+		for (child = 0; child < data->nchildren; child++)
+			data_set_wb_mask(&data->children[child], wb_mask);
+	}
+}

+ 27 - 0
src/datawizard/write_back.h

@@ -0,0 +1,27 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DW_WRITE_BACK_H__
+#define __DW_WRITE_BACK_H__
+
+#include <datawizard/coherency.h>
+
+void write_through_data(data_state *state, uint32_t requesting_node, 
+					   uint32_t write_through_mask);
+void data_set_wb_mask(data_state *state, uint32_t wb_mask);
+
+
+#endif // __DW_WRITE_BACK_H__

+ 31 - 0
src/drivers/core/Makefile

@@ -0,0 +1,31 @@
+#
+# StarPU
+# Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+# pick up project-wide headers from the src/ root
+CFLAGS += -I../../
+
+OBJS := driver_core.o
+
+all: $(OBJS)
+
+# generate and include auto-dependency files, except when only cleaning
+ifeq ($(filter ${MAKECMDGOALS},clean distclean),)
+%.d: %.c
+	$(CC) $(CFLAGS) $< -MM -o $*.d
+
+-include $(OBJS:.o=.d)
+endif
+
+# also remove gcov coverage artifacts
+clean:
+	@rm -f *.o *.d *.gcno *.gcda 

+ 157 - 0
src/drivers/core/driver_core.c

@@ -0,0 +1,157 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "driver_core.h"
+#include <core/policies/sched_policy.h>
+
+/* Execute job `j' on the calling CPU core.  Fetches the codelet's input
+ * buffers, runs its core function, then pushes the outputs back.  When
+ * the performance model is calibrating (or BENCHMARK_COMM is set), the
+ * computation and communication delays are measured and accumulated on
+ * the worker's queue, and fed to the history-based performance model.
+ * Returns STARPU_SUCCESS on completion, or STARPU_TRYAGAIN when the
+ * inputs could not be fetched (not enough memory) so the caller should
+ * push the job back onto the queue. */
+int execute_job_on_core(job_t j, struct worker_s *core_args)
+{
+	int ret;
+	tick_t codelet_start, codelet_end;
+	tick_t codelet_start_comm, codelet_end_comm;
+
+	unsigned calibrate_model = 0;
+	struct starpu_task *task = j->task;
+
+	STARPU_ASSERT(task->cl);
+	STARPU_ASSERT(task->cl->core_func);
+
+	if (task->cl->model && task->cl->model->benchmarking)
+		calibrate_model = 1;
+
+	if (calibrate_model || BENCHMARK_COMM)
+		GET_TICK(codelet_start_comm);
+
+	ret = fetch_codelet_input(task->buffers, task->interface,
+			task->cl->nbuffers, 0);
+
+	if (calibrate_model || BENCHMARK_COMM)
+		GET_TICK(codelet_end_comm);
+
+	if (ret != 0) {
+		/* there was not enough memory so the codelet cannot be executed right now ... */
+		/* push the codelet back and try another one ... */
+		return STARPU_TRYAGAIN;
+	}
+
+	TRACE_START_CODELET_BODY(j);
+
+	if (calibrate_model || BENCHMARK_COMM)
+		GET_TICK(codelet_start);
+
+	/* run the actual computation */
+	cl_func func = task->cl->core_func;
+	func(task->interface, task->cl_arg);
+	
+	if (calibrate_model || BENCHMARK_COMM)
+		GET_TICK(codelet_end);
+
+	TRACE_END_CODELET_BODY(j);
+
+	push_codelet_output(task->buffers, task->cl->nbuffers, 0);
+
+//#ifdef MODEL_DEBUG
+	if (calibrate_model || BENCHMARK_COMM)
+	{
+		double measured = timing_delay(&codelet_start, &codelet_end);
+		double measured_comm = timing_delay(&codelet_start_comm, &codelet_end_comm);
+
+//		fprintf(stderr, "%d\t%d\n", (int)j->penality, (int)measured_comm);
+		/* accumulate per-queue timing statistics */
+		core_args->jobq->total_computation_time += measured;
+		core_args->jobq->total_communication_time += measured_comm;
+
+		if (calibrate_model)
+			update_perfmodel_history(j, core_args->arch, measured);
+	}
+//#endif
+
+	return STARPU_SUCCESS;
+}
+
+/* Thread entry point for a CPU worker.  Binds the thread to its core,
+ * registers it with the tracing and memory-node subsystems, signals the
+ * main thread it is ready, then loops popping and executing tasks until
+ * the machine shuts down. */
+void *core_worker(void *arg)
+{
+	struct worker_s *core_arg = arg;
+
+#ifdef USE_FXT
+	fxt_register_thread(core_arg->bindid);
+#endif
+	TRACE_NEW_WORKER(FUT_CORE_KEY, core_arg->memory_node);
+
+#ifndef DONTBIND
+	/* fix the thread on the correct cpu */
+	cpu_set_t aff_mask; 
+	CPU_ZERO(&aff_mask);
+	CPU_SET(core_arg->bindid, &aff_mask);
+	sched_setaffinity(0, sizeof(aff_mask), &aff_mask);
+#endif
+
+#ifdef VERBOSE
+        fprintf(stderr, "core worker %d is ready on logical core %d\n", core_arg->id, core_arg->bindid);
+#endif
+
+	set_local_memory_node_key(&core_arg->memory_node);
+
+	set_local_queue(core_arg->jobq);
+
+	/* this is only useful (and meaningful) if there is a single
+	   memory node "related" to that queue */
+	core_arg->jobq->memory_node = core_arg->memory_node;
+
+	core_arg->jobq->total_computation_time = 0.0;
+	core_arg->jobq->total_communication_time = 0.0;
+	
+        /* tell the main thread that we are ready */
+        sem_post(&core_arg->ready_sem);
+
+        job_t j;
+	int res;
+
+	while (machine_is_running())
+	{
+                j = pop_task();
+                if (j == NULL) continue;
+
+		/* can a core perform that task ? */
+		if (!CORE_MAY_PERFORM(j)) 
+		{
+			/* put it at the end of the queue ... XXX */
+			push_task(j);
+			continue;
+		}
+
+                res = execute_job_on_core(j, core_arg);
+		if (res != STARPU_SUCCESS) {
+			switch (res) {
+				/* NOTE(review): with NDEBUG the assert is a
+				 * no-op and STARPU_FATAL falls through to the
+				 * TRYAGAIN case -- confirm this is intended. */
+				case STARPU_FATAL:
+					assert(0);
+				case STARPU_TRYAGAIN:
+					push_task(j);
+					continue;
+				default: 
+					assert(0);
+			}
+		}
+
+		handle_job_termination(j);
+        }
+
+#ifdef DATA_STATS
+	/* NOTE(review): "\%" is not a standard C escape sequence; "%%" alone
+	 * is the portable way to print a literal '%' -- worth fixing. */
+	fprintf(stderr, "CORE #%d computation %le comm %le (%lf \%%)\n", core_arg->id, core_arg->jobq->total_computation_time, core_arg->jobq->total_communication_time,  core_arg->jobq->total_communication_time*100.0/core_arg->jobq->total_computation_time);
+#endif
+
+	TRACE_WORKER_TERMINATED(FUT_CORE_KEY);
+
+	pthread_exit(NULL);
+}

+ 39 - 0
src/drivers/core/driver_core.h

@@ -0,0 +1,39 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DRIVER_CORE_H__
+#define __DRIVER_CORE_H__
+
+/* to bind threads onto a given cpu */
+#define _GNU_SOURCE
+#include <sched.h>
+
+#include <common/config.h>
+#include <core/jobs.h>
+
+#include <core/perfmodel/perfmodel.h>
+#include <common/fxt.h>
+#include <datawizard/datawizard.h>
+
+#include <starpu.h>
+
+/* CPU worker thread entry point (takes a struct worker_s *). */
+void *core_worker(void *);
+
+/* default upper bound on the number of CPU cores handled */
+#ifndef NMAXCORES
+#define NMAXCORES       4
+#endif
+
+#endif //  __DRIVER_CORE_H__

+ 0 - 0
src/drivers/cuda/Makefile


Some files were not shown because too many files changed in this diff