Browse Source

do not forget src/ and tests/ :]

Cédric Augonnet 16 years ago
parent
commit
41f93730d4
100 changed files with 11487 additions and 0 deletions
  1. 139 0
      src/Makefile.am
  2. 66 0
      src/common/fxt.c
  3. 205 0
      src/common/fxt.h
  4. 45 0
      src/common/hash.c
  5. 24 0
      src/common/hash.h
  6. 101 0
      src/common/htable32.c
  7. 35 0
      src/common/htable32.h
  8. 167 0
      src/common/list.h
  9. 80 0
      src/common/malloc.c
  10. 43 0
      src/common/mutex.c
  11. 151 0
      src/common/rwlock.c
  12. 40 0
      src/common/rwlock.h
  13. 144 0
      src/common/timing.c
  14. 77 0
      src/common/timing.h
  15. 227 0
      src/core/dependencies/data-concurrency.c
  16. 33 0
      src/core/dependencies/data-concurrency.h
  17. 174 0
      src/core/dependencies/htable.c
  18. 42 0
      src/core/dependencies/htable.h
  19. 266 0
      src/core/dependencies/tags.c
  20. 78 0
      src/core/dependencies/tags.h
  21. 184 0
      src/core/jobs.c
  22. 80 0
      src/core/jobs.h
  23. 21 0
      src/core/mechanisms/TODO
  24. 209 0
      src/core/mechanisms/deque_queues.c
  25. 54 0
      src/core/mechanisms/deque_queues.h
  26. 245 0
      src/core/mechanisms/fifo_queues.c
  27. 50 0
      src/core/mechanisms/fifo_queues.h
  28. 121 0
      src/core/mechanisms/priority_queues.c
  29. 43 0
      src/core/mechanisms/priority_queues.h
  30. 80 0
      src/core/mechanisms/queues.c
  31. 72 0
      src/core/mechanisms/queues.h
  32. 228 0
      src/core/mechanisms/stack_queues.c
  33. 55 0
      src/core/mechanisms/stack_queues.h
  34. 137 0
      src/core/perfmodel/perfmodel.c
  35. 95 0
      src/core/perfmodel/perfmodel.h
  36. 513 0
      src/core/perfmodel/perfmodel_history.c
  37. 225 0
      src/core/perfmodel/regression.c
  38. 28 0
      src/core/perfmodel/regression.h
  39. 226 0
      src/core/policies/deque-modeling-policy-data-aware.c
  40. 29 0
      src/core/policies/deque-modeling-policy-data-aware.h
  41. 161 0
      src/core/policies/deque-modeling-policy.c
  42. 29 0
      src/core/policies/deque-modeling-policy.h
  43. 58 0
      src/core/policies/eager-central-policy.c
  44. 27 0
      src/core/policies/eager-central-policy.h
  45. 52 0
      src/core/policies/eager-central-priority-policy.c
  46. 28 0
      src/core/policies/eager-central-priority-policy.h
  47. 57 0
      src/core/policies/no-prio-policy.c
  48. 27 0
      src/core/policies/no-prio-policy.h
  49. 121 0
      src/core/policies/random-policy.c
  50. 29 0
      src/core/policies/random-policy.h
  51. 159 0
      src/core/policies/sched_policy.c
  52. 56 0
      src/core/policies/sched_policy.h
  53. 201 0
      src/core/policies/work-stealing-policy.c
  54. 26 0
      src/core/policies/work-stealing-policy.h
  55. 392 0
      src/core/workers.c
  56. 110 0
      src/core/workers.h
  57. 68 0
      src/datawizard/Makefile
  58. 395 0
      src/datawizard/coherency.c
  59. 160 0
      src/datawizard/coherency.h
  60. 230 0
      src/datawizard/copy-driver.c
  61. 67 0
      src/datawizard/copy-driver.h
  62. 22 0
      src/datawizard/data_parameters.h
  63. 111 0
      src/datawizard/data_request.c
  64. 39 0
      src/datawizard/data_request.h
  65. 128 0
      src/datawizard/datastats.c
  66. 40 0
      src/datawizard/datastats.h
  67. 41 0
      src/datawizard/datawizard.h
  68. 45 0
      src/datawizard/footprint.c
  69. 27 0
      src/datawizard/footprint.h
  70. 327 0
      src/datawizard/hierarchy.c
  71. 28 0
      src/datawizard/hierarchy.h
  72. 29 0
      src/datawizard/interfaces/Makefile
  73. 77 0
      src/datawizard/interfaces/bcsr_filters.c
  74. 22 0
      src/datawizard/interfaces/bcsr_filters.h
  75. 491 0
      src/datawizard/interfaces/bcsr_interface.c
  76. 26 0
      src/datawizard/interfaces/bcsr_interface.h
  77. 113 0
      src/datawizard/interfaces/blas_filters.c
  78. 22 0
      src/datawizard/interfaces/blas_filters.h
  79. 413 0
      src/datawizard/interfaces/blas_interface.c
  80. 24 0
      src/datawizard/interfaces/blas_interface.h
  81. 24 0
      src/datawizard/interfaces/csc_interface.h
  82. 74 0
      src/datawizard/interfaces/csr_filters.c
  83. 22 0
      src/datawizard/interfaces/csr_filters.h
  84. 451 0
      src/datawizard/interfaces/csr_interface.c
  85. 26 0
      src/datawizard/interfaces/csr_interface.h
  86. 57 0
      src/datawizard/interfaces/data_interface.h
  87. 146 0
      src/datawizard/interfaces/vector_filters.c
  88. 22 0
      src/datawizard/interfaces/vector_filters.h
  89. 340 0
      src/datawizard/interfaces/vector_interface.c
  90. 24 0
      src/datawizard/interfaces/vector_interface.h
  91. 537 0
      src/datawizard/memalloc.c
  92. 51 0
      src/datawizard/memalloc.h
  93. 49 0
      src/datawizard/progress.c
  94. 24 0
      src/datawizard/progress.h
  95. 76 0
      src/datawizard/write_back.c
  96. 27 0
      src/datawizard/write_back.h
  97. 31 0
      src/drivers/core/Makefile
  98. 157 0
      src/drivers/core/driver_core.c
  99. 39 0
      src/drivers/core/driver_core.h
  100. 0 0
      src/drivers/cuda/Makefile

+ 139 - 0
src/Makefile.am

@@ -0,0 +1,139 @@
#
# StarPU
# Copyright (C) INRIA 2008-2009 (see AUTHORS file)
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or (at
# your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See the GNU Lesser General Public License in COPYING.LGPL for more details.
#

SUBDIRS =

lib_LTLIBRARIES = libstarpu.la

libstarpu_la_CPPFLAGS = -I$(top_srcdir)/include/

libstarpu_la_CFLAGS = -W -Wall -Wextra
# Extra libraries linked into a libtool library must be given through the
# LIBADD primary; "libstarpu_la_LIBS" is not an automake variable and is
# silently ignored, so -lm was never actually linked.
libstarpu_la_LIBADD = -lm

noinst_HEADERS = 						\
	core/dependencies/data-concurrency.h			\
	core/dependencies/tags.h				\
	core/dependencies/htable.h				\
	core/policies/eager-central-priority-policy.h		\
	core/policies/sched_policy.h				\
	core/policies/random-policy.h				\
	core/policies/eager-central-policy.h			\
	core/policies/deque-modeling-policy.h			\
	core/policies/no-prio-policy.h				\
	core/policies/deque-modeling-policy-data-aware.h	\
	core/policies/work-stealing-policy.h			\
	core/mechanisms/priority_queues.h			\
	core/mechanisms/fifo_queues.h				\
	core/mechanisms/deque_queues.h				\
	core/mechanisms/queues.h				\
	core/mechanisms/stack_queues.h				\
	core/perfmodel/perfmodel.h				\
	core/perfmodel/regression.h				\
	core/jobs.h						\
	core/workers.h						\
	datawizard/footprint.h					\
	datawizard/datawizard.h					\
	datawizard/data_request.h				\
	datawizard/hierarchy.h					\
	datawizard/progress.h					\
	datawizard/write_back.h					\
	datawizard/datastats.h					\
	datawizard/memalloc.h					\
	datawizard/data_parameters.h				\
	datawizard/copy-driver.h				\
	datawizard/coherency.h					\
	datawizard/interfaces/blas_interface.h			\
	datawizard/interfaces/csr_filters.h			\
	datawizard/interfaces/csc_interface.h			\
	datawizard/interfaces/bcsr_filters.h			\
	datawizard/interfaces/bcsr_interface.h			\
	datawizard/interfaces/data_interface.h			\
	datawizard/interfaces/vector_filters.h			\
	datawizard/interfaces/vector_interface.h		\
	datawizard/interfaces/blas_filters.h			\
	datawizard/interfaces/csr_interface.h			\
	common/hash.h						\
	common/timing.h						\
	common/htable32.h					\
	common/list.h						\
	common/rwlock.h						\
	common/fxt.h						\
	drivers/core/driver_core.h				\
	drivers/gordon/driver_gordon.h				\
	drivers/gordon/gordon_interface.h			\
	drivers/cuda/driver_cuda.h				\
	drivers/cuda/comp_cuda.h

# NOTE(review): core/mechanisms/stack_queues.c ships with this tree (and its
# header is installed above) but is not listed here -- confirm whether it
# should be compiled into libstarpu.
libstarpu_la_SOURCES = 						\
	common/malloc.c						\
	common/hash.c 						\
	common/htable32.c					\
	common/mutex.c						\
	common/rwlock.c						\
	common/timing.c						\
	core/jobs.c						\
	core/workers.c						\
	core/dependencies/tags.c				\
	core/dependencies/htable.c				\
	core/dependencies/data-concurrency.c			\
	core/mechanisms/queues.c				\
	core/mechanisms/deque_queues.c				\
	core/mechanisms/priority_queues.c			\
	core/mechanisms/fifo_queues.c				\
	core/perfmodel/perfmodel_history.c			\
	core/perfmodel/perfmodel.c				\
	core/perfmodel/regression.c				\
	core/policies/no-prio-policy.c				\
	core/policies/eager-central-policy.c			\
	core/policies/eager-central-priority-policy.c		\
	core/policies/work-stealing-policy.c			\
	core/policies/sched_policy.c				\
	core/policies/deque-modeling-policy.c			\
	core/policies/random-policy.c				\
	core/policies/deque-modeling-policy-data-aware.c	\
	datawizard/write_back.c					\
	datawizard/coherency.c					\
	datawizard/data_request.c				\
	datawizard/progress.c					\
	datawizard/copy-driver.c				\
	datawizard/hierarchy.c					\
	datawizard/memalloc.c					\
	datawizard/footprint.c					\
	datawizard/datastats.c					\
	datawizard/interfaces/bcsr_interface.c			\
	datawizard/interfaces/csr_interface.c			\
	datawizard/interfaces/blas_filters.c			\
	datawizard/interfaces/blas_interface.c			\
	datawizard/interfaces/vector_interface.c		\
	datawizard/interfaces/bcsr_filters.c			\
	datawizard/interfaces/csr_filters.c			\
	datawizard/interfaces/vector_filters.c

if USE_CPU
libstarpu_la_SOURCES += drivers/core/driver_core.c
endif

if USE_CUDA
libstarpu_la_SOURCES += drivers/cuda/driver_cuda.c
endif

if USE_GORDON
libstarpu_la_SOURCES += drivers/gordon/driver_gordon.c
endif

if USE_FXT
libstarpu_la_SOURCES += common/fxt.c
endif

+ 66 - 0
src/common/fxt.c

@@ -0,0 +1,66 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#include <stdarg.h>
#include <stdio.h>
#include <string.h>

#include <common/fxt.h>
+
+#define PROF_BUFFER_SIZE  (8*1024*1024)
+
+static char PROF_FILE_USER[128];
+static int fxt_started = 0;
+
/* Flush and close the FxT trace buffer, writing it to the file whose name
 * was recorded in PROF_FILE_USER by profile_set_tracefile().
 * Registered via atexit() in start_fxt_profiling(). */
void profile_stop(void)
{
	fut_endup(PROF_FILE_USER);
}
+
+void profile_set_tracefile(char *fmt, ...)
+{
+	va_list vl;
+	
+	va_start(vl, fmt);
+	vsprintf(PROF_FILE_USER, fmt, vl);
+	va_end(vl);
+	strcat(PROF_FILE_USER, "_user_");
+}
+
+
/* Initialize FxT tracing for the calling thread: pick a default trace file
 * location on first use, set up the event buffer, and enable all probe
 * key masks. profile_stop() is registered so the trace is flushed on exit. */
void start_fxt_profiling(void)
{
	unsigned threadid;

	/* Only install the default trace file once, so a location chosen
	 * earlier through profile_set_tracefile() is preserved. */
	if (!fxt_started) {
		fxt_started = 1;
		profile_set_tracefile("/tmp/prof_file");
	}

	/* Linux-specific: kernel thread id of the calling thread. */
	threadid = syscall(SYS_gettid);

	/* NOTE(review): atexit() runs on every invocation, so calling this
	 * from several threads registers profile_stop() multiple times --
	 * confirm whether that (and repeated fut_setup()) is intended. */
	atexit(profile_stop);

	if(fut_setup(PROF_BUFFER_SIZE, FUT_KEYMASKALL, threadid) < 0) {
		perror("fut_setup");
		STARPU_ASSERT(0);
	}

	fut_keychange(FUT_ENABLE, FUT_KEYMASKALL, threadid);

	return;
}
+
/* Emit a "new worker thread" trace event binding the worker's core id to
 * the kernel thread id of the calling thread. */
void fxt_register_thread(unsigned coreid)
{
	FUT_DO_PROBE2(FUT_NEW_LWP_CODE, coreid, syscall(SYS_gettid));
}

+ 205 - 0
src/common/fxt.h

@@ -0,0 +1,205 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __FXT_H__
+#define __FXT_H__
+
+
+#define _GNU_SOURCE  /* ou _BSD_SOURCE ou _SVID_SOURCE */
+#include <unistd.h>
+#include <sys/syscall.h> /* pour les définitions de SYS_xxx */
+
+#include <string.h>
+#include <sys/types.h>
+#include <stdlib.h>
+#include <common/config.h>
+#include <starpu.h>
+
+/* some key to identify the worker kind */
+#define FUT_APPS_KEY	0x100
+#define FUT_CORE_KEY	0x101
+#define FUT_CUDA_KEY	0x102
+
+#define	FUT_NEW_WORKER_KEY	0x5102
+#define	FUT_START_CODELET_BODY	0x5103
+#define	FUT_END_CODELET_BODY	0x5104
+
+#define FUT_JOB_PUSH		0x5105
+#define FUT_JOB_POP		0x5106
+
+#define FUT_START_FETCH_INPUT	0x5107
+#define FUT_END_FETCH_INPUT	0x5108
+#define FUT_START_PUSH_OUTPUT	0x5109
+#define FUT_END_PUSH_OUTPUT	0x5110
+
+#define FUT_CODELET_TAG		0x5111
+#define FUT_CODELET_TAG_DEPS	0x5112
+
+#define FUT_DATA_COPY		0x5113
+#define FUT_WORK_STEALING	0x5114
+
+#define	FUT_WORKER_TERMINATED	0x5115
+
+#define FUT_USER_DEFINED_START	0x5116
+#define FUT_USER_DEFINED_END	0x5117
+
+#define	FUT_NEW_MEM_NODE	0x5118
+
+#define	FUT_START_CALLBACK	0x5119
+#define	FUT_END_CALLBACK	0x5120
+
+#define	FUT_TASK_DONE		0x5121
+
+#define	FUT_START_ALLOC		0x5122
+#define	FUT_END_ALLOC		0x5123
+
+#define	FUT_START_ALLOC_REUSE	0x5128
+#define	FUT_END_ALLOC_REUSE	0x5129
+
+#define	FUT_START_MEMRECLAIM	0x5124
+#define	FUT_END_MEMRECLAIM	0x5125
+
+#define	FUT_START_DRIVER_COPY	0x5126
+#define	FUT_END_DRIVER_COPY	0x5127
+
+
+#ifdef USE_FXT
+#include <fxt/fxt.h>
+#include <fxt/fut.h>
+
+void start_fxt_profiling(void);
+void fxt_register_thread(unsigned);
+
+/* workerkind = FUT_CORE_KEY for instance */
+#define TRACE_NEW_MEM_NODE(nodeid)	\
+	FUT_DO_PROBE2(FUT_NEW_MEM_NODE, nodeid, syscall(SYS_gettid));
+
+#define TRACE_NEW_WORKER(workerkind,memnode)	\
+	FUT_DO_PROBE3(FUT_NEW_WORKER_KEY, workerkind, memnode, syscall(SYS_gettid));
+
+#define TRACE_START_CODELET_BODY(job)	\
+	FUT_DO_PROBE2(FUT_START_CODELET_BODY, job, syscall(SYS_gettid));
+
+#define TRACE_END_CODELET_BODY(job)	\
+	FUT_DO_PROBE2(FUT_END_CODELET_BODY, job, syscall(SYS_gettid));
+
+#define TRACE_START_CALLBACK(job)	\
+	FUT_DO_PROBE2(FUT_START_CALLBACK, job, syscall(SYS_gettid));
+
+#define TRACE_END_CALLBACK(job)	\
+	FUT_DO_PROBE2(FUT_END_CALLBACK, job, syscall(SYS_gettid));
+
+#define TRACE_JOB_PUSH(task, prio)	\
+	FUT_DO_PROBE3(FUT_JOB_PUSH, task, prio, syscall(SYS_gettid));
+
+#define TRACE_JOB_POP(task, prio)	\
+	FUT_DO_PROBE3(FUT_JOB_POP, task, prio, syscall(SYS_gettid));
+
+#define TRACE_START_FETCH_INPUT(job)	\
+	FUT_DO_PROBE2(FUT_START_FETCH_INPUT, job, syscall(SYS_gettid));
+
+#define TRACE_END_FETCH_INPUT(job)	\
+	FUT_DO_PROBE2(FUT_END_FETCH_INPUT, job, syscall(SYS_gettid));
+
+#define TRACE_START_PUSH_OUTPUT(job)	\
+	FUT_DO_PROBE2(FUT_START_PUSH_OUTPUT, job, syscall(SYS_gettid));
+
+#define TRACE_END_PUSH_OUTPUT(job)	\
+	FUT_DO_PROBE2(FUT_END_PUSH_OUTPUT, job, syscall(SYS_gettid));
+
+#define TRACE_CODELET_TAG(tag, job)	\
+	FUT_DO_PROBE2(FUT_CODELET_TAG, tag, job)
+
+#define TRACE_CODELET_TAG_DEPS(tag_child, tag_father)	\
+	FUT_DO_PROBE2(FUT_CODELET_TAG_DEPS, tag_child, tag_father)
+
+#define TRACE_TASK_DONE(tag)	\
+	FUT_DO_PROBE2(FUT_TASK_DONE, tag, syscall(SYS_gettid))
+
+#define TRACE_DATA_COPY(src_node, dst_node, size)	\
+	FUT_DO_PROBE3(FUT_DATA_COPY, src_node, dst_node, size)
+
+#define TRACE_START_DRIVER_COPY(src_node, dst_node, size, com_id)	\
+	FUT_DO_PROBE4(FUT_START_DRIVER_COPY, src_node, dst_node, size, com_id)
+
+#define TRACE_END_DRIVER_COPY(src_node, dst_node, size, com_id)	\
+	FUT_DO_PROBE4(FUT_END_DRIVER_COPY, src_node, dst_node, size, com_id)
+
+#define TRACE_WORK_STEALING(empty_q, victim_q)		\
+	FUT_DO_PROBE2(FUT_WORK_STEALING, empty_q, victim_q)
+
+#define TRACE_WORKER_TERMINATED(workerkind)	\
+	FUT_DO_PROBE2(FUT_WORKER_TERMINATED, workerkind, syscall(SYS_gettid));
+
+#define TRACE_USER_DEFINED_START	\
+	FUT_DO_PROBE1(FUT_USER_DEFINED_START, syscall(SYS_gettid));
+
+#define TRACE_USER_DEFINED_END		\
+	FUT_DO_PROBE1(FUT_USER_DEFINED_END, syscall(SYS_gettid));
+
+#define TRACE_START_ALLOC(memnode)		\
+	FUT_DO_PROBE2(FUT_START_ALLOC, memnode, syscall(SYS_gettid));
+	
+#define TRACE_END_ALLOC(memnode)		\
+	FUT_DO_PROBE2(FUT_END_ALLOC, memnode, syscall(SYS_gettid));
+
+#define TRACE_START_ALLOC_REUSE(memnode)		\
+	FUT_DO_PROBE2(FUT_START_ALLOC_REUSE, memnode, syscall(SYS_gettid));
+	
+#define TRACE_END_ALLOC_REUSE(memnode)		\
+	FUT_DO_PROBE2(FUT_END_ALLOC_REUSE, memnode, syscall(SYS_gettid));
+	
+#define TRACE_START_MEMRECLAIM(memnode)		\
+	FUT_DO_PROBE2(FUT_START_MEMRECLAIM, memnode, syscall(SYS_gettid));
+	
+#define TRACE_END_MEMRECLAIM(memnode)		\
+	FUT_DO_PROBE2(FUT_END_MEMRECLAIM, memnode, syscall(SYS_gettid));
+	
+
+#else // !USE_FXT
+
+#define TRACE_NEW_MEM_NODE(nodeid)	do {} while(0);
+#define TRACE_NEW_WORKER(a,b)		do {} while(0);
+#define TRACE_START_CODELET_BODY(job)	do {} while(0);
+#define TRACE_END_CODELET_BODY(job)	do {} while(0);
+#define TRACE_START_CALLBACK(job)	do {} while(0);
+#define TRACE_END_CALLBACK(job)		do {} while(0);
+#define TRACE_JOB_PUSH(task, prio)	do {} while(0);
+#define TRACE_JOB_POP(task, prio)	do {} while(0);
+#define TRACE_START_FETCH_INPUT(job)	do {} while(0);
+#define TRACE_END_FETCH_INPUT(job)	do {} while(0);
+#define TRACE_START_PUSH_OUTPUT(job)	do {} while(0);
+#define TRACE_END_PUSH_OUTPUT(job)	do {} while(0);
+#define TRACE_CODELET_TAG(tag, job)	do {} while(0);
+#define TRACE_CODELET_TAG_DEPS(a, b)	do {} while(0);
+#define TRACE_TASK_DONE(tag)		do {} while(0);
+#define TRACE_DATA_COPY(a, b, c)	do {} while(0);
+#define TRACE_START_DRIVER_COPY(a,b,c,d)	do {} while(0);
+#define TRACE_END_DRIVER_COPY(a,b,c,d)	do {} while(0);
+#define TRACE_WORK_STEALING(a, b)	do {} while(0);
+#define TRACE_WORKER_TERMINATED(a)	do {} while(0);
+#define TRACE_USER_DEFINED_START	do {} while(0);
+#define TRACE_USER_DEFINED_END		do {} while(0);
+#define TRACE_START_ALLOC(memnode)	do {} while(0);
+#define TRACE_END_ALLOC(memnode)	do {} while(0);
+#define TRACE_START_ALLOC_REUSE(a)	do {} while(0);
+#define TRACE_END_ALLOC_REUSE(a)	do {} while(0);
+#define TRACE_START_MEMRECLAIM(memnode)	do {} while(0);
+#define TRACE_END_MEMRECLAIM(memnode)	do {} while(0);
+
+#endif // USE_FXT
+
+#endif // __FXT_H__

+ 45 - 0
src/common/hash.c

@@ -0,0 +1,45 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <common/hash.h>
+
#define CRC32C_POLY_BE 0x1EDC6F41

/* Fold one byte into the running CRC, MSB-first (big-endian bit order),
 * using the CRC-32C (Castagnoli) polynomial. */
static inline uint32_t __attribute__ ((pure)) crc32_be_8(uint8_t inputbyte, uint32_t inputcrc)
{
	unsigned i;
	uint32_t crc;

	/* Cast before shifting: a plain (inputbyte << 24) promotes to a
	 * signed int and may shift into the sign bit for bytes >= 0x80,
	 * which is undefined behavior for signed left shifts. */
	crc = inputcrc ^ ((uint32_t)inputbyte << 24);
	for (i = 0; i < 8; i++)
		crc = (crc << 1) ^ ((crc & 0x80000000) ? CRC32C_POLY_BE : 0);

	return crc;
}

/* Fold the 4 bytes of 'input' into 'inputcrc' and return the updated CRC.
 * Bytes are consumed in host memory order, so the value is
 * endianness-dependent; this is fine for in-process hashing but not for
 * data exchanged between machines. */
uint32_t crc32_be(uint32_t input, uint32_t inputcrc)
{
	uint8_t *p = (uint8_t *)&input;

	uint32_t crc = inputcrc;

	crc = crc32_be_8(p[0], crc);
	crc = crc32_be_8(p[1], crc);
	crc = crc32_be_8(p[2], crc);
	crc = crc32_be_8(p[3], crc);

	return crc;
}

+ 24 - 0
src/common/hash.h

@@ -0,0 +1,24 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#ifndef __HASH_H__
#define __HASH_H__

#include <stdint.h>

/* Fold the 4 bytes of 'input' into the running CRC value 'inputcrc'
 * (CRC-32C polynomial 0x1EDC6F41, MSB-first) and return the updated CRC.
 * Bytes are taken in host memory order, so the result depends on the
 * machine's endianness -- suitable for in-process hashing only. */
uint32_t crc32_be(uint32_t input, uint32_t inputcrc);

#endif // __HASH_H__

+ 101 - 0
src/common/htable32.c

@@ -0,0 +1,101 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <common/htable32.h>
+#include <stdint.h>
+#include <string.h>
+
+void *htbl_search_32(struct starpu_htbl32_node_s *htbl, uint32_t key)
+{
+	unsigned currentbit;
+	unsigned keysize = 32;
+
+	htbl32_node_t *current_htbl = htbl;
+
+	/* 000000000001111 with HTBL_NODE_SIZE 1's */
+	uint32_t mask = (1<<HTBL32_NODE_SIZE)-1;
+
+	for(currentbit = 0; currentbit < keysize; currentbit+=HTBL32_NODE_SIZE)
+	{
+	
+	//	printf("search : current bit = %d \n", currentbit);
+		if (current_htbl == NULL)
+			return NULL;
+
+		/* 0000000000001111 
+		 *     | currentbit
+		 * 0000111100000000 = offloaded_mask
+		 *         |last_currentbit
+		 * */
+
+		unsigned last_currentbit = 
+			keysize - (currentbit + HTBL32_NODE_SIZE);
+		uint32_t offloaded_mask = mask << last_currentbit;
+		unsigned current_index = 
+			(key & (offloaded_mask)) >> (last_currentbit);
+
+		current_htbl = current_htbl->children[current_index];
+	}
+
+	return current_htbl;
+}
+
+/*
+ * returns the previous value of the tag, or NULL else
+ */
+
+void *htbl_insert_32(struct starpu_htbl32_node_s **htbl, uint32_t key, void *entry)
+{
+	unsigned currentbit;
+	unsigned keysize = 32;
+
+	htbl32_node_t **current_htbl_ptr = htbl;
+
+	/* 000000000001111 with HTBL_NODE_SIZE 1's */
+	uint32_t mask = (1<<HTBL32_NODE_SIZE)-1;
+
+	for(currentbit = 0; currentbit < keysize; currentbit+=HTBL32_NODE_SIZE)
+	{
+		//printf("insert : current bit = %d \n", currentbit);
+		if (*current_htbl_ptr == NULL) {
+			/* TODO pad to change that 1 into 16 ? */
+			*current_htbl_ptr = calloc(sizeof(htbl32_node_t), 1);
+			assert(*current_htbl_ptr);
+		}
+
+		/* 0000000000001111 
+		 *     | currentbit
+		 * 0000111100000000 = offloaded_mask
+		 *         |last_currentbit
+		 * */
+
+		unsigned last_currentbit = 
+			keysize - (currentbit + HTBL32_NODE_SIZE);
+		uint32_t offloaded_mask = mask << last_currentbit;
+		unsigned current_index = 
+			(key & (offloaded_mask)) >> (last_currentbit);
+
+		current_htbl_ptr = 
+			&((*current_htbl_ptr)->children[current_index]);
+	}
+
+	/* current_htbl either contains NULL or a previous entry 
+	 * we overwrite it anyway */
+	void *old_entry = *current_htbl_ptr;
+	*current_htbl_ptr = entry;
+
+	return old_entry;
+}

+ 35 - 0
src/common/htable32.h

@@ -0,0 +1,35 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#ifndef __GENERIC_HTABLE_H__
#define __GENERIC_HTABLE_H__

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

/* Number of key bits consumed per tree level: a 32-bit key is resolved in
 * 32/HTBL32_NODE_SIZE = 2 levels. */
#define HTBL32_NODE_SIZE	16

/* Node of a radix tree indexed by 32-bit keys. Each node holds
 * 2^HTBL32_NODE_SIZE = 65536 child pointers (512 KiB per node on a 64-bit
 * machine), trading memory for constant-depth lookups. At the last level
 * the "child" slots store the user entries themselves. */
typedef struct starpu_htbl32_node_s {
	unsigned nentries;	/* NOTE(review): never updated in htable32.c -- confirm whether it is maintained elsewhere */
	struct starpu_htbl32_node_s *children[1<<HTBL32_NODE_SIZE];
} htbl32_node_t;

/* Return the entry stored under 'key', or NULL if none. */
void *htbl_search_32(struct starpu_htbl32_node_s *htbl, uint32_t key);
/* Store 'entry' under 'key' (allocating intermediate nodes as needed);
 * return the previous entry, or NULL if there was none. */
void *htbl_insert_32(struct starpu_htbl32_node_s **htbl, uint32_t key, void *entry);

#endif // __GENERIC_HTABLE_H__

+ 167 - 0
src/common/list.h

@@ -0,0 +1,167 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/** @file
+ * @brief Listes doublement chainées automatiques
+ */
+
+
+/** @remarks list how-to
+ * *********************************************************
+ * LIST_TYPE(FOO, contenu);
+ *  - déclare les types suivants
+ *      + pour les cellules : FOO_t
+ *      + pour les listes : FOO_list_t
+ *      + pour les itérateurs : FOO_itor_t
+ *  - déclare les accesseurs suivants :
+ *     * création d'une cellule 
+ *   FOO_t      FOO_new(void);  
+ *     * suppression d'une cellule
+ *   void       FOO_delete(FOO_t); 
+ *     * création d'une liste (vide)
+ *   FOO_list_t FOO_list_new(void);
+ *     * suppression d'une liste
+ *   void       FOO_list_delete(FOO_list_t);
+ *     * teste si une liste est vide
+ *   int        FOO_list_empty(FOO_list_t);
+ *     * retire un élément de la liste
+ *   void       FOO_list_erase(FOO_list_t, FOO_t);
+ *     * ajoute une élément en queue de liste
+ *   void       FOO_list_push_back(FOO_list_t, FOO_t);
+ *     * ajoute un élément en tête de list
+ *   void       FOO_list_push_front(FOO_list_t, FOO_t);
+ *     * retire l'élément en queue de liste
+ *   FOO_t      FOO_list_pop_back(FOO_list_t);
+ *     * retire l'élement en tête de liste
+ *   FOO_t      FOO_list_pop_front(FOO_list_t);
+ *     * retourne l'élément en queue de liste
+ *   FOO_t      FOO_list_back(FOO_list_t);
+ *     * retourne l'élement en tête de liste
+ *   FOO_t      FOO_list_front(FOO_list_t);
+ *     * vérifie si la liste chainée est cohérente
+ *   int	FOO_list_check(FOO_list_t);
+ * *********************************************************
+ * Exemples d'utilisation :
+ *  - au départ, on a :
+ *    struct ma_structure_s
+ *    {
+ *      int a;
+ *      int b;
+ *    };
+ *  - on veut en faire une liste. On remplace la déclaration par :
+ *    LIST_TYPE(ma_structure,
+ *      int a;
+ *      int b;
+ *    );
+ *    qui crée les types ma_structure_t et ma_structure_list_t.
+ *  - allocation d'une liste vide :
+ *  ma_structure_list_t l = ma_structure_list_new();
+ *  - ajouter un élément 'e' en tête de la liste 'l' :
+ *  ma_structure_t e = ma_structure_new();
+ *  e->a = 0;
+ *  e->b = 1;
+ *  ma_structure_list_push_front(l, e);
+ *  - itérateur de liste :
+ *  ma_structure_itor_t i;
+ *  for(i  = ma_structure_list_begin(l);
+ *      i != ma_structure_list_end(l);
+ *      i  = ma_structure_list_next(i))
+ *  {
+ *    printf("a=%d; b=%d\n", i->a, i->b);
+ *  }
+ * *********************************************************
+ */
+
+
+
/**@hideinitializer
 * Generates a new type for lists of elements: declares the cell type
 * ENAME##_t, the list type ENAME##_list_t and the iterator type
 * ENAME##_itor_t, plus inline accessors (new/delete, push/pop front/back,
 * erase, begin/end/next, empty/size/check).
 * NOTE(review): the operations perform no emptiness checks -- popping or
 * erasing from an empty list dereferences NULL. */
#define LIST_TYPE(ENAME, DECL) \
  LIST_DECLARE_TYPE(ENAME) \
  LIST_CREATE_TYPE(ENAME, DECL)

/**@hideinitializer
 * Forward type declarations for lists: the cell, list and iterator types
 * are all pointers to incomplete structs at this point. */
#define LIST_DECLARE_TYPE(ENAME) \
  /** automatic type: ENAME##_list_t is a list of ENAME##_t */ \
  typedef struct ENAME##_list_s* ENAME##_list_t; \
  /** automatic type: defines ENAME##_t */ \
  typedef struct ENAME##_s* ENAME##_t; \
  /** automatic type: ENAME##_itor_t is an iterator on lists of ENAME##_t */ \
  typedef ENAME##_t ENAME##_itor_t;

/**@hideinitializer
 * The effective type declaration for lists: each cell embeds _prev/_next
 * links ahead of the user fields DECL; the list head keeps _head/_tail
 * pointers, with NULL marking both list ends. */
#define LIST_CREATE_TYPE(ENAME, DECL) \
  /** from automatic type: ENAME##_t */ \
  struct ENAME##_s \
  { \
    struct ENAME##_s*_prev; /**< @internal previous cell */ \
    struct ENAME##_s*_next; /**< @internal next cell */ \
    DECL \
  }; \
  /** @internal */ \
  struct ENAME##_list_s \
  { \
    struct ENAME##_s* _head; /**< @internal head of the list */ \
    struct ENAME##_s* _tail; /**< @internal tail of the list */ \
  }; \
  /** @internal */static inline ENAME##_t ENAME##_new(void) \
    { ENAME##_t e = (ENAME##_t)malloc(sizeof(struct ENAME##_s)); \
      e->_next = NULL; e->_prev = NULL; return e; } \
  /** @internal */static inline void ENAME##_delete(ENAME##_t e) \
    { free(e); } \
  /** @internal */static inline void ENAME##_list_push_front(ENAME##_list_t l, ENAME##_t e) \
    { if(l->_tail == NULL) l->_tail = e; else l->_head->_prev = e; \
      e->_prev = NULL; e->_next = l->_head; l->_head = e; } \
  /** @internal */static inline void ENAME##_list_push_back(ENAME##_list_t l, ENAME##_t e) \
    { if(l->_head == NULL) l->_head = e; else l->_tail->_next = e; \
      e->_next = NULL; e->_prev = l->_tail; l->_tail = e; } \
  /** @internal */static inline ENAME##_t ENAME##_list_front(ENAME##_list_t l) \
    { return l->_head; } \
  /** @internal */static inline ENAME##_t ENAME##_list_back(ENAME##_list_t l) \
    { return l->_tail; } \
  /** @internal */static inline ENAME##_list_t ENAME##_list_new(void) \
    { ENAME##_list_t l; l=(ENAME##_list_t)malloc(sizeof(struct ENAME##_list_s)); \
      l->_head=NULL; l->_tail=l->_head; return l; } \
  /** @internal */static inline int ENAME##_list_empty(ENAME##_list_t l) \
    { return (l->_head == NULL); } \
  /** @internal */static inline void ENAME##_list_delete(ENAME##_list_t l) \
    { free(l); } \
  /** @internal */static inline void ENAME##_list_erase(ENAME##_list_t l, ENAME##_t c) \
    { ENAME##_t p = c->_prev; if(p) p->_next = c->_next; else l->_head = c->_next; \
      if(c->_next) c->_next->_prev = p; else l->_tail = p; } \
  /** @internal */static inline ENAME##_t ENAME##_list_pop_front(ENAME##_list_t l) \
    { ENAME##_t e = ENAME##_list_front(l); \
      ENAME##_list_erase(l, e); return e; } \
  /** @internal */static inline ENAME##_t ENAME##_list_pop_back(ENAME##_list_t l) \
    { ENAME##_t e = ENAME##_list_back(l); \
      ENAME##_list_erase(l, e); return e; } \
  /** @internal */static inline ENAME##_itor_t ENAME##_list_begin(ENAME##_list_t l) \
    { return l->_head; } \
  /** @internal */static inline ENAME##_itor_t ENAME##_list_end(ENAME##_list_t l __attribute__ ((unused))) \
    { return NULL; } \
  /** @internal */static inline ENAME##_itor_t ENAME##_list_next(ENAME##_itor_t i) \
    { return i->_next; } \
  /** @internal */static inline int ENAME##_list_size(ENAME##_list_t l) \
    { ENAME##_itor_t i=l->_head; int k=0; while(i!=NULL){k++;i=i->_next;} return k; } \
  /** @internal */static inline int ENAME##_list_check(ENAME##_list_t l) \
    { ENAME##_itor_t i=l->_head; while(i) \
    { if ((i->_next == NULL) && i != l->_tail) return 0; \
      if (i->_next == i) return 0; \
      i=i->_next;} return 1; }
+
+

+ 80 - 0
src/common/malloc.c

@@ -0,0 +1,80 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <errno.h>
+
+#include <core/workers.h>
+#include <common/config.h>
+#include <starpu.h>
+
+#ifdef USE_CUDA
+#include <cuda.h>
+#endif
+
+/* This method is not optimal at all, but it makes life much easier in many codes */
+
+#ifdef USE_CUDA
+struct data_interface_s;
+
+struct malloc_pinned_codelet_struct {
+	float **ptr;
+	size_t dim;
+};
+
+static void malloc_pinned_codelet(struct data_interface_s *buffers __attribute__((unused)), void *arg)
+{
+	struct malloc_pinned_codelet_struct *s = arg;
+
+	cuMemAllocHost((void **)(s->ptr), s->dim);
+}
+#endif
+
/* Allocate "dim" bytes and store the pointer through A.  When CUDA
 * support is compiled in and a CUDA worker may run tasks, the buffer is
 * allocated as pinned host memory (through a synchronous CUBLAS task);
 * otherwise a plain malloc() is used.
 *
 * Fix: the previous version tested may_submit_cuda_task() outside the
 * #ifdef USE_CUDA guard, so when built without CUDA support the taken
 * branch was empty and *A was left uninitialized.  The fallback now
 * runs whenever the CUDA path is unavailable. */
void starpu_malloc_pinned_if_possible(float **A, size_t dim)
{
#ifdef USE_CUDA
	if (may_submit_cuda_task())
	{
		int push_res;

		struct malloc_pinned_codelet_struct s = {
			.ptr = A,
			.dim = dim
		};

		starpu_codelet *cl = malloc(sizeof(*cl));
		STARPU_ASSERT(cl);
		cl->cublas_func = malloc_pinned_codelet;
		cl->where = CUBLAS;
		cl->model = NULL;
		cl->nbuffers = 0;

		struct starpu_task *task = starpu_task_create();
		task->callback_func = NULL;
		task->cl = cl;
		task->cl_arg = &s;

		/* wait for the allocation to be performed */
		task->synchronous = 1;

		push_res = starpu_submit_task(task);
		STARPU_ASSERT(push_res != -ENODEV);

		free(cl);
		free(task);

		return;
	}
#endif
	/* no CUDA worker (or CUDA support not compiled in): regular malloc */
	*A = malloc(dim);
}

+ 43 - 0
src/common/mutex.c

@@ -0,0 +1,43 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu-mutex.h>
+
+void init_mutex(starpu_mutex *m)
+{
+	/* this is free at first */
+	m->taken = 0;
+}
+
+inline int take_mutex_try(starpu_mutex *m)
+{
+	uint32_t prev;
+	prev = __sync_lock_test_and_set(&m->taken, 1);
+	return (prev == 0)?0:-1;
+}
+
+inline void take_mutex(starpu_mutex *m)
+{
+	uint32_t prev;
+	do {
+		prev = __sync_lock_test_and_set(&m->taken, 1);
+	} while (prev);
+}
+
+inline void release_mutex(starpu_mutex *m)
+{
+	m->taken = 0;
+}

+ 151 - 0
src/common/rwlock.c

@@ -0,0 +1,151 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/**
+ * A dummy implementation of a rw_lock using spinlocks ...
+ */ 
+
+#include "rwlock.h"
+
+static void _take_busy_lock(rw_lock *lock)
+{
+	uint32_t prev;
+	do {
+		prev = __sync_lock_test_and_set(&lock->busy, 1);
+	} while (prev);
+}
+
+static void _release_busy_lock(rw_lock *lock)
+{
+	lock->busy = 0;
+}
+
+void init_rw_lock(rw_lock *lock)
+{
+	STARPU_ASSERT(lock);
+
+	lock->writer = 0;
+	lock->readercnt = 0;
+	lock->busy = 0;
+}
+
+
+int take_rw_lock_write_try(rw_lock *lock)
+{
+	_take_busy_lock(lock);
+	
+	if (lock->readercnt > 0 || lock->writer)
+	{
+		/* fail to take the lock */
+		_release_busy_lock(lock);
+		return -1;
+	}
+	else {
+		STARPU_ASSERT(lock->readercnt == 0);
+		STARPU_ASSERT(lock->writer == 0);
+
+		/* no one was either writing nor reading */
+		lock->writer = 1;
+		_release_busy_lock(lock);
+		return 0;
+	}
+}
+
+int take_rw_lock_read_try(rw_lock *lock)
+{
+	_take_busy_lock(lock);
+
+	if (lock->writer)
+	{
+		/* there is a writer ... */
+		_release_busy_lock(lock);
+		return -1;
+	}
+	else {
+		STARPU_ASSERT(lock->writer == 0);
+
+		/* no one is writing */
+		/* XXX check wrap arounds ... */
+		lock->readercnt++;
+		_release_busy_lock(lock);
+
+		return 0;
+	}
+}
+
+
+
+void take_rw_lock_write(rw_lock *lock)
+{
+	do {
+		_take_busy_lock(lock);
+		
+		if (lock->readercnt > 0 || lock->writer)
+		{
+			/* fail to take the lock */
+			_release_busy_lock(lock);
+		}
+		else {
+			STARPU_ASSERT(lock->readercnt == 0);
+			STARPU_ASSERT(lock->writer == 0);
+	
+			/* no one was either writing nor reading */
+			lock->writer = 1;
+			_release_busy_lock(lock);
+			return;
+		}
+	} while (1);
+}
+
+void take_rw_lock_read(rw_lock *lock)
+{
+	do {
+		_take_busy_lock(lock);
+
+		if (lock->writer)
+		{
+			/* there is a writer ... */
+			_release_busy_lock(lock);
+		}
+		else {
+			STARPU_ASSERT(lock->writer == 0);
+
+			/* no one is writing */
+			/* XXX check wrap arounds ... */
+			lock->readercnt++;
+			_release_busy_lock(lock);
+
+			return;
+		}
+	} while (1);
+}
+
+void release_rw_lock(rw_lock *lock)
+{
+	_take_busy_lock(lock);
+	/* either writer or reader (exactly one !) */
+	if (lock->writer) 
+	{
+		STARPU_ASSERT(lock->readercnt == 0);
+		lock->writer = 0;
+	}
+	else {
+		/* reading mode */
+		STARPU_ASSERT(lock->writer == 0);
+		lock->readercnt--;
+	}
+	_release_busy_lock(lock);
+}

+ 40 - 0
src/common/rwlock.h

@@ -0,0 +1,40 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __RWLOCKS_H__
+#define __RWLOCKS_H__
+
+#include <stdint.h>
+#include <starpu.h>
+
/* A spinlock-based reader/writer lock: the "busy" flag serializes all
 * updates to the two fields below and is only held for a handful of
 * instructions. */
typedef struct rw_lock_t {
	uint32_t busy;      /* internal spinlock protecting this struct */
	uint8_t writer;     /* 1 while a writer holds the lock */
	uint16_t readercnt; /* number of readers currently inside
	                     * (NOTE(review): only 16 bits and the .c file
	                     * says "XXX check wrap arounds" — confirm the
	                     * reader count can never exceed 65535) */
} rw_lock;

/* Initialize the lock to the free state; must be called before use. */
void init_rw_lock(rw_lock *lock);
/* Blocking acquisitions: spin until the lock is obtained. */
void take_rw_lock_write(rw_lock *lock);
void take_rw_lock_read(rw_lock *lock);
/* Non-blocking attempts: return 0 on success, -1 on failure. */
int take_rw_lock_write_try(rw_lock *lock);
int take_rw_lock_read_try(rw_lock *lock);
/* Release the lock, whether held in read or write mode. */
void release_rw_lock(rw_lock *lock);
+
+///* make sure to have the lock before using that function */
+//inline uint8_t rw_lock_is_writer(rw_lock *lock);
+//unsigned is_rw_lock_referenced(rw_lock *lock);
+
+#endif

+ 144 - 0
src/common/timing.c

@@ -0,0 +1,144 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "timing.h"
+
+#ifdef UNRELIABLETICKS
+
/* Difference between two gettimeofday()-based ticks, in microseconds.
 * Fix: the previous expression was mis-parenthesized as
 *   (t2_usecs) + - ((t1).tv.tv_sec*1e6) + (t1).tv.tv_usec
 * i.e. it ADDED t1's microsecond part instead of subtracting it. */
#define TICK_RAW_DIFF(t1, t2) (((t2).tv.tv_sec*1e6 + (t2).tv.tv_usec) \
				- ((t1).tv.tv_sec*1e6 + (t1).tv.tv_usec))
/* Same difference, minus the measurement overhead found at calibration. */
#define TICK_DIFF(t1, t2) (TICK_RAW_DIFF(t1, t2) - residual)
#define TIMING_DELAY(t1, t2) tick2usec(TICK_DIFF(t1, t2))

static double scale = 0.0;              /* tick -> usec factor, set by timing_init() */
static unsigned long long residual = 0; /* overhead of one measurement, in ticks */

static int inited = 0;                  /* non-zero once calibration is done */
+
/* One-time calibration (idempotent).  First measure the smallest
 * observable difference between two back-to-back ticks — that is the
 * overhead of the measurement itself ("residual") — then time a 500 ms
 * usleep() with both GET_TICK and gettimeofday() to derive the
 * tick -> microsecond "scale" factor. */
void timing_init(void)
{
  static tick_t t1, t2;
  int i;

  /* already calibrated: nothing to do */
  if (inited) return;

  residual = (unsigned long long)1 << 63;
  
  /* keep the minimum over several samples */
  for(i = 0; i < 20; i++)
    {
      GET_TICK(t1);
      GET_TICK(t2);
      residual = STARPU_MIN(residual, TICK_RAW_DIFF(t1, t2));
    }
  
  {
    struct timeval tv1,tv2;
    
    /* time a known ~500 ms interval with both clocks
     * (NOTE(review): usleep may be cut short by a signal — confirm
     * that is acceptable for calibration) */
    GET_TICK(t1);
    gettimeofday(&tv1,0);
    usleep(500000);
    GET_TICK(t2);
    gettimeofday(&tv2,0);
    scale = ((tv2.tv_sec*1e6 + tv2.tv_usec) -
	     (tv1.tv_sec*1e6 + tv1.tv_usec)) / 
      (double)(TICK_DIFF(t1, t2));
  }

  inited = 1;
}
+
/* Convert a raw tick count to microseconds using the scale factor
 * computed by timing_init(). */
inline double tick2usec(long long t)
{
  return (double)(t)*scale;
}
+
/* Elapsed microseconds between two recorded ticks (the calibration
 * residual is subtracted by TIMING_DELAY). */
inline double timing_delay(tick_t *t1, tick_t *t2)
{
	return TIMING_DELAY(*t1, *t2);
}
+
+inline double timing_now(void)
+{
+	tick_t tick_now;
+	GET_TICK(tick_now);
+
+	return tick2usec(scale*((tick_now).tv.tv_sec*1e6) + (tick_now).tv.tv_usec);
+}
+
+
+
+#else // UNRELIABLETICKS
+
/* Raw difference between two cycle-counter values. */
#define TICK_RAW_DIFF(t1, t2) ((t2).tick - (t1).tick)
/* Same, minus the measurement overhead found at calibration. */
#define TICK_DIFF(t1, t2) (TICK_RAW_DIFF(t1, t2) - residual)
/* Elapsed microseconds between two ticks. */
#define TIMING_DELAY(t1, t2) tick2usec(TICK_DIFF(t1, t2))

static double scale = 0.0;              /* tick -> usec factor, set by timing_init() */
static unsigned long long residual = 0; /* overhead of one measurement, in ticks */

static int inited = 0;                  /* non-zero once calibration is done */
+
/* One-time calibration of the cycle-counter clock (idempotent):
 * measure the overhead of two consecutive GET_TICK calls ("residual"),
 * then derive the tick -> microsecond "scale" factor by timing a
 * 500 ms usleep() against gettimeofday(). */
void timing_init(void)
{
  static tick_t t1, t2;
  int i;

  /* calibration already performed */
  if (inited) return;

  residual = (unsigned long long)1 << 63;
  
  /* keep the smallest observed back-to-back tick difference */
  for(i = 0; i < 20; i++)
    {
      GET_TICK(t1);
      GET_TICK(t2);
      residual = STARPU_MIN(residual, TICK_RAW_DIFF(t1, t2));
    }
  
  {
    struct timeval tv1,tv2;
    
    GET_TICK(t1);
    gettimeofday(&tv1,0);
    usleep(500000);
    GET_TICK(t2);
    gettimeofday(&tv2,0);
    /* microseconds elapsed (gettimeofday) per elapsed tick */
    scale = ((tv2.tv_sec*1e6 + tv2.tv_usec) -
	     (tv1.tv_sec*1e6 + tv1.tv_usec)) / 
      (double)(TICK_DIFF(t1, t2));
  }

  inited = 1;
}
+
/* Convert a raw cycle count to microseconds using the scale factor
 * computed by timing_init(). */
inline double tick2usec(long long t)
{
  return (double)(t)*scale;
}
+
/* Elapsed microseconds between two recorded ticks (the calibration
 * residual is subtracted by TIMING_DELAY). */
inline double timing_delay(tick_t *t1, tick_t *t2)
{
	return TIMING_DELAY(*t1, *t2);
}
+
+inline double timing_now(void)
+{
+	tick_t tick_now;
+	GET_TICK(tick_now);
+
+	return tick2usec(tick_now.tick);
+}
+
+#endif // UNRELIABLETICKS

+ 77 - 0
src/common/timing.h

@@ -0,0 +1,77 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef TIMING_H
+#define TIMING_H
+
+/*
+ * -- Initialiser la bibliothèque avec timing_init();
+ * -- Mémoriser un timestamp :
+ *  tick_t t;
+ *  GET_TICK(t);
+ * -- Calculer un intervalle en microsecondes :
+ *  TIMING_DELAY(t1, t2);
+ */
+
+#include <sys/time.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <starpu.h>
+
+#ifdef UNRELIABLETICKS
+
+/* we use the usual gettimeofday method */
/* Fallback tick: a plain gettimeofday() timestamp. */
typedef struct tick_s
{
	struct timeval tv;
} tick_t;

/* Record the current time into tick t. */
#define GET_TICK(t) gettimeofday(&((t).tv), NULL)
+
+#else // !UNRELIABLETICKS
+
/* A 64-bit cycle-counter value, also addressable as the two 32-bit
 * halves in which rdtsc delivers it. */
typedef union u_tick
{
  uint64_t tick;

  struct
  {
    uint32_t low;
    uint32_t high;
  }
  sub;
} tick_t;

#if defined(__i386__) || defined(__pentium__) || defined(__pentiumpro__) || defined(__i586__) || defined(__i686__) || defined(__k6__) || defined(__k7__) || defined(__x86_64__)
/* Read the time-stamp counter (EDX:EAX) into t. */
#  define GET_TICK(t) __asm__ volatile("rdtsc" : "=a" ((t).sub.low), "=d" ((t).sub.high))
#else
//#  error "Processeur non-supporté par timing.h"
/* XXX unsupported processor: GET_TICK is a no-op, so every measured
 * delay will read as zero */
//#warning "unsupported processor GET_TICK returns 0"
#  define GET_TICK(t) do {} while(0);
#endif
+
+#endif // UNRELIABLETICKS
+
+void __attribute__ ((unused)) timing_init(void);
+inline double __attribute__ ((unused)) tick2usec(long long t);
+inline double __attribute__ ((unused)) timing_delay(tick_t *t1, tick_t *t2);
+
+inline double __attribute__ ((unused)) timing_now(void);
+
+#endif /* TIMING_H */
+
+

+ 227 - 0
src/core/dependencies/data-concurrency.c

@@ -0,0 +1,227 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/dependencies/data-concurrency.h>
+#include <datawizard/coherency.h>
+#include <core/policies/sched_policy.h>
+
+#ifdef NO_DATA_RW_LOCK
+
+static unsigned _submit_job_enforce_data_deps(job_t j, unsigned start_buffer_index);
+
+static unsigned unlock_one_requester(data_requester_t r)
+{
+	job_t j = r->j;
+	unsigned nbuffers = j->task->cl->nbuffers;
+	unsigned buffer_index = r->buffer_index;
+
+	if (buffer_index + 1 < nbuffers)
+	{
+		/* not all buffers are protected yet */
+		return _submit_job_enforce_data_deps(j, buffer_index + 1);
+	}
+	else
+		return 0;
+}
+
+/* the header lock must be taken by the caller */
+static unsigned may_unlock_data_req_list_head(data_state *data)
+{
+	/* if there is no one to unlock ... */
+	if (data_requester_list_empty(data->req_list))
+		return 0;
+
+	/* if there is no reference to the data anymore, we can use it */
+	if (data->refcnt == 0)
+		return 1;
+
+	if (data->current_mode == W)
+		return 0;
+
+	/* data->current_mode == R, so we can process more readers */
+	data_requester_t r = data_requester_list_front(data->req_list);
+	
+	return (r->mode == R);
+}
+
+
+unsigned attempt_to_submit_data_request_from_apps(data_state *data, starpu_access_mode mode,
+						void (*callback)(void *), void *argcb)
+{
+	unsigned ret;
+
+	take_mutex(&data->header_lock);
+
+	if (data->refcnt == 0)
+	{
+		/* there is nobody currently about to manipulate the data */
+		data->refcnt++;
+		data->current_mode = mode;
+
+		/* success */
+		ret = 0;
+	}
+	else
+	{
+		/* there is already someone that may access the data */
+		if ( (mode == R) && (data->current_mode == R))
+		{
+			data->refcnt++;
+
+			/* success : there is a new reader */
+			ret = 0;
+		}
+		else
+		{
+			/* there cannot be multiple writers or a new writer
+			 * while the data is in read mode */
+			
+			/* enqueue the request */
+			data_requester_t r = data_requester_new();
+				r->mode = mode;
+				r->is_requested_by_codelet = 0;
+				r->ready_data_callback = callback;
+				r->argcb = argcb;
+
+			data_requester_list_push_back(data->req_list, r);
+
+			/* failed */
+			ret = 1;
+		}
+	}
+
+	release_mutex(&data->header_lock);
+	return ret;
+}
+
/* Codelet-side attempt to take a reference on buffer "buffer_index" of
 * job j.  Returns 0 when the access is granted immediately; returns 1
 * when the request was queued on the data instead (the submission will
 * resume from this buffer when the data is handed over — see
 * notify_data_dependencies / unlock_one_requester). */
static unsigned attempt_to_submit_data_request_from_job(job_t j, unsigned buffer_index)
{
	unsigned ret;

	data_state *data = j->task->buffers[buffer_index].state;
	starpu_access_mode mode = j->task->buffers[buffer_index].mode;

	take_mutex(&data->header_lock);

	if (data->refcnt == 0)
	{
		/* there is nobody currently about to manipulate the data */
		data->refcnt++;
		data->current_mode = mode;

		/* success */
		ret = 0;
	}
	else
	{
		/* there is already someone that may access the data */
		if ( (mode == R) && (data->current_mode == R))
		{
			data->refcnt++;

			/* success : there is a new reader */
			ret = 0;
		}
		else
		{
			/* there cannot be multiple writers or a new writer
			 * while the data is in read mode */
			
			/* enqueue the request */
			data_requester_t r = data_requester_new();
				r->mode = mode;
				r->is_requested_by_codelet = 1;
				r->j = j;
				r->buffer_index = buffer_index;

			data_requester_list_push_back(data->req_list, r);

			/* failed */
			ret = 1;
		}
	}

	release_mutex(&data->header_lock);
	return ret;
}
+
+static unsigned _submit_job_enforce_data_deps(job_t j, unsigned start_buffer_index)
+{
+	unsigned buf;
+
+	/* TODO compute an ordered list of the data */
+
+	unsigned nbuffers = j->task->cl->nbuffers;
+	for (buf = start_buffer_index; buf < nbuffers; buf++)
+	{
+		if (attempt_to_submit_data_request_from_job(j, buf))
+			return 1;
+	}
+
+	return 0;
+}
+
+/* When a new task is submitted, we make sure that there cannot be codelets
+   with concurrent data-access at the same time in the scheduling engine (eg.
+   there can be 2 tasks reading a piece of data, but there cannot be one
+   reading and another writing) */
+unsigned submit_job_enforce_data_deps(job_t j)
+{
+	if (j->task->cl->nbuffers == 0)
+		return 0;
+
+	return _submit_job_enforce_data_deps(j, 0);
+}
+
+
/* A reference on "data" is released.  While the head of the pending
 * request list may legally run (see may_unlock_data_req_list_head),
 * pop it and hand the data over: either resume the submission of the
 * requesting job (pushing it to the scheduler once it holds all its
 * buffers), or fire the application-supplied callback.  The header
 * lock is dropped around each notification since it may push tasks or
 * run user code, and re-taken before examining the list again. */
void notify_data_dependencies(data_state *data)
{
	take_mutex(&data->header_lock);

	data->refcnt--;

	while (may_unlock_data_req_list_head(data))
	{
		/* unlock the head of the requester list */
		data_requester_t r = data_requester_list_pop_front(data->req_list);

		/* the reference we just dropped is transferred to the
		 * unblocked requester */
		data->refcnt++;
	
		release_mutex(&data->header_lock);

		if (r->is_requested_by_codelet)
		{
			if (!unlock_one_requester(r))
				push_task(r->j);
		}
		else
		{
			STARPU_ASSERT(r->ready_data_callback);

			/* execute the callback associated with the data requester */
			r->ready_data_callback(r->argcb);
		}

		data_requester_delete(r);
		
		take_mutex(&data->header_lock);
	}
	
	release_mutex(&data->header_lock);

}
+
+#endif

+ 33 - 0
src/core/dependencies/data-concurrency.h

@@ -0,0 +1,33 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#ifndef __DATA_CONCURRENCY_H__
#define __DATA_CONCURRENCY_H__

#include <core/jobs.h>

#ifdef NO_DATA_RW_LOCK

/* Acquire all the buffers of job j; returns 1 when the job had to be
 * queued on one of them (it will be pushed later). */
unsigned submit_job_enforce_data_deps(job_t j);

/* Release one reference on the data and serve pending requests. */
void notify_data_dependencies(data_state *data);

/* Application-side request: returns 0 when granted immediately, 1 when
 * queued (callback(argcb) fires once the data is available). */
unsigned attempt_to_submit_data_request_from_apps(data_state *state, starpu_access_mode mode,
						void (*callback)(void *), void *argcb);
#endif

#endif // __DATA_CONCURRENCY_H__
+

+ 174 - 0
src/core/dependencies/htable.c

@@ -0,0 +1,174 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/dependencies/htable.h>
+#include <string.h>
+
+void *htbl_search_tag(htbl_node_t *htbl, starpu_tag_t tag)
+{
+	unsigned currentbit;
+	htbl_node_t *current_htbl = htbl;
+
+	/* 000000000001111 with HTBL_NODE_SIZE 1's */
+	starpu_tag_t mask = (1<<HTBL_NODE_SIZE)-1;
+
+	for(currentbit = 0; currentbit < TAG_SIZE; currentbit+=HTBL_NODE_SIZE)
+	{
+	
+	//	printf("search : current bit = %d \n", currentbit);
+		if (STARPU_UNLIKELY(current_htbl == NULL))
+			return NULL;
+
+		/* 0000000000001111 
+		 *     | currentbit
+		 * 0000111100000000 = offloaded_mask
+		 *         |last_currentbit
+		 * */
+
+		unsigned last_currentbit = 
+			TAG_SIZE - (currentbit + HTBL_NODE_SIZE);
+		starpu_tag_t offloaded_mask = mask << last_currentbit;
+		unsigned current_index = 
+			(tag & (offloaded_mask)) >> (last_currentbit);
+
+		current_htbl = current_htbl->children[current_index];
+	}
+
+	return current_htbl;
+}
+
/*
 * Insert "entry" for "tag"; returns the previous value stored for the
 * tag, or NULL.  Internal nodes along the path are allocated on demand
 * (the root included, hence the htbl_node_t **), and each parent's
 * nentries count is bumped whenever a new child appears under it.
 */

void *htbl_insert_tag(htbl_node_t **htbl, starpu_tag_t tag, void *entry)
{

	unsigned currentbit;
	htbl_node_t **current_htbl_ptr = htbl;
	htbl_node_t *previous_htbl_ptr = NULL;

	/* 000000000001111 with HTBL_NODE_SIZE 1's */
	starpu_tag_t mask = (1<<HTBL_NODE_SIZE)-1;

	for(currentbit = 0; currentbit < TAG_SIZE; currentbit+=HTBL_NODE_SIZE)
	{
		if (*current_htbl_ptr == NULL) {
			/* TODO pad to change that 1 into 16 ? */
			/* materialize the missing node (zero-filled) */
			*current_htbl_ptr = calloc(1, sizeof(htbl_node_t));
			assert(*current_htbl_ptr);

			/* a new child was hooked under the parent */
			if (previous_htbl_ptr)
				previous_htbl_ptr->nentries++;
		}

		/* 0000000000001111 
		 *     | currentbit
		 * 0000111100000000 = offloaded_mask
		 *         |last_currentbit
		 * */

		unsigned last_currentbit = 
			TAG_SIZE - (currentbit + HTBL_NODE_SIZE);
		starpu_tag_t offloaded_mask = mask << last_currentbit;
		unsigned current_index = 
			(tag & (offloaded_mask)) >> (last_currentbit);

		previous_htbl_ptr = *current_htbl_ptr;
		current_htbl_ptr = 
			&((*current_htbl_ptr)->children[current_index]);

	}

	/* current_htbl either contains NULL or a previous entry 
	 * we overwrite it anyway */
	void *old_entry = *current_htbl_ptr;
	*current_htbl_ptr = entry;

	/* a brand new leaf: account for it in the last internal node */
	if (!old_entry)
		previous_htbl_ptr->nentries++;

	return old_entry;
}
+
+/* returns the entry corresponding to the tag and remove it from the htbl */
+void *htbl_remove_tag(htbl_node_t *htbl, starpu_tag_t tag)
+{
+	/* NB : if the entry is "NULL", we assume this means it is not present XXX */
+	unsigned currentbit;
+	htbl_node_t *current_htbl_ptr = htbl;
+
+	/* remember the path to the tag */
+	htbl_node_t *path[(TAG_SIZE + HTBL_NODE_SIZE - 1)/(HTBL_NODE_SIZE)];
+
+	/* 000000000001111 with HTBL_NODE_SIZE 1's */
+	starpu_tag_t mask = (1<<HTBL_NODE_SIZE)-1;
+	int level, maxlevel;
+	unsigned tag_is_present = 1;
+
+	for(currentbit = 0, level = 0; currentbit < TAG_SIZE; currentbit+=HTBL_NODE_SIZE, level++)
+	{
+		path[level] = current_htbl_ptr;
+
+		if (STARPU_UNLIKELY(!current_htbl_ptr)) {
+			tag_is_present = 0;
+			break;
+		}
+
+		/* 0000000000001111 
+		 *     | currentbit
+		 * 0000111100000000 = offloaded_mask
+		 *         |last_currentbit
+		 * */
+
+		unsigned last_currentbit = 
+			TAG_SIZE - (currentbit + HTBL_NODE_SIZE);
+		starpu_tag_t offloaded_mask = mask << last_currentbit;
+		unsigned current_index = 
+			(tag & (offloaded_mask)) >> (last_currentbit);
+		
+		current_htbl_ptr = 
+			current_htbl_ptr->children[current_index];
+	}
+
+	maxlevel = level;
+	if (STARPU_UNLIKELY(!current_htbl_ptr))
+		tag_is_present = 0;
+
+	void *old_entry = current_htbl_ptr;
+
+	if (tag_is_present) {
+		/* the tag was in the htbl, so we have to unroll the search 
+ 		 * to remove possibly useless htbl (internal) nodes */
+		for (level = maxlevel - 1; level >= 0; level--)
+		{
+			path[level]->nentries--;
+
+			/* TODO use likely statements ... */
+
+			/* in case we do not remove that node, we do decrease its parents
+ 			 * number of entries */
+			if (path[level]->nentries > 0)
+				break;
+
+			/* we remove this node */
+			free(path[level]);
+		}
+	}
+
+	/* we return the entry if there was one */
+	return old_entry;
+}

+ 42 - 0
src/core/dependencies/htable.h

@@ -0,0 +1,42 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __HTABLE_H__
+#define __HTABLE_H__
+
+/*
+ *	Define a hierarchical table to do the tag matching
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <core/dependencies/tags.h>
+
/* Number of tag bits consumed per level: each internal node has
 * 2^HTBL_NODE_SIZE children. */
#define HTBL_NODE_SIZE	16

/* An internal node of the hierarchical table. */
typedef struct _htbl_node_t {
	unsigned nentries; /* live children, used to prune empty nodes */
	struct _htbl_node_t *children[1<<HTBL_NODE_SIZE];
} htbl_node_t;

/* Return the entry stored for "tag", or NULL. */
void *htbl_search_tag(htbl_node_t *htbl, starpu_tag_t tag);
/* Store "entry" for "tag"; returns the previous entry, or NULL. */
void *htbl_insert_tag(htbl_node_t **htbl, starpu_tag_t tag, void *entry);
/* Remove "tag" and return the entry it held, or NULL. */
void *htbl_remove_tag(htbl_node_t *htbl, starpu_tag_t tag);
+
+
+#endif

+ 266 - 0
src/core/dependencies/tags.c

@@ -0,0 +1,266 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdarg.h>
+#include <stdlib.h>
+#include <core/dependencies/tags.h>
+#include <core/dependencies/htable.h>
+#include <core/jobs.h>
+#include <core/policies/sched_policy.h>
+#include <core/dependencies/data-concurrency.h>
+#include <starpu.h>
+
/* Root of the tag -> struct tag_s hierarchical table. */
static htbl_node_t *tag_htbl = NULL;
/* Serializes every lookup/insertion/removal on tag_htbl. */
static starpu_mutex tag_mutex = {
	.taken = 0
};
+
+cg_t *create_cg(unsigned ntags, struct tag_s *tag)
+{
+	cg_t *cg;
+
+	cg = malloc(sizeof(cg_t));
+	STARPU_ASSERT(cg);
+	if (cg) {
+		cg->ntags = ntags;
+		cg->tag = tag;
+	}
+
+	return cg;
+}
+
+static struct tag_s *tag_init(starpu_tag_t id)
+{
+	struct tag_s *tag;
+	tag = malloc(sizeof(struct tag_s));
+	STARPU_ASSERT(tag);
+
+	tag->id = id;
+	tag->state = UNASSIGNED;
+	tag->nsuccs = 0;
+
+#ifdef DYNAMIC_DEPS_SIZE
+	/* this is a small initial default value ... may be changed */
+	tag->succ_list_size = 4;
+	tag->succ = realloc(NULL, tag->succ_list_size*sizeof(struct _cg_t *));
+#endif
+
+	init_mutex(&tag->lock);
+
+	tag->job = NULL;
+
+	return tag;
+}
+
+void starpu_tag_remove(starpu_tag_t id)
+{
+	struct tag_s *tag;
+
+	take_mutex(&tag_mutex);
+	tag = htbl_remove_tag(tag_htbl, id);
+	
+#ifdef DYNAMIC_DEPS_SIZE
+	if (tag)
+		free(tag->succ);
+#endif
+
+	release_mutex(&tag_mutex);
+
+	free(tag);
+}
+
+struct tag_s *gettag_struct(starpu_tag_t id)
+{
+	take_mutex(&tag_mutex);
+
+	/* search if the tag is already declared or not */
+	struct tag_s *tag;
+	tag = htbl_search_tag(tag_htbl, id);
+
+	if (tag == NULL) {
+		/* the tag does not exist yet : create an entry */
+		tag = tag_init(id);
+
+		void *old;
+		old = htbl_insert_tag(&tag_htbl, id, tag);
+		/* there was no such tag before */
+		STARPU_ASSERT(old == NULL);
+	}
+
+	release_mutex(&tag_mutex);
+
+	return tag;
+}
+
+void notify_cg(cg_t *cg)
+{
+
+	STARPU_ASSERT(cg);
+	unsigned ntags = STARPU_ATOMIC_ADD(&cg->ntags, -1);
+	if (ntags == 0) {
+		/* the group is now completed */
+		tag_set_ready(cg->tag);
+		free(cg);
+	}
+}
+
/* Register completion group "cg" as a successor of tag "id": when that
 * tag completes, notify_cg(cg) is called.  If the tag is already DONE,
 * the group is notified immediately instead of being recorded. */
void tag_add_succ(starpu_tag_t id, cg_t *cg)
{
	/* find out the associated structure */
	struct tag_s *tag = gettag_struct(id);
	STARPU_ASSERT(tag);

	take_mutex(&tag->lock);

	if (tag->state == DONE) {
		/* the tag was already completed sooner */
		notify_cg(cg);
	}
	else {
		/* where should that cg should be put in the array ? */
		unsigned index = STARPU_ATOMIC_ADD(&tag->nsuccs, 1) - 1;

#ifdef DYNAMIC_DEPS_SIZE
		if (index >= tag->succ_list_size)
		{
			/* the successor list is too small */
			tag->succ_list_size *= 2;

			/* NB: this is thread safe as the tag->lock is taken */
			tag->succ = realloc(tag->succ, 
				tag->succ_list_size*sizeof(struct _cg_t *));
		}
#else
		STARPU_ASSERT(index < NMAXDEPS);
#endif

		tag->succ[index] = cg;
	}

	release_mutex(&tag->lock);
}
+
+void notify_dependencies(struct job_s *j)
+{
+	struct tag_s *tag;
+	unsigned nsuccs;
+	unsigned succ;
+
+	STARPU_ASSERT(j);
+	
+	if (j->task->use_tag) {
+		/* in case there are dependencies, wake up the proper tasks */
+		tag = j->tag;
+
+		tag->state = DONE;
+		TRACE_TASK_DONE(tag->id);
+
+		nsuccs = tag->nsuccs;
+		for (succ = 0; succ < nsuccs; succ++)
+		{
+			notify_cg(tag->succ[succ]);
+		}
+	}
+}
+
+void tag_declare(starpu_tag_t id, struct job_s *job)
+{
+	TRACE_CODELET_TAG(id, job);
+	job->task->use_tag = 1;
+	
+	struct tag_s *tag= gettag_struct(id);
+	tag->job = job;
+	
+	job->tag = tag;
+}
+
+void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t *array)
+{
+	unsigned i;
+
+	/* create the associated completion group */
+	struct tag_s *tag_child = gettag_struct(id);
+	cg_t *cg = create_cg(ndeps, tag_child);
+	
+	tag_child->state = BLOCKED;
+	
+	STARPU_ASSERT(ndeps != 0);
+	
+	for (i = 0; i < ndeps; i++)
+	{
+		starpu_tag_t dep_id = array[i];
+		
+		/* id depends on dep_id
+		 * so cg should be among dep_id's successors*/
+		TRACE_CODELET_TAG_DEPS(id, dep_id);
+		tag_add_succ(dep_id, cg);
+	}
+}
+
+void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...)
+{
+	unsigned i;
+	
+	/* create the associated completion group */
+	struct tag_s *tag_child = gettag_struct(id);
+	cg_t *cg = create_cg(ndeps, tag_child);
+	
+	tag_child->state = BLOCKED;
+	
+	STARPU_ASSERT(ndeps != 0);
+	
+	va_list pa;
+	va_start(pa, ndeps);
+	for (i = 0; i < ndeps; i++)
+	{
+		starpu_tag_t dep_id;
+		dep_id = va_arg(pa, starpu_tag_t);
+		
+		/* id depends on dep_id
+		 * so cg should be among dep_id's successors*/
+		TRACE_CODELET_TAG_DEPS(id, dep_id);
+		tag_add_succ(dep_id, cg);
+	}
+	va_end(pa);
+}
+
+void tag_set_ready(struct tag_s *tag)
+{
+	/* mark this tag as ready to run */
+	tag->state = READY;
+	/* declare it to the scheduler ! */
+	struct job_s *j = tag->job;
+
+	/* perhaps the corresponding task was not declared yet */
+	if (!j)
+		return;
+
+#ifdef NO_DATA_RW_LOCK
+	/* enforce data dependencies */
+	if (submit_job_enforce_data_deps(j))
+		return;
+#endif
+
+	push_task(j);
+}
+
+/* This function is called when a new task is submitted to StarPU 
+ * it returns 1 if the task deps are not fulfilled, 0 otherwise */
+unsigned submit_job_enforce_task_deps(job_t j)
+{
+	struct tag_s *tag = j->tag;
+	return (tag->state == BLOCKED);
+}

+ 78 - 0
src/core/dependencies/tags.h

@@ -0,0 +1,78 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __TAGS_H__
+#define __TAGS_H__
+
+#include <stdint.h>
+#include <starpu-mutex.h>
+#include <core/jobs.h>
+
+/* we do not necessarily want to allocate room for 256 dependencies, but we
+   want to handle the few situation where there are a lot of dependencies as
+   well */
+#define DYNAMIC_DEPS_SIZE	1
+
+/* randomly choosen ! */
+#ifndef DYNAMIC_DEPS_SIZE
+#define NMAXDEPS	256
+#endif
+
+/* number of bits in a tag identifier */
+#define TAG_SIZE        (sizeof(starpu_tag_t)*8)
+
+/* life cycle of a tag (see tags.c):
+ *   UNASSIGNED -> BLOCKED (deps declared) -> READY -> SCHEDULED -> DONE */
+typedef enum {
+	UNASSIGNED,
+	DONE,
+	READY,
+	SCHEDULED,
+	BLOCKED
+} tag_state;
+
+struct job_s;
+
+struct tag_s {
+	starpu_mutex lock; /* do we really need that ? */
+	starpu_tag_t id; /* an identifier for the task */
+	tag_state state;
+	unsigned nsuccs; /* how many successors ? */
+#ifdef DYNAMIC_DEPS_SIZE
+	unsigned succ_list_size; /* allocated capacity of the succ array */
+	struct _cg_t **succ;
+#else
+	struct _cg_t *succ[NMAXDEPS];
+#endif
+	struct job_s *job; /* which job is associated to the tag if any ? */
+};
+
+/* completion group: the dependent tag is released once ntags drops to 0 */
+typedef struct _cg_t {
+	unsigned ntags; /* number of remaining tags */
+	struct tag_s *tag; /* which tags depends on that cg ?  */
+} cg_t;
+
+void notify_cg(cg_t *cg);
+void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...);
+
+cg_t *create_cg(unsigned ntags, struct tag_s *tag);
+/* NOTE(review): tags.c calls gettag_struct() (no underscore) while this
+ * header declares get_tag_struct() -- confirm which spelling is actually
+ * defined and make the two agree */
+struct tag_s *get_tag_struct(starpu_tag_t id);
+void tag_add_succ(starpu_tag_t id, cg_t *cg);
+
+void notify_dependencies(struct job_s *j);
+void tag_declare(starpu_tag_t id, struct job_s *job);
+void tag_set_ready(struct tag_s *tag);
+
+unsigned submit_job_enforce_task_deps(struct job_s *j);
+
+#endif // __TAGS_H__

+ 184 - 0
src/core/jobs.c

@@ -0,0 +1,184 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/jobs.h>
+#include <core/workers.h>
+#include <core/dependencies/data-concurrency.h>
+#include <common/config.h>
+
+/* Return the sum of the sizes of every data buffer accessed by the task
+ * wrapped in job `j', as reported by each data interface's get_size
+ * method. */
+size_t job_get_data_size(job_t j)
+{
+	size_t size = 0;
+
+	struct starpu_task *task = j->task;
+
+	unsigned nbuffers = task->cl->nbuffers;
+
+	unsigned buffer;
+	for (buffer = 0; buffer < nbuffers; buffer++)
+	{
+		data_state *state = task->buffers[buffer].state;
+		size += state->ops->get_size(state);
+	}
+
+	return size;
+}
+
+/* create an internal job_t structure to encapsulate the task */
+/* The job starts with no prediction, no computed footprint, and is not
+ * terminated.  Synchronous tasks get a semaphore the submitter will
+ * block on; tagged tasks are registered in the tag table. */
+job_t __attribute__((malloc)) job_create(struct starpu_task *task)
+{
+	job_t job;
+
+	job = job_new();
+
+	job->task = task;
+
+	job->predicted = 0.0;
+	job->footprint_is_computed = 0;
+	job->terminated = 0;
+
+	/* NOTE(review): sem_init's return value is ignored here -- it can
+	 * fail (e.g. EINVAL); confirm whether that is acceptable */
+	if (task->synchronous)
+		sem_init(&job->sync_sem, 0, 0);
+
+	if (task->use_tag)
+		tag_declare(task->tag_id, job);
+
+	return job;
+}
+
+/* Allocate a zero-initialised starpu_task and set the default fields
+ * (only the priority needs a non-zero default).  Aborts on allocation
+ * failure. */
+struct starpu_task * __attribute__((malloc)) starpu_task_create(void)
+{
+	struct starpu_task *task;
+
+	task = calloc(1, sizeof(struct starpu_task));
+	STARPU_ASSERT(task);
+
+	task->priority = DEFAULT_PRIO;
+
+	return task;
+}
+
+/* Called by a driver once a job has been executed: wake up the tasks
+ * that depended on it, run the user callback, then release the job
+ * structure (or defer its release to the synchronous waiter). */
+void handle_job_termination(job_t j)
+{
+	struct starpu_task *task = j->task;
+
+	/* double termination indicates a logic error elsewhere; warn but
+	 * keep going */
+	if (STARPU_UNLIKELY(j->terminated))
+		fprintf(stderr, "OOPS ... job %p was already terminated !!\n", j);
+
+	j->terminated = 1;
+
+	/* in case there are dependencies, wake up the proper tasks */
+	notify_dependencies(j);
+
+	/* the callback is executed after the dependencies so that we may remove the tag 
+ 	 * of the task itself */
+	if (task->callback_func)
+	{
+		TRACE_START_CALLBACK(j);
+		task->callback_func(task->callback_arg);
+		TRACE_END_CALLBACK(j);
+	}
+
+	if (task->synchronous)
+	{
+		if (sem_post(&j->sync_sem))
+			perror("sem_post");
+
+		/* as this is a synchronous task, we do not delete the job 
+		   structure which contains the j->sync_sem: we only liberate
+		   it once the semaphore is destroyed */
+	}
+	else
+	{
+		job_delete(j);
+	}
+
+}
+
+/* Wait until a synchronous task terminates, then release the job
+ * structure whose destruction was deferred so that sync_sem remained
+ * valid for this waiter. */
+static void block_sync_task(job_t j)
+{
+	/* mirror the sem_post error reporting done at termination time */
+	if (sem_wait(&j->sync_sem))
+		perror("sem_wait");
+
+	sem_destroy(&j->sync_sem);
+
+	/* as this is a synchronous task, the liberation of the job
+	   structure was deferred */
+	job_delete(j);
+}
+
+/* application should submit new tasks to StarPU through this function */
+/* Returns -ENODEV when no worker can execute the codelet, otherwise the
+ * push_task() status (0 when the task is merely deferred on a task or
+ * data dependency).  Synchronous tasks block until completion. */
+int starpu_submit_task(struct starpu_task *task)
+{
+	int ret;
+
+	/* validate the pointer before dereferencing it */
+	STARPU_ASSERT(task);
+	STARPU_ASSERT(task->cl);
+
+	unsigned is_sync = task->synchronous;
+
+	if (!worker_exists(task->cl->where))
+		return -ENODEV;
+
+	/* internally, StarPU manipulates a job_t which is a wrapper around a
+ 	* task structure */
+	job_t j = job_create(task);
+
+	/* enforce task dependencies */
+	if (task->use_tag)
+	{
+		if (submit_job_enforce_task_deps(j))
+		{
+			if (is_sync)
+				block_sync_task(j);
+			return 0;
+		}
+	}
+
+#ifdef NO_DATA_RW_LOCK
+	/* enforce data dependencies */
+	if (submit_job_enforce_data_deps(j))
+	{
+		if (is_sync)
+			block_sync_task(j);
+		return 0;
+	}
+#endif
+
+	ret = push_task(j);
+
+	if (is_sync)
+		block_sync_task(j);
+
+	return ret;
+}
+
+//int submit_prio_job(job_t j)
+//{
+//	j->priority = MAX_PRIO;
+//	
+//	return submit_job(j);
+//}
+
+/* This function is supplied for convenience only, it is equivalent to setting
+ * the proper flag and submitting the task with submit_task.
+ * Note that this call is blocking, and will not make StarPU progress,
+ * so it must only be called from the programmer thread, not by StarPU.
+ * NB: This also means that it cannot be submitted within a callback ! */
+int submit_sync_task(struct starpu_task *task)
+{
+	/* force synchronous semantics, then go through the normal path */
+	task->synchronous = 1;
+
+	return starpu_submit_task(task);
+}

+ 80 - 0
src/core/jobs.h

@@ -0,0 +1,80 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __JOBS_H__
+#define __JOBS_H__
+
+#include <starpu.h>
+#include <semaphore.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdarg.h>
+#include <pthread.h>
+#include <common/config.h>
+#include <common/timing.h>
+#include <common/list.h>
+#include <common/fxt.h>
+
+#include <core/dependencies/tags.h>
+
+#include <datawizard/datawizard.h>
+
+#include <core/perfmodel/perfmodel.h>
+
+#ifdef USE_CUDA
+#include <cuda.h>
+#endif
+
+/* codelet function */
+typedef void (*cl_func)(starpu_data_interface_t *, void *);
+typedef void (*callback)(void *);
+
+/* which driver types may execute job j, according to its codelet mask */
+#define CORE_MAY_PERFORM(j)	((j)->task->cl->where & CORE)
+#define CUDA_MAY_PERFORM(j)     ((j)->task->cl->where & CUDA)
+#define CUBLAS_MAY_PERFORM(j)   ((j)->task->cl->where & CUBLAS)
+#define SPU_MAY_PERFORM(j)	((j)->task->cl->where & SPU)
+#define GORDON_MAY_PERFORM(j)	((j)->task->cl->where & GORDON)
+
+/* a job is the internal representation of a task */
+/* LIST_TYPE generates job_t (pointer typedef), job_list_t and the
+ * job_list_* / job_new / job_delete helpers (see common/list.h) */
+LIST_TYPE(job,
+	struct starpu_task *task;
+
+	/* posted at termination; only used for synchronous tasks */
+	sem_t sync_sem;
+
+	struct tag_s *tag;
+
+	/* predicted execution time, filled in by the scheduling policy */
+	double predicted;
+	/* note: "penality" spelling is part of the existing field name */
+	double penality;
+
+	/* memoization of the data footprint hash */
+	unsigned footprint_is_computed;
+	uint32_t footprint;
+
+	unsigned terminated;
+);
+
+//#warning this must not be exported anymore ... 
+//job_t job_create(struct starpu_task *task);
+void handle_job_termination(job_t j);
+size_t job_get_data_size(job_t j);
+
+//int submit_job(job_t j);
+//int submit_prio_job(job_t j);
+//int submit_job_sync(job_t j);
+
+#endif // __JOBS_H__

+ 21 - 0
src/core/mechanisms/TODO

@@ -0,0 +1,21 @@
+queue design :
+	- create_central_jobq		-> trivial single list
+	- create_per_accelerator_jobq	-> one core = one queue (cilk-like)
+	- create_hierarchical_jobq	-> marcel-like
+
+remarks:
+	- a queue may be a set of queues (e.g. for priority queues)
+	- queues may have a limited size so that an extra central queue 
+	  could be needed in that case ...
+
+mechanisms :
+	- steal_job_from_queue
+	- push_job_onto_queue
+	- equilibrate_queues	-> balances 2 queues 
+	- reorder a queue
+
+policy role :
+	- implementing the push_job/fetch_job functions
+	- creating the actual queues 
+	- progression thread that regularly recompute a better schedule ?
+		- rather do that in the context of a task submission

+ 209 - 0
src/core/mechanisms/deque_queues.c

@@ -0,0 +1,209 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <core/mechanisms/deque_queues.h>
+#include <errno.h>
+
+/* keep track of the total number of jobs to be scheduled to avoid infinite 
+ * polling when there are really few jobs in the overall queue */
+static unsigned total_number_of_jobs;
+
+static pthread_cond_t *sched_cond;
+static pthread_mutex_t *sched_mutex;
+
+/* One-time setup for the deque mechanism: reset the global job counter
+ * and cache the scheduler-wide activity condition/mutex used to wake
+ * globally blocked workers. */
+void init_deque_queues_mechanisms(void)
+{
+	total_number_of_jobs = 0;
+
+	struct sched_policy_s *sched = get_sched_policy();
+
+	/* to access them more easily, we keep their address in local variables */
+	sched_cond = &sched->sched_activity_cond;
+	sched_mutex = &sched->sched_activity_mutex;
+}
+
+/* Allocate and initialise a deque-based job queue: the generic jobq_s
+ * wrapper plus its deque_jobq_s implementation.  Aborts on allocation
+ * failure (consistent with starpu_task_create). */
+struct jobq_s *create_deque(void)
+{
+	struct jobq_s *jobq;
+	jobq = malloc(sizeof(struct jobq_s));
+	STARPU_ASSERT(jobq);
+
+	pthread_mutex_init(&jobq->activity_mutex, NULL);
+	pthread_cond_init(&jobq->activity_cond, NULL);
+
+	struct deque_jobq_s *deque;
+	deque = malloc(sizeof(struct deque_jobq_s));
+	STARPU_ASSERT(deque);
+
+	/* note that not all mechanisms (eg. the semaphore) have to be used */
+	deque->jobq = job_list_new();
+	deque->njobs = 0;
+	deque->nprocessed = 0;
+
+	/* expected dates are expressed in seconds (timing_now is in us) */
+	deque->exp_start = timing_now()/1000000;
+	deque->exp_len = 0.0;
+	deque->exp_end = deque->exp_start;
+
+	jobq->queue = deque;
+
+	return jobq;
+}
+
+/* Number of jobs currently queued across all deques (read without the
+ * scheduler mutex: an approximation used to limit polling). */
+unsigned get_total_njobs_deques(void)
+{
+	return total_number_of_jobs;
+}
+
+/* Number of jobs currently sitting in this deque (unsynchronized read). */
+unsigned get_deque_njobs(struct jobq_s *q)
+{
+	STARPU_ASSERT(q);
+
+	struct deque_jobq_s *deque_queue = q->queue;
+
+	return deque_queue->njobs;
+}
+
+/* Total number of jobs ever pushed onto this deque (unsynchronized read). */
+unsigned get_deque_nprocessed(struct jobq_s *q)
+{
+	STARPU_ASSERT(q);
+
+	struct deque_jobq_s *deque_queue = q->queue;
+
+	return deque_queue->nprocessed;
+}
+
+/* Deques do not implement priorities: priority pushes degrade to plain
+ * pushes. */
+int deque_push_prio_task(struct jobq_s *q, job_t task)
+{
+	return deque_push_task(q, task);
+}
+
+/* Push a job at the front of the deque, then wake any worker blocked on
+ * the global scheduler condition and any worker blocked on this queue.
+ * Always returns 0. */
+int deque_push_task(struct jobq_s *q, job_t task)
+{
+	STARPU_ASSERT(q);
+	struct deque_jobq_s *deque_queue = q->queue;
+
+	/* if anyone is blocked on the entire machine, wake it up */
+	pthread_mutex_lock(sched_mutex);
+	total_number_of_jobs++;
+	pthread_cond_signal(sched_cond);
+	pthread_mutex_unlock(sched_mutex);
+
+	/* wake people waiting locally */
+	pthread_mutex_lock(&q->activity_mutex);
+
+	TRACE_JOB_PUSH(task, 0);
+	job_list_push_front(deque_queue->jobq, task);
+	deque_queue->njobs++;
+	deque_queue->nprocessed++;
+
+	pthread_cond_signal(&q->activity_cond);
+	pthread_mutex_unlock(&q->activity_mutex);
+
+	return 0;
+}
+
+/* Pop a job from the front of the deque, waiting (at most once) on the
+ * activity condition if it is empty.  The wait is deliberately an `if',
+ * not a `while': a signal on activity_cond with no work available makes
+ * the function return NULL, letting the caller react to other events. */
+job_t deque_pop_task(struct jobq_s *q)
+{
+	job_t j = NULL;
+
+	STARPU_ASSERT(q);
+	struct deque_jobq_s *deque_queue = q->queue;
+
+	/* block until some task is available in that queue */
+	pthread_mutex_lock(&q->activity_mutex);
+
+	if (deque_queue->njobs == 0)
+		pthread_cond_wait(&q->activity_cond, &q->activity_mutex);
+
+	if (deque_queue->njobs > 0) 
+	{
+		/* there is a task */
+		j = job_list_pop_front(deque_queue->jobq);
+	
+		STARPU_ASSERT(j);
+		deque_queue->njobs--;
+		
+		TRACE_JOB_POP(j, 0);
+
+		/* we are sure that we got it now, so at worst, some people thought 
+		 * there remained some work and will soon discover it is not true */
+		pthread_mutex_lock(sched_mutex);
+		total_number_of_jobs--;
+		pthread_mutex_unlock(sched_mutex);
+	}
+	
+	pthread_mutex_unlock(&q->activity_mutex);
+
+	return j;
+}
+
+/* Non-blocking variant of deque_pop_task: take the queue lock, grab a
+ * job from the front if one is present, and return NULL otherwise
+ * without waiting. */
+job_t deque_non_blocking_pop_task(struct jobq_s *q)
+{
+	job_t j = NULL;
+
+	STARPU_ASSERT(q);
+	struct deque_jobq_s *deque_queue = q->queue;
+
+	/* only hold the lock for the duration of the check: never wait */
+	pthread_mutex_lock(&q->activity_mutex);
+
+	if (deque_queue->njobs > 0) 
+	{
+		/* there is a task */
+		j = job_list_pop_front(deque_queue->jobq);
+	
+		STARPU_ASSERT(j);
+		deque_queue->njobs--;
+		
+		TRACE_JOB_POP(j, 0);
+
+		/* we are sure that we got it now, so at worst, some people thought 
+		 * there remained some work and will soon discover it is not true */
+		pthread_mutex_lock(sched_mutex);
+		total_number_of_jobs--;
+		pthread_mutex_unlock(sched_mutex);
+	}
+	
+	pthread_mutex_unlock(&q->activity_mutex);
+
+	return j;
+}
+
+/* Non-blocking pop; the global-sleep fallback used by the fifo variant
+ * is compiled out here (see the #if 0 block below). */
+job_t deque_non_blocking_pop_task_if_job_exists(struct jobq_s *q)
+{
+	job_t j;
+
+	j = deque_non_blocking_pop_task(q);
+
+/* XXX */
+#if 0
+	if (!j) {
+		/* there is no job at all in the entire system : go to sleep ! */
+
+		/* that wait is not an absolute sign that there is some work 
+		 * if there is some, the thread should be awoken, but if there is none 
+		 * at the moment it is awoken, it may simply poll a limited number of 
+		 * times and just get back to sleep */
+		pthread_mutex_lock(sched_mutex);
+
+		if (total_number_of_jobs == 0)
+			pthread_cond_wait(sched_cond, sched_mutex);
+
+		pthread_mutex_unlock(sched_mutex);
+	}
+#endif
+
+	return j;
+}

+ 54 - 0
src/core/mechanisms/deque_queues.h

@@ -0,0 +1,54 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DEQUE_QUEUES_H__
+#define __DEQUE_QUEUES_H__
+
+#include <core/mechanisms/queues.h>
+
+/* per-queue state of the deque-based scheduling mechanism */
+struct deque_jobq_s {
+	/* the actual list */
+	job_list_t jobq;
+
+	/* the number of tasks currently in the queue */
+	unsigned njobs;
+
+	/* the number of tasks that were processed */
+	unsigned nprocessed;
+
+	/* only meaningful if the queue is only used by a single worker */
+	double exp_start; /* expected start date of the next task */
+	double exp_end;   /* expected completion date of the queued work */
+	double exp_len;   /* expected total duration of the queued work */
+};
+
+struct jobq_s *create_deque(void);
+
+int deque_push_task(struct jobq_s *q, job_t task);
+/* priorities are not supported: falls back to deque_push_task */
+int deque_push_prio_task(struct jobq_s *q, job_t task);
+
+job_t deque_pop_task(struct jobq_s *q);
+job_t deque_non_blocking_pop_task(struct jobq_s *q);
+job_t deque_non_blocking_pop_task_if_job_exists(struct jobq_s *q);
+
+void init_deque_queues_mechanisms(void);
+
+
+unsigned get_deque_njobs(struct jobq_s *q);
+unsigned get_deque_nprocessed(struct jobq_s *q);
+
+
+#endif // __DEQUE_QUEUES_H__

+ 245 - 0
src/core/mechanisms/fifo_queues.c

@@ -0,0 +1,245 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <core/mechanisms/fifo_queues.h>
+#include <errno.h>
+
+/* keep track of the total number of jobs to be scheduled to avoid infinite 
+ * polling when there are really few jobs in the overall queue */
+static unsigned total_number_of_jobs;
+
+static pthread_cond_t *sched_cond;
+static pthread_mutex_t *sched_mutex;
+
+/* One-time setup for the fifo mechanism: reset the global job counter
+ * and cache the scheduler-wide activity condition/mutex. */
+void init_fifo_queues_mechanisms(void)
+{
+	total_number_of_jobs = 0;
+
+	struct sched_policy_s *sched = get_sched_policy();
+
+	/* to access them more easily, we keep their address in local variables */
+	sched_cond = &sched->sched_activity_cond;
+	sched_mutex = &sched->sched_activity_mutex;
+}
+
+/* Allocate and initialise a fifo-based job queue: the generic jobq_s
+ * wrapper plus its fifo_jobq_s implementation.  Aborts on allocation
+ * failure (consistent with starpu_task_create). */
+struct jobq_s *create_fifo(void)
+{
+	struct jobq_s *jobq;
+	jobq = malloc(sizeof(struct jobq_s));
+	STARPU_ASSERT(jobq);
+
+	pthread_mutex_init(&jobq->activity_mutex, NULL);
+	pthread_cond_init(&jobq->activity_cond, NULL);
+
+	struct fifo_jobq_s *fifo;
+	fifo = malloc(sizeof(struct fifo_jobq_s));
+	STARPU_ASSERT(fifo);
+
+	/* note that not all mechanisms (eg. the semaphore) have to be used */
+	fifo->jobq = job_list_new();
+	fifo->njobs = 0;
+	fifo->nprocessed = 0;
+
+	/* expected dates are expressed in seconds (timing_now is in us) */
+	fifo->exp_start = timing_now()/1000000;
+	fifo->exp_len = 0.0;
+	fifo->exp_end = fifo->exp_start;
+
+	jobq->queue = fifo;
+
+	return jobq;
+}
+
+/* Priority push: tasks are pushed at the BACK of the list, which is the
+ * end fifo_pop_task pops from, so they are retrieved before the
+ * normally pushed (front) tasks.  With NO_PRIO it degrades to a plain
+ * push. */
+int fifo_push_prio_task(struct jobq_s *q, job_t task)
+{
+#ifndef NO_PRIO
+	STARPU_ASSERT(q);
+	struct fifo_jobq_s *fifo_queue = q->queue;
+
+	/* if anyone is blocked on the entire machine, wake it up */
+	pthread_mutex_lock(sched_mutex);
+	total_number_of_jobs++;
+	pthread_cond_signal(sched_cond);
+	pthread_mutex_unlock(sched_mutex);
+	
+	/* wake people waiting locally */
+	pthread_mutex_lock(&q->activity_mutex);
+
+	TRACE_JOB_PUSH(task, 0);
+	job_list_push_back(fifo_queue->jobq, task);
+	fifo_queue->njobs++;
+	fifo_queue->nprocessed++;
+
+	pthread_cond_signal(&q->activity_cond);
+	pthread_mutex_unlock(&q->activity_mutex);
+
+	return 0;
+#else
+	return fifo_push_task(q, task);
+#endif
+}
+
+/* Push a job at the front of the list (fifo_pop_task pops from the
+ * back, giving FIFO order), waking any globally or locally blocked
+ * worker.  Always returns 0. */
+int fifo_push_task(struct jobq_s *q, job_t task)
+{
+	STARPU_ASSERT(q);
+	struct fifo_jobq_s *fifo_queue = q->queue;
+
+	/* if anyone is blocked on the entire machine, wake it up */
+	pthread_mutex_lock(sched_mutex);
+	total_number_of_jobs++;
+	pthread_cond_signal(sched_cond);
+	pthread_mutex_unlock(sched_mutex);
+	
+	/* wake people waiting locally */
+	pthread_mutex_lock(&q->activity_mutex);
+
+	TRACE_JOB_PUSH(task, 0);
+	job_list_push_front(fifo_queue->jobq, task);
+	fifo_queue->njobs++;
+	fifo_queue->nprocessed++;
+
+	pthread_cond_signal(&q->activity_cond);
+	pthread_mutex_unlock(&q->activity_mutex);
+
+	return 0;
+}
+
+/* Pop the oldest job (FIFO order), waiting at most once on the activity
+ * condition if the queue is empty.  The wait is deliberately an `if',
+ * not a `while': a signal with no work available makes the function
+ * return NULL so the caller can react to other events. */
+job_t fifo_pop_task(struct jobq_s *q)
+{
+	job_t j = NULL;
+
+	STARPU_ASSERT(q);
+	struct fifo_jobq_s *fifo_queue = q->queue;
+
+	/* block until some event happens */
+	pthread_mutex_lock(&q->activity_mutex);
+
+	if (fifo_queue->njobs == 0)
+		pthread_cond_wait(&q->activity_cond, &q->activity_mutex);
+
+	if (fifo_queue->njobs > 0) 
+	{
+		/* there is a task */
+		j = job_list_pop_back(fifo_queue->jobq);
+	
+		STARPU_ASSERT(j);
+		fifo_queue->njobs--;
+		
+		TRACE_JOB_POP(j, 0);
+
+		/* we are sure that we got it now, so at worst, some people thought 
+		 * there remained some work and will soon discover it is not true */
+		pthread_mutex_lock(sched_mutex);
+		total_number_of_jobs--;
+		pthread_mutex_unlock(sched_mutex);
+	}
+	
+	pthread_mutex_unlock(&q->activity_mutex);
+
+	return j;
+}
+
+/* Atomically steal the entire content of the fifo: the current list is
+ * handed to the caller (who becomes responsible for freeing it) and
+ * replaced by a fresh empty one.  Returns NULL when the queue is empty.
+ * Non-blocking. */
+struct job_list_s * fifo_pop_every_task(struct jobq_s *q)
+{
+	struct job_list_s *list;
+	unsigned size;
+	
+	STARPU_ASSERT(q);
+	struct fifo_jobq_s *fifo_queue = q->queue;
+
+	pthread_mutex_lock(&q->activity_mutex);
+
+	size = fifo_queue->njobs;
+
+	if (size == 0) {
+		list = NULL;
+	}
+	else {
+		/* directly use the existing list of jobs */
+		list = fifo_queue->jobq;
+
+	//	fprintf(stderr, "DEBUG, fifo_pop_every_task promised %d got %d\n",  size, job_list_size(list));
+		
+		/* the FIFO is now a new empty list */
+		fifo_queue->jobq = job_list_new();
+		fifo_queue->njobs = 0;
+
+		/* we are sure that we got it now, so at worst, some people thought
+		 * there remained some work and will soon discover it is not true */
+		pthread_mutex_lock(sched_mutex);
+		total_number_of_jobs -= size;
+		pthread_mutex_unlock(sched_mutex);
+	}
+
+	pthread_mutex_unlock(&q->activity_mutex);
+
+	return list;
+}
+
+/* for work stealing, typically */
+/* Non-blocking variant of fifo_pop_task: take the queue lock, grab the
+ * oldest job if one is present, and return NULL otherwise without
+ * waiting. */
+job_t fifo_non_blocking_pop_task(struct jobq_s *q)
+{
+	job_t j = NULL;
+
+	STARPU_ASSERT(q);
+	struct fifo_jobq_s *fifo_queue = q->queue;
+
+	/* only hold the lock for the duration of the check: never wait */
+	pthread_mutex_lock(&q->activity_mutex);
+
+	if (fifo_queue->njobs > 0) 
+	{
+		/* there is a task */
+		j = job_list_pop_back(fifo_queue->jobq);
+	
+		STARPU_ASSERT(j);
+		fifo_queue->njobs--;
+		
+		TRACE_JOB_POP(j, 0);
+
+		/* we are sure that we got it now, so at worst, some people thought 
+		 * there remained some work and will soon discover it is not true */
+		pthread_mutex_lock(sched_mutex);
+		total_number_of_jobs--;
+		pthread_mutex_unlock(sched_mutex);
+	}
+	
+	pthread_mutex_unlock(&q->activity_mutex);
+
+	return j;
+}
+
+/* Non-blocking pop with a global-sleep fallback: if the whole system is
+ * idle (total_number_of_jobs == 0), wait once on the scheduler-wide
+ * condition instead of busy-polling, then return NULL. */
+job_t fifo_non_blocking_pop_task_if_job_exists(struct jobq_s *q)
+{
+	job_t j;
+
+	j = fifo_non_blocking_pop_task(q);
+
+	if (!j) {
+		/* there is no job at all in the entire system : go to sleep ! */
+
+		/* that wait is not an absolute sign that there is some work 
+		 * if there is some, the thread should be awoken, but if there is none 
+		 * at the moment it is awoken, it may simply poll a limited number of 
+		 * times and just get back to sleep */
+		pthread_mutex_lock(sched_mutex);
+
+		if (total_number_of_jobs == 0)
+			pthread_cond_wait(sched_cond, sched_mutex);
+
+		pthread_mutex_unlock(sched_mutex);
+	}
+
+	return j;
+}

+ 50 - 0
src/core/mechanisms/fifo_queues.h

@@ -0,0 +1,50 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __FIFO_QUEUES_H__
+#define __FIFO_QUEUES_H__
+
+#include <core/mechanisms/queues.h>
+
+/* per-queue state of the fifo-based scheduling mechanism */
+struct fifo_jobq_s {
+	/* the actual list */
+	job_list_t jobq;
+
+	/* the number of tasks currently in the queue */
+	unsigned njobs;
+
+	/* the number of tasks that were processed */
+	unsigned nprocessed;
+
+	/* only meaningful if the queue is only used by a single worker */
+	double exp_start; /* expected start date of the next task */
+	double exp_end;   /* expected completion date of the queued work */
+	double exp_len;   /* expected total duration of the queued work */
+};
+
+struct jobq_s *create_fifo(void);
+
+int fifo_push_task(struct jobq_s *q, job_t task);
+/* pushes at the pop end so the task is retrieved first (unless NO_PRIO) */
+int fifo_push_prio_task(struct jobq_s *q, job_t task);
+
+job_t fifo_pop_task(struct jobq_s *q);
+/* hands over the whole list; the caller must free it */
+struct job_list_s * fifo_pop_every_task(struct jobq_s *q);
+job_t fifo_non_blocking_pop_task(struct jobq_s *q);
+job_t fifo_non_blocking_pop_task_if_job_exists(struct jobq_s *q);
+
+void init_fifo_queues_mechanisms(void);
+
+#endif // __FIFO_QUEUES_H__

+ 121 - 0
src/core/mechanisms/priority_queues.c

@@ -0,0 +1,121 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/mechanisms/priority_queues.h>
+
+/*
+ * Centralized queue with priorities 
+ */
+
+
+/* keep track of the total number of jobs to be scheduled to avoid infinite 
+ * polling when there are really few jobs in the overall queue */
+static pthread_cond_t *sched_cond;
+static pthread_mutex_t *sched_mutex;
+
+/* One-time setup for the priority mechanism: cache the scheduler-wide
+ * activity condition/mutex (no global job counter is kept here). */
+void init_priority_queues_mechanisms(void)
+{
+	struct sched_policy_s *sched = get_sched_policy();
+
+	/* to access them more easily, we keep their address in local variables */
+	sched_cond = &sched->sched_activity_cond;
+	sched_mutex = &sched->sched_activity_mutex;
+}
+
+/* Allocate and initialise the centralized priority queue: one job list
+ * per priority level.  Aborts on allocation failure (consistent with
+ * starpu_task_create). */
+struct jobq_s *create_priority_jobq(void)
+{
+	struct jobq_s *q;
+
+	q = malloc(sizeof(struct jobq_s));
+	STARPU_ASSERT(q);
+
+	struct priority_jobq_s *central_queue;
+	
+	central_queue = malloc(sizeof(struct priority_jobq_s));
+	STARPU_ASSERT(central_queue);
+	q->queue = central_queue;
+
+	pthread_mutex_init(&q->activity_mutex, NULL);
+	pthread_cond_init(&q->activity_cond, NULL);
+
+	central_queue->total_njobs = 0;
+
+	unsigned prio;
+	for (prio = 0; prio < NPRIO_LEVELS; prio++)
+	{
+		central_queue->jobq[prio] = job_list_new();
+		central_queue->njobs[prio] = 0;
+	}
+
+	return q;
+}
+
+/* Push a job into the per-priority list matching its task priority,
+ * waking both globally and locally blocked workers.  Always returns 0. */
+int priority_push_task(struct jobq_s *q, job_t j)
+{
+	STARPU_ASSERT(q);
+	struct priority_jobq_s *queue = q->queue;
+
+	/* if anyone is blocked on the entire machine, wake it up */
+	pthread_mutex_lock(sched_mutex);
+	pthread_cond_signal(sched_cond);
+	pthread_mutex_unlock(sched_mutex);
+
+	/* wake people waiting locally */
+	pthread_mutex_lock(&q->activity_mutex);
+
+	TRACE_JOB_PUSH(j, 1);
+	
+	/* a priority outside [MIN_PRIO, MAX_PRIO] would index outside the
+	 * jobq array (the unsigned subtraction wraps for lower values) */
+	STARPU_ASSERT(j->task->priority >= MIN_PRIO && j->task->priority <= MAX_PRIO);
+	unsigned priolevel = j->task->priority - MIN_PRIO;
+
+	job_list_push_front(queue->jobq[priolevel], j);
+	queue->njobs[priolevel]++;
+	queue->total_njobs++;
+
+	pthread_cond_signal(&q->activity_cond);
+	pthread_mutex_unlock(&q->activity_mutex);
+
+	return 0;
+}
+
+/* Pop the oldest job of the highest non-empty priority level, waiting
+ * at most once on the activity condition if the queue is empty.  As in
+ * the other mechanisms the wait is an `if', not a `while', so a signal
+ * with no work yields NULL. */
+job_t priority_pop_task(struct jobq_s *q)
+{
+	job_t j = NULL;
+
+	STARPU_ASSERT(q);
+	struct priority_jobq_s *queue = q->queue;
+
+	/* block until some event happens */
+	pthread_mutex_lock(&q->activity_mutex);
+
+	if (queue->total_njobs == 0)
+		 pthread_cond_wait(&q->activity_cond, &q->activity_mutex);
+
+	if (queue->total_njobs > 0)
+	{
+		/* scan from the highest priority level downwards */
+		unsigned priolevel = NPRIO_LEVELS - 1;
+		do {
+			if (queue->njobs[priolevel] > 0) {
+				/* there is some task that we can grab */
+				j = job_list_pop_back(queue->jobq[priolevel]);
+				queue->njobs[priolevel]--;
+				queue->total_njobs--;
+				TRACE_JOB_POP(j, 0);
+			}
+		} while (!j && priolevel-- > 0);
+	}
+
+	pthread_mutex_unlock(&q->activity_mutex);
+
+	return j;
+}

+ 43 - 0
src/core/mechanisms/priority_queues.h

@@ -0,0 +1,43 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __PRIORITY_QUEUES_H__
+#define __PRIORITY_QUEUES_H__
+
+/* valid task priorities span [MIN_PRIO, MAX_PRIO] inclusive */
+#define MIN_PRIO	(-4)
+#define MAX_PRIO	5
+
+#define NPRIO_LEVELS	((MAX_PRIO) - (MIN_PRIO) + 1)
+
+#include <core/mechanisms/queues.h>
+
+/* centralized queue with one job list per priority level */
+struct priority_jobq_s {
+	/* the actual lists 
+	 *	jobq[p] is for priority [p - MIN_PRIO] */
+	job_list_t jobq[NPRIO_LEVELS];
+	unsigned njobs[NPRIO_LEVELS];
+
+	unsigned total_njobs;
+};
+
+struct jobq_s *create_priority_jobq(void);
+void init_priority_queues_mechanisms(void);
+
+int priority_push_task(struct jobq_s *q, job_t task);
+
+job_t priority_pop_task(struct jobq_s *q);
+
+#endif // __PRIORITY_QUEUES_H__

+ 80 - 0
src/core/mechanisms/queues.c

@@ -0,0 +1,80 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "queues.h"
+
+/*
+ * There can be various queue designs
+ * 	- trivial single list
+ * 	- cilk-like 
+ * 	- hierarchical (marcel-like)
+ */
+
+/* Initialise the queueing mechanism (init_queue_design) then create one
+ * queue per worker with func_init_queue, filling in the per-queue
+ * capability mask, performance alpha and memory node attachment. */
+void setup_queues(void (*init_queue_design)(void),
+		  struct jobq_s *(*func_init_queue)(void), 
+		  struct machine_config_s *config) 
+{
+	unsigned worker;
+
+	init_queue_design();
+
+	for (worker = 0; worker < config->nworkers; worker++)
+	{
+		struct  worker_s *workerarg = &config->workers[worker];
+		
+		workerarg->jobq = func_init_queue();
+
+		/* warning : in case there are multiple workers on the same
+                   queue, we overwrite this value so that it is meaningless
+		 */
+		workerarg->jobq->arch = workerarg->perf_arch;
+
+		switch (workerarg->arch) {
+			case CORE_WORKER:
+				workerarg->jobq->who |= CORE;
+				workerarg->jobq->alpha = CORE_ALPHA;
+				break;
+			case CUDA_WORKER:
+				workerarg->jobq->who |= CUDA|CUBLAS;
+				workerarg->jobq->alpha = CUDA_ALPHA;
+				break;
+			case GORDON_WORKER:
+				workerarg->jobq->who |= GORDON;
+				workerarg->jobq->alpha = GORDON_ALPHA;
+				break;
+			default:
+				/* unknown worker type: configuration bug */
+				STARPU_ASSERT(0);
+		}
+		
+		memory_node_attach_queue(workerarg->jobq, workerarg->memory_node);
+	}
+}
+
+/* this may return NULL for an "anonymous thread" */
+/* this may return NULL for an "anonymous thread" */
+/* Fetch the queue bound to the calling thread via the policy's
+ * thread-specific key (see set_local_queue). */
+struct jobq_s *get_local_queue(void)
+{
+	struct sched_policy_s *policy = get_sched_policy();
+
+	return pthread_getspecific(policy->local_queue_key);
+}
+
+/* XXX how to retrieve policy ? that may be given in the machine config ? */
+/* Bind `jobq' to the calling thread via the policy's thread-specific
+ * key, so that get_local_queue can retrieve it later. */
+void set_local_queue(struct jobq_s *jobq)
+{
+	struct sched_policy_s *policy = get_sched_policy();
+
+	pthread_setspecific(policy->local_queue_key, jobq);
+}

+ 72 - 0
src/core/mechanisms/queues.h

@@ -0,0 +1,72 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#ifndef __QUEUES_H__
#define __QUEUES_H__

#include <pthread.h>

#include <core/jobs.h>
#include <core/policies/sched_policy.h>

/* NOTE(review): using a forward-declared (incomplete) enum as a struct
 * member below is not valid ISO C; this presumably relies on the full
 * definition being visible through the includes above -- confirm. */
enum starpu_perf_archtype;

/* Generic job queue: the actual container and its manipulation methods are
 * supplied by the selected queueing mechanism (fifo, deque, stack, ...). */
struct jobq_s {
	/* a pointer to some queue structure */
	void *queue; 

	/* some methods to manipulate the previous queue */
	int (*push_task)(struct jobq_s *, job_t);
	int (*push_prio_task)(struct jobq_s *, job_t);
	struct job_s* (*pop_task)(struct jobq_s *);

	/* returns the number of tasks that were retrieved 
 	 * the function is responsible for allocating the output but the driver
 	 * has to free it 
 	 *
 	 * NB : this function is non blocking
 	 * */
	struct job_list_s *(*pop_every_task)(struct jobq_s *);

	/* what are the driver that may pop job from that queue ? */
	uint32_t who;

	/* this is only relevant if there is a single worker per queue */
	uint32_t memory_node;
	enum starpu_perf_archtype arch;
	float alpha;

	/* for performance analysis purpose */
	double total_computation_time;
	double total_communication_time;

	/* in case workers are blocked on the queue, signaling on that 
	  condition must unblock them, even if there is no available task */
	pthread_cond_t activity_cond;
	pthread_mutex_t activity_mutex;
};

struct machine_config_s;

/* create a queue for every worker and attach it to its memory node */
void setup_queues(void (*init_queue_design)(void),
                  struct jobq_s *(*func_init_queue)(void),
                  struct machine_config_s *config);

/* per-thread queue accessors (may return NULL for an anonymous thread) */
struct jobq_s *get_local_queue(void);
void set_local_queue(struct jobq_s *jobq);


#endif // __QUEUES_H__

+ 228 - 0
src/core/mechanisms/stack_queues.c

@@ -0,0 +1,228 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <core/mechanisms/stack_queues.h>
+#include <errno.h>
+
+/* keep track of the total number of jobs to be scheduled to avoid infinite 
+ * polling when there are really few jobs in the overall queue */
+static unsigned total_number_of_jobs;
+
+static pthread_cond_t *sched_cond;
+static pthread_mutex_t *sched_mutex;
+
/* One-time global initialization of the stack mechanism: reset the global
 * job counter and cache the policy-wide activity condition/mutex. */
void init_stack_queues_mechanisms(void)
{
	total_number_of_jobs = 0;

	struct sched_policy_s *sched = get_sched_policy();

	/* to access them more easily, we keep their address in local variables */
	sched_cond = &sched->sched_activity_cond;
	sched_mutex = &sched->sched_activity_mutex;
}
+
+struct jobq_s *create_stack(void)
+{
+	struct jobq_s *jobq;
+	jobq = malloc(sizeof(struct jobq_s));
+
+	struct stack_jobq_s *stack;
+	stack = malloc(sizeof(struct stack_jobq_s));
+
+	pthread_mutex_init(&jobq->activity_mutex, NULL);
+	pthread_cond_init(&jobq->activity_cond, NULL);
+
+	/* note that not all mechanisms (eg. the semaphore) have to be used */
+	stack->jobq = job_list_new();
+	stack->njobs = 0;
+	stack->nprocessed = 0;
+
+	stack->exp_start = timing_now()/1000000;
+	stack->exp_len = 0.0;
+	stack->exp_end = stack->exp_start;
+
+	jobq->queue = stack;
+
+	return jobq;
+}
+
/* Number of jobs currently queued over ALL stack queues (read without the
 * scheduler mutex: the value is only a hint to avoid useless polling). */
unsigned get_total_njobs_stacks(void)
{
	return total_number_of_jobs;
}
+
/* Number of jobs currently sitting in this particular stack queue. */
unsigned get_stack_njobs(struct jobq_s *q)
{
	STARPU_ASSERT(q);

	struct stack_jobq_s *stack_queue = q->queue;

	return stack_queue->njobs;
}
+
/* Total number of jobs that were ever pushed onto this stack queue. */
unsigned get_stack_nprocessed(struct jobq_s *q)
{
	STARPU_ASSERT(q);

	struct stack_jobq_s *stack_queue = q->queue;

	return stack_queue->nprocessed;
}
+
+void stack_push_prio_task(struct jobq_s *q, job_t task)
+{
+#ifndef NO_PRIO
+	STARPU_ASSERT(q);
+	struct stack_jobq_s *stack_queue = q->queue;
+
+	/* if anyone is blocked on the entire machine, wake it up */
+	pthread_mutex_lock(sched_mutex);
+	total_number_of_jobs++;
+	pthread_cond_signal(sched_cond);
+	pthread_mutex_unlock(sched_mutex);
+
+	/* wake people waiting locally */
+	pthread_mutex_lock(&q->activity_mutex);
+
+	TRACE_JOB_PUSH(task, 0);
+	job_list_push_back(stack_queue->jobq, task);
+	deque_queue->njobs++;
+	deque_queue->nprocessed++;
+
+	pthread_cond_signal(&q->activity_cond);
+	pthread_mutex_unlock(&q->activity_mutex);
+#else
+	stack_push_task(q, task);
+#endif
+}
+
+void stack_push_task(struct jobq_s *q, job_t task)
+{
+	STARPU_ASSERT(q);
+	struct stack_jobq_s *stack_queue = q->queue;
+
+	/* if anyone is blocked on the entire machine, wake it up */
+	pthread_mutex_lock(sched_mutex);
+	total_number_of_jobs++;
+	pthread_cond_signal(sched_cond);
+	pthread_mutex_unlock(sched_mutex);
+
+	/* wake people waiting locally */
+	pthread_mutex_lock(&q->activity_mutex);
+
+	TRACE_JOB_PUSH(task, 0);
+	job_list_push_front(stack_queue->jobq, task);
+	deque_queue->njobs++;
+	deque_queue->nprocessed++;
+
+	pthread_cond_signal(&q->activity_cond);
+	pthread_mutex_unlock(&q->activity_mutex);
+}
+
/* Pop a task from the back of the list, waiting once on the activity
 * condition if the queue is empty.
 * NOTE(review): the wait is guarded by "if" rather than "while", so a
 * spurious or unrelated wakeup makes this return NULL even though the
 * function looks blocking -- callers must handle a NULL result. */
job_t stack_pop_task(struct jobq_s *q)
{
	job_t j = NULL;

	STARPU_ASSERT(q);
	struct stack_jobq_s *stack_queue = q->queue;

	/* block until some task is available in that queue */
	pthread_mutex_lock(&q->activity_mutex);

	if (stack_queue->njobs == 0)
		pthread_cond_wait(&q->activity_cond, &q->activity_mutex);

	if (stack_queue->njobs > 0) 
	{
		/* there is a task */
		j = job_list_pop_back(stack_queue->jobq);
	
		STARPU_ASSERT(j);
		stack_queue->njobs--;
		
		TRACE_JOB_POP(j, 0);

		/* we are sure that we got it now, so at worst, some people thought 
		 * there remained some work and will soon discover it is not true */
		pthread_mutex_lock(sched_mutex);
		total_number_of_jobs--;
		pthread_mutex_unlock(sched_mutex);
	}
	
	pthread_mutex_unlock(&q->activity_mutex);

	return j;

}
+
/* for work stealing, typically */
/* Non-blocking pop: take a task from the back of the list if one is
 * immediately available, otherwise return NULL without waiting. */
job_t stack_non_blocking_pop_task(struct jobq_s *q)
{
	job_t j = NULL;

	STARPU_ASSERT(q);
	struct stack_jobq_s *stack_queue = q->queue;

	/* grab the queue lock, but do NOT wait if the queue is empty */
	pthread_mutex_lock(&q->activity_mutex);

	if (stack_queue->njobs > 0) 
	{
		/* there is a task */
		j = job_list_pop_back(stack_queue->jobq);
	
		STARPU_ASSERT(j);
		stack_queue->njobs--;
		
		TRACE_JOB_POP(j, 0);

		/* we are sure that we got it now, so at worst, some people thought 
		 * there remained some work and will soon discover it is not true */
		pthread_mutex_lock(sched_mutex);
		total_number_of_jobs--;
		pthread_mutex_unlock(sched_mutex);
	}
	
	pthread_mutex_unlock(&q->activity_mutex);

	return j;
}
+
/* Non-blocking pop with a global fallback: if this queue is empty AND the
 * whole system has no job, sleep on the policy-wide condition instead of
 * busy polling.  May still return NULL after waking up. */
job_t stack_non_blocking_pop_task_if_job_exists(struct jobq_s *q)
{
	job_t j;

	j = stack_non_blocking_pop_task(q);

	if (!j) {
		/* there is no job at all in the entire system : go to sleep ! */

		/* that wait is not an absolute sign that there is some work 
		 * if there is some, the thread should be awoken, but if there is none 
		 * at the moment it is awoken, it may simply poll a limited number of 
		 * times and just get back to sleep */
		pthread_mutex_lock(sched_mutex);

		if (total_number_of_jobs == 0)
			pthread_cond_wait(sched_cond, sched_mutex);

		pthread_mutex_unlock(sched_mutex);
	}

	return j;
}

+ 55 - 0
src/core/mechanisms/stack_queues.h

@@ -0,0 +1,55 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#ifndef __STACK_QUEUES_H__
#define __STACK_QUEUES_H__

#include <core/mechanisms/queues.h>

/* Concrete container stored behind jobq_s->queue for the stack mechanism. */
struct stack_jobq_s {
	/* the actual list */
	job_list_t jobq;

	/* the number of tasks currently in the queue */
	unsigned njobs;

	/* the number of tasks that were processed */
	unsigned nprocessed;

	/* only meaningful if the queue is only used by a single worker */
	double exp_start;
	double exp_end;
	double exp_len;
};

/* allocate a new (empty) stack-backed queue */
struct jobq_s *create_stack(void);

/* push to the front of the list */
void stack_push_task(struct jobq_s *q, job_t task);

/* push to the back of the list (served first by stack_pop_task) */
void stack_push_prio_task(struct jobq_s *q, job_t task);

/* pop from the back; the non-blocking variants return NULL when empty */
job_t stack_pop_task(struct jobq_s *q);
job_t stack_non_blocking_pop_task(struct jobq_s *q);
job_t stack_non_blocking_pop_task_if_job_exists(struct jobq_s *q);

/* one-time global initialization of the mechanism */
void init_stack_queues_mechanisms(void);


unsigned get_stack_njobs(struct jobq_s *q);
unsigned get_stack_nprocessed(struct jobq_s *q);


#endif // __STACK_QUEUES_H__

+ 137 - 0
src/core/perfmodel/perfmodel.c

@@ -0,0 +1,137 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <unistd.h>
+#include <core/perfmodel/perfmodel.h>
+#include <core/jobs.h>
+#include <core/workers.h>
+#include <datawizard/datawizard.h>
+
+/*
+ * PER ARCH model
+ */
+
/* PER_ARCH model: predicted length of job j on the given architecture, or
 * -1.0 when no per-arch cost model was supplied.  On first use the model is
 * registered and the CALIBRATE environment variable selects benchmarking. */
static double per_arch_job_expected_length(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct job_s *j)
{
	double exp = -1.0;
	double (*per_arch_cost_model)(struct starpu_buffer_descr_t *);
	
	if (!model->is_loaded)
	{
		/* NOTE(review): this check-then-set is not protected by
		 * model->model_mutex (unlike load_history_based_model), so
		 * concurrent first uses could register the model twice */
		if (starpu_get_env_number("CALIBRATE") != -1)
		{
			fprintf(stderr, "CALIBRATE model %s\n", model->symbol);
			model->benchmarking = 1;
		}
		else {
			model->benchmarking = 0;
		}
		
		register_model(model);
		model->is_loaded = 1;
	}

	per_arch_cost_model = model->per_arch[arch].cost_model;

	if (per_arch_cost_model)
		exp = per_arch_cost_model(j->task->buffers);

	return exp;
}
+
+/*
+ * Common model
+ */
+
/* COMMON model: one architecture-independent cost model, scaled down by the
 * relative speed factor (alpha) of the worker kind that would run the job.
 * Returns -1.0 when no cost model was supplied. */
static double common_job_expected_length(struct starpu_perfmodel_t *model, uint32_t who, struct job_s *j)
{
	double exp;

	if (model->cost_model) {
		float alpha;
		exp = model->cost_model(j->task->buffers);
		switch (who) {
			case CORE:
				alpha = CORE_ALPHA;
				break;
			case CUDA:
				alpha = CUDA_ALPHA;
				break;
			default:
				/* perhaps there are various worker types on that queue */
				alpha = 1.0; // this value is not significant ...
				break;
		}

		STARPU_ASSERT(alpha != 0.0f);

		return (exp/alpha);
	}

	return -1.0;
}
+
/* Dispatch to the proper prediction routine for the model type attached to
 * the job's codelet.  Returns -1.0 when the chosen model cannot predict,
 * and 0.0 when the codelet has no model at all. */
double job_expected_length(uint32_t who, struct job_s *j, enum starpu_perf_archtype arch)
{
	struct starpu_perfmodel_t *model = j->task->cl->model;

	if (model) {
		switch (model->type) {
			case PER_ARCH:
				return per_arch_job_expected_length(model, arch, j);

			case COMMON:
				return common_job_expected_length(model, who, j);

			case HISTORY_BASED:
				return history_based_job_expected_length(model, arch, j);

			case REGRESSION_BASED:
				return regression_based_job_expected_length(model, arch, j);

			default:
				/* unknown model type */
				STARPU_ASSERT(0);
		};
	}

	/* no model was found */
	return 0.0;
}
+
+
/* Data transfer performance modeling */
/* Estimate the data-transfer penalty of running job j on queue q: a fixed
 * placeholder cost is charged for every buffer that is neither present nor
 * already requested on the queue's memory node. */
double data_expected_penalty(struct jobq_s *q, struct job_s *j)
{
	uint32_t memory_node = q->memory_node;
	unsigned nbuffers = j->task->cl->nbuffers;
	unsigned buffer;

	double penalty = 0.0;

	for (buffer = 0; buffer < nbuffers; buffer++)
	{
		data_state *state = j->task->buffers[buffer].state;

		if (!is_data_present_or_requested(state, memory_node))
		{
			/* TODO: replace this arbitrary constant with a real
			 * transfer-time estimate */
			penalty += 1000.0;
		}
	}

	return penalty;
}
+

+ 95 - 0
src/core/perfmodel/perfmodel.h

@@ -0,0 +1,95 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __PERFMODEL_H__
+#define __PERFMODEL_H__
+
+#include <common/config.h>
+#include <starpu-perfmodel.h>
+//#include <core/jobs.h>
+#include <common/htable32.h>
+//#include <core/workers.h>
+#include <starpu-mutex.h>
+#include <stdio.h>
+
+struct starpu_buffer_descr_t;
+struct jobq_s;
+struct job_s;
+enum archtype;
+enum starpu_perf_archtype;
+
+struct starpu_history_entry_t {
+	//double measured;
+	
+	/* mean_n = 1/n sum */
+	double mean;
+
+	/* n dev_n = sum2 - 1/n (sum)^2 */
+	double deviation;
+
+	/* sum of samples */
+	double sum;
+
+	/* sum of samples^2 */
+	double sum2;
+
+//	/* sum of ln(measured) */
+//	double sumlny;
+//
+//	/* sum of ln(size) */
+//	double sumlnx;
+//	double sumlnx2;
+//
+//	/* sum of ln(size) ln(measured) */
+//	double sumlnxlny;
+//
+	unsigned nsample;
+
+	uint32_t footprint;
+	size_t size; /* in bytes */
+};
+
+struct starpu_history_list_t {
+	struct starpu_history_list_t *next;
+	struct starpu_history_entry_t *entry;
+};
+
+struct starpu_model_list_t {
+	struct starpu_model_list_t *next;
+	struct starpu_perfmodel_t *model;
+};
+
+//
+///* File format */
+//struct model_file_format {
+//	unsigned ncore_entries;
+//	unsigned ncuda_entries;
+//	/* contains core entries, then cuda ones */
+//	struct starpu_history_entry_t entries[];
+//}
+
+double history_based_job_expected_length(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct job_s *j);
+void register_model(struct starpu_perfmodel_t *model);
+void dump_registered_models(void);
+
+double job_expected_length(uint32_t who, struct job_s *j, enum starpu_perf_archtype arch);
+double regression_based_job_expected_length(struct starpu_perfmodel_t *model,
+		uint32_t who, struct job_s *j);
+void update_perfmodel_history(struct job_s *j, enum starpu_perf_archtype arch, double measured);
+
+double data_expected_penalty(struct jobq_s *q, struct job_s *j);
+
+#endif // __PERFMODEL_H__

+ 513 - 0
src/core/perfmodel/perfmodel_history.c

@@ -0,0 +1,513 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <unistd.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <errno.h>
+#include <core/perfmodel/perfmodel.h>
+#include <core/jobs.h>
+#include <core/workers.h>
+#include <starpu-mutex.h>
+#include <datawizard/datawizard.h>
+#include <core/perfmodel/regression.h>
+#include <common/config.h>
+
+/*
+ * History based model
+ */
+
+
/* Record a new history entry both in the per-arch list (for dumping to
 * disk) and in the per-arch hash table (for fast lookup by footprint).
 * Caller must hold the model mutex. */
static void insert_history_entry(struct starpu_history_entry_t *entry, struct starpu_history_list_t **list, struct starpu_htbl32_node_s **history_ptr)
{
	struct starpu_history_list_t *link;
	struct starpu_history_entry_t *old;

	link = malloc(sizeof(struct starpu_history_list_t));
	link->next = *list;
	link->entry = entry;
	*list = link;

	old = htbl_insert_32(history_ptr, entry->footprint, entry);
	/* that may fail in case there is some concurrency issue */
	STARPU_ASSERT(old == NULL);
}
+
+
/* Serialize the linear-regression accumulators on one line of the model
 * file (must stay in sync with scan_reg_model below). */
static void dump_reg_model(FILE *f, struct starpu_regression_model_t *reg_model)
{
	fprintf(f, "%le\t%le\t%le\t%le\t%le\t%le\t%d\n", reg_model->sumlnx, reg_model->sumlnx2, reg_model->sumlny, reg_model->sumlnxlny, reg_model->alpha, reg_model->beta, reg_model->nsample);
}
+
/* Parse one regression line written by dump_reg_model.
 * NOTE(review): nsample is read with %d -- confirm its declared type in
 * regression.h is int (the history entries use unsigned). */
static void scan_reg_model(FILE *f, struct starpu_regression_model_t *reg_model)
{
	int res;

	res = fscanf(f, "%le\t%le\t%le\t%le\t%le\t%le\t%d\n", &reg_model->sumlnx, &reg_model->sumlnx2, &reg_model->sumlny, &reg_model->sumlnxlny, &reg_model->alpha, &reg_model->beta, &reg_model->nsample);
	STARPU_ASSERT(res == 7);
}
+
+
+static void dump_history_entry(FILE *f, struct starpu_history_entry_t *entry)
+{
+	fprintf(f, "%x\t%zu\t%le\t%le\t%le\t%le\t%d\n", entry->footprint, entry->size, entry->mean, entry->deviation, entry->sum, entry->sum2, entry->nsample);
+}
+
+static void scan_history_entry(FILE *f, struct starpu_history_entry_t *entry)
+{
+	int res;
+
+	res = fscanf(f, "%x\t%zu\t%le\t%le\t%le\t%le\t%d\n", &entry->footprint, &entry->size, &entry->mean, &entry->deviation, &entry->sum, &entry->sum2, &entry->nsample);
+	STARPU_ASSERT(res == 7);
+}
+
+static void parse_per_arch_model_file(FILE *f, struct starpu_per_arch_perfmodel_t *per_arch_model, unsigned scan_history)
+{
+	unsigned nentries;
+
+	int res = fscanf(f, "%d\n", &nentries);
+	STARPU_ASSERT(res == 1);
+
+	scan_reg_model(f, &per_arch_model->regression);
+
+	res = fscanf(f, "%le\t%le\t%le\n", 
+		&per_arch_model->regression.a,
+		&per_arch_model->regression.b,
+		&per_arch_model->regression.c);
+	STARPU_ASSERT(res == 3);
+
+	if (isnan(per_arch_model->regression.a)||isnan(per_arch_model->regression.b)||isnan(per_arch_model->regression.c))
+	{
+		per_arch_model->regression.valid = 0;
+	}
+	else {
+		per_arch_model->regression.valid = 1;
+	}
+
+	if (!scan_history)
+		return;
+
+	/* parse core entries */
+	unsigned i;
+	for (i = 0; i < nentries; i++) {
+		struct starpu_history_entry_t *entry = malloc(sizeof(struct starpu_history_entry_t));
+		STARPU_ASSERT(entry);
+
+		scan_history_entry(f, entry);
+		
+		/* insert the entry in the hashtable and the list structures  */
+		insert_history_entry(entry, &per_arch_model->list, &per_arch_model->history);
+	}
+}
+
/* A model file contains one section per default architecture: core first,
 * then cuda (order must match dump_model_file). */
static void parse_model_file(FILE *f, struct starpu_perfmodel_t *model, unsigned scan_history)
{
	parse_per_arch_model_file(f, &model->per_arch[STARPU_CORE_DEFAULT], scan_history);
	parse_per_arch_model_file(f, &model->per_arch[STARPU_CUDA_DEFAULT], scan_history);
}
+
+static void dump_per_arch_model_file(FILE *f, struct starpu_per_arch_perfmodel_t *per_arch_model)
+{
+	/* count the number of elements in the lists */
+	struct starpu_history_list_t *ptr;
+	unsigned nentries = 0;
+
+	ptr = per_arch_model->list;
+	while(ptr) {
+		nentries++;
+		ptr = ptr->next;
+	}
+
+	/* header */
+	fprintf(f, "%d\n", nentries);
+
+	dump_reg_model(f, &per_arch_model->regression);
+
+	double a,b,c;
+	regression_non_linear_power(per_arch_model->list, &a, &b, &c);
+	fprintf(f, "%le\t%le\t%le\n", a, b, c);
+
+	ptr = per_arch_model->list;
+	while (ptr) {
+		//memcpy(&entries_array[i++], ptr->entry, sizeof(struct starpu_history_entry_t));
+		dump_history_entry(f, ptr->entry);
+		ptr = ptr->next;
+	}
+}
+
/* Dump both default architectures, in the same order parse_model_file
 * expects them back: core first, then cuda. */
static void dump_model_file(FILE *f, struct starpu_perfmodel_t *model)
{
	dump_per_arch_model_file(f, &model->per_arch[STARPU_CORE_DEFAULT]);
	dump_per_arch_model_file(f, &model->per_arch[STARPU_CUDA_DEFAULT]);
}
+
/* Empty per-arch model: no hash table, no history list yet. */
static void initialize_per_arch_model(struct starpu_per_arch_perfmodel_t *per_arch_model)
{
	per_arch_model->history = NULL;
	per_arch_model->list = NULL;
}
+
/* Fresh model with no on-disk data: initialize both default arch slots. */
static void initialize_model(struct starpu_perfmodel_t *model)
{
	initialize_per_arch_model(&model->per_arch[STARPU_CORE_DEFAULT]);
	initialize_per_arch_model(&model->per_arch[STARPU_CUDA_DEFAULT]);
}
+
+static struct starpu_model_list_t *registered_models = NULL;
+//static unsigned debug_modelid = 0;
+
#ifdef MODEL_DEBUG
/* Build "$PERF_MODEL_DIR<symbol>.<hostname>.<arch>.debug" into path.
 * bug fix: the original strncat calls passed the TOTAL buffer size as the
 * limit instead of the remaining space, which can overflow path; snprintf
 * bounds (and NUL-terminates) the whole string at once. */
static void get_model_debug_path(struct starpu_perfmodel_t *model, const char *arch, char *path, size_t maxlen)
{
	char hostname[32];
	gethostname(hostname, sizeof(hostname));
	/* gethostname does not guarantee NUL-termination on truncation */
	hostname[sizeof(hostname) - 1] = '\0';

	snprintf(path, maxlen, "%s%s.%s.%s.debug",
		 PERF_MODEL_DIR, model->symbol, hostname, arch);
}
#endif
+
+
/* Remember a model in the global registered_models list so it can be dumped
 * back to disk at termination (dump_registered_models); with MODEL_DEBUG,
 * also open the per-arch debug trace files.
 * NOTE(review): registered_models is updated without a lock -- callers
 * currently serialize through model->model_mutex; confirm. */
void register_model(struct starpu_perfmodel_t *model)
{
	/* add the model to a linked list */
	struct starpu_model_list_t *node = malloc(sizeof(struct starpu_model_list_t));

	node->model = model;
	//model->debug_modelid = debug_modelid++;

	/* put this model at the beginning of the list */
	node->next = registered_models;
	registered_models = node;

#ifdef MODEL_DEBUG
	char debugpath[256];
	get_model_debug_path(model, "cuda", debugpath, 256);
	model->per_arch[STARPU_CUDA_DEFAULT].debug_file = fopen(debugpath, "a+");
	STARPU_ASSERT(model->per_arch[STARPU_CUDA_DEFAULT].debug_file);

	get_model_debug_path(model, "core", debugpath, 256);
	model->per_arch[STARPU_CORE_DEFAULT].debug_file = fopen(debugpath, "a+");
	STARPU_ASSERT(model->per_arch[STARPU_CORE_DEFAULT].debug_file);
#endif

	return;
}
+
+static void get_model_path(struct starpu_perfmodel_t *model, char *path, size_t maxlen)
+{
+	strncpy(path, PERF_MODEL_DIR, maxlen);
+	strncat(path, model->symbol, maxlen);
+	
+	char hostname[32];
+	gethostname(hostname, 32);
+	strncat(path, ".", maxlen);
+	strncat(path, hostname, maxlen);
+}
+
+void save_history_based_model(struct starpu_perfmodel_t *model)
+{
+	STARPU_ASSERT(model);
+	STARPU_ASSERT(model->symbol);
+
+	/* TODO checks */
+
+	/* filename = $PERF_MODEL_DIR/symbol.hostname */
+	char path[256];
+	get_model_path(model, path, 256);
+
+#ifdef VERBOSE
+	fprintf(stderr, "Opening performance model file %s for model %s\n", path, model->symbol);
+#endif
+
+	/* overwrite existing file, or create it */
+	FILE *f;
+	f = fopen(path, "w+");
+	STARPU_ASSERT(f);
+
+	dump_model_file(f, model);
+
+	fclose(f);
+
+#ifdef DEBUG_MODEL
+	fclose(model->cuda_debug_file);
+	fclose(model->core_debug_file);
+#endif
+}
+
+void dump_registered_models(void)
+{
+	struct starpu_model_list_t *node;
+	node = registered_models;
+
+#ifdef VERBOSE
+	fprintf(stderr, "DUMP MODELS !\n");
+#endif
+
+	while (node) {
+		save_history_based_model(node->model);		
+		node = node->next;
+
+		/* XXX free node */
+	}
+}
+
/* set once the sampling directory is known to exist (not thread-safe by
 * itself; guarded by model->model_mutex in load_history_based_model) */
static int directory_existence_was_tested = 0;

/* Create $PERF_MODEL_DIR (owner-only permissions) unless it already exists. */
static void create_sampling_directory_if_needed(void)
{
	/* Testing if a directory exists and creating it otherwise 
	   may not be safe: it is possible that the permission are
	   changed in between. Instead, we create it and check if
	   it already existed before */
	int ret;
	ret = mkdir(PERF_MODEL_DIR, S_IRWXU);
	if (ret == -1)
	{
		/* any failure other than "already exists" is fatal */
		STARPU_ASSERT(errno == EEXIST);

		/* make sure that it is actually a directory */
		struct stat sb;
		stat(PERF_MODEL_DIR, &sb);
		STARPU_ASSERT(S_ISDIR(sb.st_mode));
	}
}
+
/* Load a model from disk (or initialize an empty one), register it for the
 * termination dump, and decide whether we are calibrating.  Serialized on
 * model->model_mutex; safe to call concurrently, only the first caller does
 * the actual work.  scan_history selects whether the individual history
 * entries are parsed (HISTORY_BASED) or only the regression coefficients
 * (REGRESSION_BASED). */
void load_history_based_model(struct starpu_perfmodel_t *model, unsigned scan_history)
{
	STARPU_ASSERT(model);
	STARPU_ASSERT(model->symbol);

	/* XXX we assume the lock is implicitely initialized (taken = 0) */
	//init_mutex(&model->model_mutex);
	take_mutex(&model->model_mutex);

	/* perhaps some other thread got in before ... */
	if (!model->is_loaded)
	{
		/* make sure the performance model directory exists (or create it) */
		if (!directory_existence_was_tested)
		{
			create_sampling_directory_if_needed();
			directory_existence_was_tested = 1;
		}

		/*
		 * We need to keep track of all the model that were opened so that we can 
		 * possibly update them at runtime termination ...
		 */
		register_model(model);
	
		char path[256];
		get_model_path(model, path, 256);
	
#ifdef VERBOSE
		fprintf(stderr, "Opening performance model file %s for model %s\n", path, model->symbol);
#endif
	
		/* try to open an existing file and load it */
		int res;
		res = access(path, F_OK); 
		if (res == 0) {
		//	fprintf(stderr, "File exists !\n");
	
			FILE *f;
			f = fopen(path, "r");
			STARPU_ASSERT(f);
	
			parse_model_file(f, model, scan_history);
	
			fclose(f);
		}
		else {
			//fprintf(stderr, "File does not exists !\n");
			initialize_model(model);
		}
	
	
		/* CALIBRATE in the environment means we are benchmarking */
		if (starpu_get_env_number("CALIBRATE") != -1)
		{
			fprintf(stderr, "CALIBRATE model %s\n", model->symbol);
			model->benchmarking = 1;
		}
		else {
			model->benchmarking = 0;
		}
	
		model->is_loaded = 1;
	}

	release_mutex(&model->model_mutex);
}
+
/* REGRESSION_BASED model: predict the length of job j from the non-linear
 * power law a*size^b + c fitted for the given architecture; returns -1.0
 * when the coefficients are not (yet) valid. */
double regression_based_job_expected_length(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct job_s *j)
{
	double exp = -1.0;
	size_t size = job_get_data_size(j);
	struct starpu_regression_model_t *regmodel;

	/* lazy load: no need for the individual history entries here */
	if (!model->is_loaded)
		load_history_based_model(model, 0);

	regmodel = &model->per_arch[arch].regression;

	if (regmodel->valid)
		exp = regmodel->a*pow(size, regmodel->b) + regmodel->c;

	return exp;
}
+
/* HISTORY_BASED model: look up the mean measured time of past executions
 * with the same buffer footprint; returns -1.0 when no history is
 * available for that footprint yet. */
double history_based_job_expected_length(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct job_s *j)
{
	double exp;
	struct starpu_per_arch_perfmodel_t *per_arch_model;
	struct starpu_history_entry_t *entry;
	struct starpu_htbl32_node_s *history;

	/* lazy load, including the per-footprint history entries */
	if (!model->is_loaded)
		load_history_based_model(model, 1);

	if (!j->footprint_is_computed)
		compute_buffers_footprint(j);
		
	uint32_t key = j->footprint;

	per_arch_model = &model->per_arch[arch];

	history = per_arch_model->history;
	if (!history)
		return -1.0;

	/* the hash table is mutated by update_perfmodel_history: lock it */
	take_mutex(&model->model_mutex);
	entry = htbl_search_32(history, key);
	release_mutex(&model->model_mutex);

	exp = entry?entry->mean:-1.0;

	return exp;
}
+
/* Feed one measured execution time (for job j on the given architecture)
 * back into the job's model: update or create the history entry for the
 * job's footprint, and refresh the log-log linear-regression accumulators.
 * All state mutations happen under model->model_mutex. */
void update_perfmodel_history(job_t j, enum starpu_perf_archtype arch, double measured)
{
	struct starpu_perfmodel_t *model = j->task->cl->model;

	if (model)
	{
		struct starpu_per_arch_perfmodel_t *per_arch_model = &model->per_arch[arch];

		if (model->type == HISTORY_BASED || model->type == REGRESSION_BASED)
		{
			/* NOTE(review): assumes j->footprint was already computed
			 * (cf. compute_buffers_footprint) -- confirm callers */
			uint32_t key = j->footprint;
			struct starpu_history_entry_t *entry;

			struct starpu_htbl32_node_s *history;
			struct starpu_htbl32_node_s **history_ptr;
			struct starpu_regression_model_t *reg_model;

			struct starpu_history_list_t **list;


			history = per_arch_model->history;
			history_ptr = &per_arch_model->history;
			reg_model = &per_arch_model->regression;
			list = &per_arch_model->list;

			take_mutex(&model->model_mutex);
	
				entry = htbl_search_32(history, key);
	
				if (!entry)
				{
					/* this is the first entry with such a footprint */
					entry = malloc(sizeof(struct starpu_history_entry_t));
					STARPU_ASSERT(entry);
						entry->mean = measured;
						entry->sum = measured;
	
						entry->deviation = 0.0;
						entry->sum2 = measured*measured;
	
						entry->size = job_get_data_size(j);
	
						entry->footprint = key;
						entry->nsample = 1;
	
					insert_history_entry(entry, list, history_ptr);
	
				}
				else {
					/* there is already some entry with the same footprint */
					entry->sum += measured;
					entry->sum2 += measured*measured;
					entry->nsample++;
	
					/* running mean and (population) standard deviation */
					unsigned n = entry->nsample;
					entry->mean = entry->sum / n;
					entry->deviation = sqrt((entry->sum2 - (entry->sum*entry->sum)/n)/n);
				}
			
				STARPU_ASSERT(entry);
			
			/* update the regression model as well */
			double logy, logx;
			logx = logl(entry->size);
			logy = logl(measured);

			reg_model->sumlnx += logx;
			reg_model->sumlnx2 += logx*logx;
			reg_model->sumlny += logy;
			reg_model->sumlnxlny += logx*logy;
			reg_model->nsample++;

			unsigned n = reg_model->nsample;
			
			/* least-squares fit of ln y = ln alpha + beta ln x */
			double num = (n*reg_model->sumlnxlny - reg_model->sumlnx*reg_model->sumlny);
			double denom = (n*reg_model->sumlnx2 - reg_model->sumlnx*reg_model->sumlnx);

			reg_model->beta = num/denom;
			reg_model->alpha = expl((reg_model->sumlny - reg_model->beta*reg_model->sumlnx)/n);
			
			release_mutex(&model->model_mutex);
		}

#ifdef MODEL_DEBUG
		FILE * debug_file = per_arch_model->debug_file;

		take_mutex(&model->model_mutex);

		fprintf(debug_file, "%lf\t", measured);
		unsigned i;
			
		/* NOTE(review): uses j->nbuffers/j->buffers while the rest of the
		 * file goes through j->task -- confirm these fields exist */
		for (i = 0; i < j->nbuffers; i++)
		{
			data_state *state = j->buffers[i].state;

			STARPU_ASSERT(state->ops);
			STARPU_ASSERT(state->ops->display);
			state->ops->display(state, debug_file);
		}
		fprintf(debug_file, "\n");	


		release_mutex(&model->model_mutex);
#endif
	}
}

+ 225 - 0
src/core/perfmodel/regression.c

@@ -0,0 +1,225 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/perfmodel/regression.h>
+
+#define MAXREGITER	1000
+#define EPS 1.0e-10
+
+//#define MIN(a,b) ((a)<(b)?(a):(b))
+
+static double compute_b(double c, unsigned n, unsigned *x, double *y)
+{
+	double b;
+
+	/* X = log (x) , Y = log (y - c) */
+	double sumxy = 0.0;
+	double sumx = 0.0;
+	double sumx2 = 0.0;
+	double sumy = 0.0;
+
+	unsigned i;
+	for (i = 0; i < n; i++)
+	{
+		double xi = logl(x[i]);
+		double yi = logl(y[i]-c);
+
+		sumxy += xi*yi;
+		sumx += xi;
+		sumx2 += xi*xi;
+		sumy += yi;
+	}
+
+	b = (n * sumxy - sumx * sumy) / (n*sumx2 - sumx*sumx);
+
+	return b;
+}
+
+static double compute_a(double c, double b, unsigned n, unsigned *x, double *y)
+{
+	double a;
+
+	/* X = log (x) , Y = log (y - c) */
+	double sumx = 0.0;
+	double sumy = 0.0;
+
+	unsigned i;
+	for (i = 0; i < n; i++)
+	{
+		double xi = logl(x[i]);
+		double yi = logl(y[i]-c);
+
+		sumx += xi;
+		sumy += yi;
+	}
+
+	a = (sumy - b*sumx) / n;
+
+	return a;
+}
+
+
+
+/* returns r */
+static double test_r(double c, unsigned n, unsigned *x, double *y)
+{
+	double r;
+
+//	printf("test c = %e\n", c);
+
+	/* X = log (x) , Y = log (y - c) */
+	double sumxy = 0.0;
+	double sumx = 0.0;
+	double sumx2 = 0.0;
+	double sumy = 0.0;
+	double sumy2 = 0.0;
+
+	unsigned i;
+	for (i = 0; i < n; i++)
+	{
+		double xi = logl(x[i]);
+		double yi = logl(y[i]-c);
+
+	//	printf("Xi = %e, Yi = %e\n", xi, yi);
+
+		sumxy += xi*yi;
+		sumx += xi;
+		sumx2 += xi*xi;
+		sumy += yi;
+		sumy2 += yi*yi;
+	}
+
+	//printf("sumxy %e\n", sumxy);
+	//printf("sumx %e\n", sumx);
+	//printf("sumx2 %e\n", sumx2);
+	//printf("sumy %e\n", sumy);
+	//printf("sumy2 %e\n", sumy2);
+
+	r = (n * sumxy - sumx * sumy) / sqrt( (n* sumx2 - sumx*sumx) * (n*sumy2 - sumy*sumy) );
+
+	return r;
+}
+
+static unsigned find_list_size(struct starpu_history_list_t *list_history)
+{
+	unsigned cnt = 0;
+
+	struct starpu_history_list_t *ptr = list_history;
+	while (ptr) {
+		cnt++;
+		ptr = ptr->next;
+	}
+
+	return cnt;
+}
+
/* Return the smallest value in y[0..n-1]; for n == 0 the sentinel
 * 1.0e30 is returned unchanged. */
static double find_list_min(double *y, unsigned n)
{
	double best = 1.0e30;

	unsigned i;
	for (i = 0; i < n; i++)
	{
		if (y[i] < best)
			best = y[i];
	}

	return best;
}
+
+static void dump_list(unsigned *x, double *y, struct starpu_history_list_t *list_history)
+{
+	struct starpu_history_list_t *ptr = list_history;
+	unsigned i = 0;
+
+	while (ptr) {
+		x[i] = ptr->entry->size;
+		y[i] = ptr->entry->mean;
+
+		ptr = ptr->next;
+		i++;
+	}
+}
+
+
/* y = ax^b + c 
 * 	return 0 if success, -1 otherwise
 * 	if success, a, b and c are modified
 * */
/* Fit the non-linear model y = a*x^b + c to the (size, mean) samples of a
 * history list.  The constant c is found by bisection: for each candidate
 * c, test_r() measures how linear (log x, log(y-c)) is; the half-interval
 * whose probe gives the better correlation is kept.  Once c converges,
 * b and a follow from an ordinary log-log least-squares fit.
 * NOTE(review): currently always returns 0; callers cannot see a failure. */
int regression_non_linear_power(struct starpu_history_list_t *ptr, double *a, double *b, double *c)
{
	unsigned n = find_list_size(ptr);

	/* scratch arrays holding the flattened list */
	unsigned *x = malloc(n*sizeof(unsigned));
	STARPU_ASSERT(x);

	double *y = malloc(n*sizeof(double));
	STARPU_ASSERT(y);

	dump_list(x, y, ptr);

	/* c must lie in [0, min(y)): y[i] - c has to stay positive for the
	 * logarithms taken by test_r()/compute_b()/compute_a() */
	double cmin = 0.0;
	double cmax = find_list_min(y, n);
	
	unsigned iter;

	/* previous iteration's error; start far above any plausible value */
	double err = 100000.0;

	for (iter = 0; iter < MAXREGITER; iter++)
	{
		double c1, c2;
		double r1, r2;
		
		/* probe slightly below and above the interval midpoint */
		double radius = 0.01;

		c1 = cmin + (0.5-radius)*(cmax - cmin);
		c2 = cmin + (0.5+radius)*(cmax - cmin);

		r1 = test_r(c1, n, x, y);
		r2 = test_r(c2, n, x, y);

		/* distance of each correlation coefficient from the ideal 1.0 */
		double err1, err2;
		err1 = fabsl(1.0 - r1);
		err2 = fabsl(1.0 - r2);

		if (err1 < err2)
		{
			/* the lower probe fits better: keep the lower half */
			cmax = (cmin + cmax)/2;
		}
		else {
			/* 2 is better */
			cmin = (cmin + cmax)/2;
		}

		/* stop when the error no longer improves noticeably */
		if (fabsl(err - STARPU_MIN(err1, err2)) < EPS)
		{
			err = STARPU_MIN(err1, err2);
			break;
		}

		err = STARPU_MIN(err1, err2);
	}

	/* final c: midpoint of the converged interval */
	*c = (cmin + cmax)/2;

	/* with c fixed, b and a come from a linear fit in log-log space */
	*b = compute_b(*c, n, x, y); 
	*a = expl(compute_a(*c, *b, n, x, y));

	free(x);
	free(y);

	return 0;
}
+

+ 28 - 0
src/core/perfmodel/regression.h

@@ -0,0 +1,28 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#ifndef __REGRESSION_H__
#define __REGRESSION_H__

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <core/perfmodel/perfmodel.h>
#include <starpu.h>

/* Fit the model y = a*x^b + c to the (size, mean time) samples stored in
 * the history list "ptr".  On success the three output parameters are
 * updated (the current implementation always returns 0). */
int regression_non_linear_power(struct starpu_history_list_t *ptr, double *a, double *b, double *c);

#endif // __REGRESSION_H__ 

+ 226 - 0
src/core/policies/deque-modeling-policy-data-aware.c

@@ -0,0 +1,226 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/policies/deque-modeling-policy-data-aware.h>
+#include <core/perfmodel/perfmodel.h>
+
+static unsigned nworkers;
+static struct jobq_s *queue_array[NMAXWORKERS];
+
+static job_t dmda_pop_task(struct jobq_s *q)
+{
+	struct job_s *j;
+
+	j = fifo_pop_task(q);
+	if (j) {
+		struct fifo_jobq_s *fifo = q->queue;
+		double model = j->predicted;
+	
+		fifo->exp_len -= model;
+		fifo->exp_start = timing_now()/1000000 + model;
+		fifo->exp_end = fifo->exp_start + fifo->exp_len;
+	}	
+
+	return j;
+}
+
+static void update_data_requests(struct jobq_s *q, struct job_s *j)
+{
+	uint32_t memory_node = q->memory_node;
+	unsigned nbuffers = j->task->cl->nbuffers;
+	unsigned buffer;
+
+	for (buffer = 0; buffer < nbuffers; buffer++)
+	{
+		data_state *state = j->task->buffers[buffer].state;
+
+		set_data_requested_flag_if_needed(state, memory_node);
+	}
+}
+
+static int _dmda_push_task(struct jobq_s *q __attribute__ ((unused)) , job_t j, unsigned prio)
+{
+	/* find the queue */
+	struct fifo_jobq_s *fifo;
+	unsigned worker;
+	int best = -1;
+	
+	/* this flag is set if the corresponding worker is selected because
+	   there is no performance prediction available yet */
+	int forced_best = -1;
+
+	double local_task_length[nworkers];
+	double local_data_penalty[nworkers];
+	double exp_end[nworkers];
+
+	double fitness[nworkers];
+
+	double best_exp_end = 10e240;
+	double model_best = 0.0;
+	double penality_best = 0.0;
+
+	for (worker = 0; worker < nworkers; worker++)
+	{
+		fifo = queue_array[worker]->queue;
+
+		/* XXX */
+		fifo->exp_start = STARPU_MAX(fifo->exp_start, timing_now()/1000000);
+		fifo->exp_end = STARPU_MAX(fifo->exp_start, timing_now()/1000000);
+
+		if ((queue_array[worker]->who & j->task->cl->where) == 0)
+		{
+			/* no one on that queue may execute this task */
+			continue;
+		}
+
+		local_task_length[worker] = job_expected_length(queue_array[worker]->who,
+							j, queue_array[worker]->arch);
+
+		//local_data_penalty[worker] = 0;
+		local_data_penalty[worker] = data_expected_penalty(queue_array[worker], j);
+
+		if (local_task_length[worker] == -1.0)
+		{
+			forced_best = worker;
+			break;
+		}
+
+		exp_end[worker] = fifo->exp_start + fifo->exp_len + local_task_length[worker];
+
+		if (exp_end[worker] < best_exp_end)
+		{
+			/* a better solution was found */
+			best_exp_end = exp_end[worker];
+		}
+	}
+
+	double alpha = 1.0;
+	double beta = 1.0;
+
+	double best_fitness = -1;
+	
+	if (forced_best == -1)
+	{
+		for (worker = 0; worker < nworkers; worker++)
+		{
+			fifo = queue_array[worker]->queue;
+	
+			if ((queue_array[worker]->who & j->task->cl->where) == 0)
+			{
+				/* no one on that queue may execute this task */
+				continue;
+			}
+	
+			fitness[worker] = alpha*(exp_end[worker] - best_exp_end) 
+					+ beta*(local_data_penalty[worker]);
+
+			if (best == -1 || fitness[worker] < best_fitness)
+			{
+				/* we found a better solution */
+				best_fitness = fitness[worker];
+				best = worker;
+
+	//			fprintf(stderr, "best fitness (worker %d) %le = alpha*(%le) + beta(%le) \n", worker, best_fitness, exp_end[worker] - best_exp_end, local_data_penalty[worker]);
+			}
+		}
+	}
+
+	STARPU_ASSERT(forced_best != -1 || best != -1);
+	
+	if (forced_best != -1)
+	{
+		/* there is no prediction available for that task
+		 * with that arch we want to speed-up calibration time
+		 * so we force this measurement */
+		best = worker;
+		model_best = 0.0;
+		penality_best = 0.0;
+	}
+	else 
+	{
+		model_best = local_task_length[best];
+		penality_best = local_data_penalty[best];
+	}
+
+	/* we should now have the best worker in variable "best" */
+	fifo = queue_array[best]->queue;
+
+	fifo->exp_end += model_best;
+	fifo->exp_len += model_best;
+
+	j->predicted = model_best;
+	j->penality = penality_best;
+
+	update_data_requests(queue_array[best], j);
+
+	if (prio) {
+		return fifo_push_prio_task(queue_array[best], j);
+	} else {
+		return fifo_push_task(queue_array[best], j);
+	}
+}
+
+static int dmda_push_prio_task(struct jobq_s *q, job_t j)
+{
+	return _dmda_push_task(q, j, 1);
+}
+
+static int dmda_push_task(struct jobq_s *q, job_t j)
+{
+	if (j->task->priority == MAX_PRIO)
+		return _dmda_push_task(q, j, 1);
+
+	return _dmda_push_task(q, j, 0);
+}
+
+static struct jobq_s *init_dmda_fifo(void)
+{
+	struct jobq_s *q;
+
+	q = create_fifo();
+
+	q->push_task = dmda_push_task; 
+	q->push_prio_task = dmda_push_prio_task; 
+	q->pop_task = dmda_pop_task;
+	q->who = 0;
+
+	queue_array[nworkers++] = q;
+
+	return q;
+}
+
+void initialize_dmda_policy(struct machine_config_s *config, 
+ __attribute__ ((unused)) struct sched_policy_s *_policy) 
+{
+	nworkers = 0;
+
+	setup_queues(init_fifo_queues_mechanisms, init_dmda_fifo, config);
+}
+
+struct jobq_s *get_local_queue_dmda(struct sched_policy_s *policy __attribute__ ((unused)))
+{
+	struct jobq_s *queue;
+	queue = pthread_getspecific(policy->local_queue_key);
+
+	if (!queue)
+	{
+		/* take one randomly as this *must* be for a push anyway XXX */
+		queue = queue_array[0];
+	}
+
+	return queue;
+}
+

+ 29 - 0
src/core/policies/deque-modeling-policy-data-aware.h

@@ -0,0 +1,29 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#ifndef __DEQUE_MODELING_POLICY_DATA_AWARE_H__
#define __DEQUE_MODELING_POLICY_DATA_AWARE_H__

#include <core/workers.h>
#include <core/mechanisms/queues.h>
#include <core/mechanisms/fifo_queues.h>

/* Set up the "dmda" policy: performance-model driven scheduling that also
 * accounts for expected data-transfer penalties. */
void initialize_dmda_policy(struct machine_config_s *config,
 __attribute__ ((unused)) struct sched_policy_s *_policy);

/* Return the queue attached to the calling worker (queue 0 for threads
 * without one). */
struct jobq_s *get_local_queue_dmda(struct sched_policy_s *policy __attribute__ ((unused)));

#endif // __DEQUE_MODELING_POLICY_DATA_AWARE_H__

+ 161 - 0
src/core/policies/deque-modeling-policy.c

@@ -0,0 +1,161 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/policies/deque-modeling-policy.h>
+#include <core/perfmodel/perfmodel.h>
+
+static unsigned nworkers;
+static struct jobq_s *queue_array[NMAXWORKERS];
+
+static job_t dm_pop_task(struct jobq_s *q)
+{
+	struct job_s *j;
+
+	j = fifo_pop_task(q);
+	if (j) {
+		struct fifo_jobq_s *fifo = q->queue;
+		double model = j->predicted;
+	
+		fifo->exp_len -= model;
+		fifo->exp_start = timing_now()/1000000 + model;
+		fifo->exp_end = fifo->exp_start + fifo->exp_len;
+	}	
+
+	return j;
+}
+
+static int _dm_push_task(struct jobq_s *q __attribute__ ((unused)), job_t j, unsigned prio)
+{
+	/* find the queue */
+	struct fifo_jobq_s *fifo;
+	unsigned worker;
+	int best = -1;
+
+	double best_exp_end = 0.0;
+	double model_best = 0.0;
+
+	for (worker = 0; worker < nworkers; worker++)
+	{
+		double exp_end;
+		
+		fifo = queue_array[worker]->queue;
+
+		/* XXX */
+		fifo->exp_start = STARPU_MAX(fifo->exp_start, timing_now()/1000000);
+		fifo->exp_end = STARPU_MAX(fifo->exp_start, timing_now()/1000000);
+
+		if ((queue_array[worker]->who & j->task->cl->where) == 0)
+		{
+			/* no one on that queue may execute this task */
+			continue;
+		}
+
+		double local_length = job_expected_length(queue_array[worker]->who, j, queue_array[worker]->arch);
+
+		if (local_length == -1.0) 
+		{
+			/* there is no prediction available for that task
+			 * with that arch we want to speed-up calibration time 
+			 * so we force this measurement */
+			/* XXX assert we are benchmarking ! */
+			best = worker;
+			model_best = 0.0;
+			exp_end = fifo->exp_start + fifo->exp_len;
+			break;
+		}
+
+
+		exp_end = fifo->exp_start + fifo->exp_len + local_length;
+
+		if (best == -1 || exp_end < best_exp_end)
+		{
+			/* a better solution was found */
+			best_exp_end = exp_end;
+			best = worker;
+			model_best = local_length;
+		}
+	}
+
+	
+	/* make sure someone coule execute that task ! */
+	STARPU_ASSERT(best != -1);
+
+	/* we should now have the best worker in variable "best" */
+	fifo = queue_array[best]->queue;
+
+	fifo->exp_end += model_best;
+	fifo->exp_len += model_best;
+
+	j->predicted = model_best;
+
+	if (prio) {
+		return fifo_push_prio_task(queue_array[best], j);
+	} else {
+		return fifo_push_task(queue_array[best], j);
+	}
+}
+
+static int dm_push_prio_task(struct jobq_s *q, job_t j)
+{
+	return _dm_push_task(q, j, 1);
+}
+
+static int dm_push_task(struct jobq_s *q, job_t j)
+{
+	if (j->task->priority == MAX_PRIO)
+		return _dm_push_task(q, j, 1);
+
+	return _dm_push_task(q, j, 0);
+}
+
+static struct jobq_s *init_dm_fifo(void)
+{
+	struct jobq_s *q;
+
+	q = create_fifo();
+
+	q->push_task = dm_push_task; 
+	q->push_prio_task = dm_push_prio_task; 
+	q->pop_task = dm_pop_task;
+	q->who = 0;
+
+	queue_array[nworkers++] = q;
+
+	return q;
+}
+
+void initialize_dm_policy(struct machine_config_s *config, 
+ __attribute__ ((unused)) struct sched_policy_s *_policy) 
+{
+	nworkers = 0;
+
+	setup_queues(init_fifo_queues_mechanisms, init_dm_fifo, config);
+}
+
+struct jobq_s *get_local_queue_dm(struct sched_policy_s *policy __attribute__ ((unused)))
+{
+	struct jobq_s *queue;
+	queue = pthread_getspecific(policy->local_queue_key);
+
+	if (!queue)
+	{
+		/* take one randomly as this *must* be for a push anyway XXX */
+		queue = queue_array[0];
+	}
+
+	return queue;
+}
+

+ 29 - 0
src/core/policies/deque-modeling-policy.h

@@ -0,0 +1,29 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#ifndef __DEQUE_MODELING_POLICY_H__
#define __DEQUE_MODELING_POLICY_H__

#include <core/workers.h>
#include <core/mechanisms/queues.h>
#include <core/mechanisms/fifo_queues.h>

/* Set up the "dm" policy: schedule on the worker with the smallest
 * predicted completion time, using per-worker performance models. */
void initialize_dm_policy(struct machine_config_s *config,
 __attribute__ ((unused)) struct sched_policy_s *_policy);

/* Return the queue attached to the calling worker (queue 0 for threads
 * without one). */
struct jobq_s *get_local_queue_dm(struct sched_policy_s *policy __attribute__ ((unused)));

#endif // __DEQUE_MODELING_POLICY_H__

+ 58 - 0
src/core/policies/eager-central-policy.c

@@ -0,0 +1,58 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/policies/eager-central-policy.h>
+
+/*
+ *	This is just the trivial policy where every worker use the same
+ *	JOB QUEUE.
+ */
+
+/* the former is the actual queue, the latter some container */
+static struct jobq_s *jobq;
+
+static void init_central_queue_design(void)
+{
+	/* there is only a single queue in that trivial design */
+	jobq = create_fifo();
+
+	init_fifo_queues_mechanisms();
+
+	jobq->push_task = fifo_push_task;
+	jobq->push_prio_task = fifo_push_prio_task;
+	jobq->pop_task = fifo_pop_task;
+
+	jobq->pop_every_task = fifo_pop_every_task;
+}
+
+static struct jobq_s *func_init_central_queue(void)
+{
+	/* once again, this is trivial */
+	return jobq;
+}
+
+void initialize_eager_center_policy(struct machine_config_s *config, 
+	   __attribute__ ((unused)) struct sched_policy_s *_policy) 
+{
+	setup_queues(init_central_queue_design, func_init_central_queue, config);
+}
+
+struct jobq_s *get_local_queue_eager(struct sched_policy_s *policy 
+					__attribute__ ((unused)))
+{
+	/* this is trivial for that strategy :) */
+	return jobq;
+}

+ 27 - 0
src/core/policies/eager-central-policy.h

@@ -0,0 +1,27 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#ifndef __EAGER_CENTRAL_POLICY_H__
#define __EAGER_CENTRAL_POLICY_H__

#include <core/workers.h>
#include <core/mechanisms/fifo_queues.h>

/* Set up the eager policy: all workers share one central FIFO. */
void initialize_eager_center_policy(struct machine_config_s *config, struct sched_policy_s *policy);
//void set_local_queue_eager(struct jobq_s *jobq);
/* All workers share the same queue, so this always returns it. */
struct jobq_s *get_local_queue_eager(struct sched_policy_s *policy);

#endif // __EAGER_CENTRAL_POLICY_H__

+ 52 - 0
src/core/policies/eager-central-priority-policy.c

@@ -0,0 +1,52 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/policies/eager-central-priority-policy.h>
+
+/* the former is the actual queue, the latter some container */
+static struct jobq_s *jobq;
+
+static void init_priority_queue_design(void)
+{
+	/* only a single queue (even though there are several internaly) */
+	jobq = create_priority_jobq();
+
+	init_priority_queues_mechanisms();
+
+	/* we always use priorities in that policy */
+	jobq->push_task = priority_push_task;
+	jobq->push_prio_task = priority_push_task;
+	jobq->pop_task = priority_pop_task;
+}
+
+static struct jobq_s *func_init_priority_queue(void)
+{
+	return jobq;
+}
+
+void initialize_eager_center_priority_policy(struct machine_config_s *config, 
+			__attribute__ ((unused))	struct sched_policy_s *_policy) 
+{
+	setup_queues(init_priority_queue_design, func_init_priority_queue, config);
+}
+
+struct jobq_s *get_local_queue_eager_priority(struct sched_policy_s *policy __attribute__ ((unused)))
+{
+	/* this is trivial for that strategy */
+	return jobq;
+}
+
+

+ 28 - 0
src/core/policies/eager-central-priority-policy.h

@@ -0,0 +1,28 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#ifndef __EAGER_CENTRAL_PRIORITY_POLICY_H__
#define __EAGER_CENTRAL_PRIORITY_POLICY_H__

#include <core/workers.h>
#include <core/mechanisms/queues.h>
#include <core/mechanisms/priority_queues.h>

/* Set up the eager policy with priorities: one shared priority queue. */
void initialize_eager_center_priority_policy(struct machine_config_s *config, struct sched_policy_s *policy);
/* NOTE(review): no definition of this function is visible in the matching
 * .c file — confirm it exists or drop this declaration. */
void set_local_queue_eager_priority(struct jobq_s *jobq);
/* All workers share the same priority queue, so this always returns it. */
struct jobq_s *get_local_queue_eager_priority(struct sched_policy_s *policy);

#endif // __EAGER_CENTRAL_PRIORITY_POLICY_H__

+ 57 - 0
src/core/policies/no-prio-policy.c

@@ -0,0 +1,57 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/policies/no-prio-policy.h>
+
+/*
+ *	This is just the trivial policy where every worker use the same
+ *	JOB QUEUE.
+ */
+
+/* the former is the actual queue, the latter some container */
+static struct jobq_s *jobq;
+
+static void init_no_prio_design(void)
+{
+	/* there is only a single queue in that trivial design */
+	jobq = create_fifo();
+
+	init_fifo_queues_mechanisms();
+
+	jobq->push_task = fifo_push_task;
+	/* no priority in that policy, let's be stupid here */
+	jobq->push_prio_task = fifo_push_task;
+	jobq->pop_task = fifo_pop_task;
+}
+
+static struct jobq_s *func_init_central_queue(void)
+{
+	/* once again, this is trivial */
+	return jobq;
+}
+
+void initialize_no_prio_policy(struct machine_config_s *config, 
+	   __attribute__ ((unused)) struct sched_policy_s *_policy) 
+{
+	setup_queues(init_no_prio_design, func_init_central_queue, config);
+}
+
+struct jobq_s *get_local_queue_no_prio(struct sched_policy_s *policy 
+					__attribute__ ((unused)))
+{
+	/* this is trivial for that strategy :) */
+	return jobq;
+}

+ 27 - 0
src/core/policies/no-prio-policy.h

@@ -0,0 +1,27 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#ifndef __NO_PRIO_POLICY_H__
#define __NO_PRIO_POLICY_H__

#include <core/workers.h>
#include <core/mechanisms/fifo_queues.h>

/* Set up the eager policy variant that ignores task priorities. */
void initialize_no_prio_policy(struct machine_config_s *config, struct sched_policy_s *policy);
//void set_local_queue_eager(struct jobq_s *jobq);
/* All workers share the same queue, so this always returns it. */
struct jobq_s *get_local_queue_no_prio(struct sched_policy_s *policy);

#endif // __NO_PRIO_POLICY_H__

+ 121 - 0
src/core/policies/random-policy.c

@@ -0,0 +1,121 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/policies/random-policy.h>
+
+/* XXX 32 is set randomly */
+static unsigned nworkers;
+static struct jobq_s *queue_array[32];
+
+static job_t random_pop_task(struct jobq_s *q)
+{
+	struct job_s *j;
+
+	j = fifo_pop_task(q);
+
+	return j;
+}
+
+static int _random_push_task(struct jobq_s *q __attribute__ ((unused)), job_t task, unsigned prio)
+{
+	/* find the queue */
+	struct fifo_jobq_s *fifo;
+	unsigned worker;
+
+	unsigned selected = 0;
+
+	double alpha_sum = 0.0;
+
+	for (worker = 0; worker < nworkers; worker++)
+	{
+		alpha_sum += queue_array[worker]->alpha;
+	}
+
+	double rand = drand48()*alpha_sum;
+//	fprintf(stderr, "my rand is %e\n", rand);
+
+	double alpha = 0.0;
+	for (worker = 0; worker < nworkers; worker++)
+	{
+		if (alpha + queue_array[worker]->alpha > rand) {
+			/* we found the worker */
+			selected = worker;
+			break;
+		}
+
+		alpha += queue_array[worker]->alpha;
+	}
+
+	/* we should now have the best worker in variable "best" */
+	fifo = queue_array[selected]->queue;
+
+	if (prio) {
+		return fifo_push_prio_task(queue_array[selected], task);
+	} else {
+		return fifo_push_task(queue_array[selected], task);
+	}
+}
+
+static int random_push_prio_task(struct jobq_s *q, job_t task)
+{
+	return _random_push_task(q, task, 1);
+}
+
+static int random_push_task(struct jobq_s *q, job_t task)
+{
+	return _random_push_task(q, task, 0);
+}
+
+static struct jobq_s *init_random_fifo(void)
+{
+	struct jobq_s *q;
+
+	q = create_fifo();
+
+	q->push_task = random_push_task; 
+	q->push_prio_task = random_push_prio_task; 
+	q->pop_task = random_pop_task;
+	q->who = 0;
+
+	queue_array[nworkers++] = q;
+
+	return q;
+}
+
+void initialize_random_policy(struct machine_config_s *config, 
+ __attribute__ ((unused)) struct sched_policy_s *_policy) 
+{
+	nworkers = 0;
+
+	srand48(time(NULL));
+
+	setup_queues(init_fifo_queues_mechanisms, init_random_fifo, config);
+}
+
+struct jobq_s *get_local_queue_random(struct sched_policy_s *policy __attribute__ ((unused)))
+{
+	struct jobq_s *queue;
+	queue = pthread_getspecific(policy->local_queue_key);
+
+	if (!queue)
+	{
+		/* take one randomly as this *must* be for a push anyway XXX */
+		queue = queue_array[0];
+	}
+
+	return queue;
+}
+

+ 29 - 0
src/core/policies/random-policy.h

@@ -0,0 +1,29 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#ifndef __RANDOM_POLICY_H__
#define __RANDOM_POLICY_H__

#include <core/workers.h>
#include <core/mechanisms/queues.h>
#include <core/mechanisms/fifo_queues.h>

/* Set up the random policy: tasks are pushed to a worker drawn at random,
 * weighted by each queue's relative speed. */
void initialize_random_policy(struct machine_config_s *config,
 __attribute__ ((unused)) struct sched_policy_s *_policy);

/* Return the queue attached to the calling worker (queue 0 for threads
 * without one). */
struct jobq_s *get_local_queue_random(struct sched_policy_s *policy __attribute__ ((unused)));

#endif // __RANDOM_POLICY_H__

+ 159 - 0
src/core/policies/sched_policy.c

@@ -0,0 +1,159 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+
+#include <core/mechanisms/queues.h>
+#include <core/policies/sched_policy.h>
+#include <core/policies/no-prio-policy.h>
+#include <core/policies/eager-central-policy.h>
+#include <core/policies/eager-central-priority-policy.h>
+#include <core/policies/work-stealing-policy.h>
+#include <core/policies/deque-modeling-policy.h>
+#include <core/policies/random-policy.h>
+#include <core/policies/deque-modeling-policy-data-aware.h>
+
+
+static struct sched_policy_s policy;
+
+/* Expose the (unique, file-scope) scheduling policy descriptor. */
+struct sched_policy_s *get_sched_policy(void)
+{
+	struct sched_policy_s *current_policy = &policy;
+	return current_policy;
+}
+
+/* Select the scheduling policy according to the SCHED environment
+ * variable and initialize it. The eager centralized policy is used by
+ * default, i.e. when SCHED is unset or matches no known policy name.
+ * The previous seven-way if/else chain is replaced by a lookup table:
+ * adding a policy is now a one-line change. */
+void init_sched_policy(struct machine_config_s *config)
+{
+	static const struct {
+		const char *name;	/* value expected in SCHED */
+		const char *banner;	/* message displayed in VERBOSE mode */
+		void (*init_sched)(struct machine_config_s *, struct sched_policy_s *);
+		struct jobq_s *(*get_local_queue)(struct sched_policy_s *);
+	} predefined[] = {
+		{ "ws", "USE WS SCHEDULER !! \n",
+			initialize_ws_policy, get_local_queue_ws },
+		{ "prio", "USE PRIO EAGER SCHEDULER !! \n",
+			initialize_eager_center_priority_policy, get_local_queue_eager_priority },
+		{ "no-prio", "USE _NO_ PRIO EAGER SCHEDULER !! \n",
+			initialize_no_prio_policy, get_local_queue_no_prio },
+		{ "dm", "USE MODEL SCHEDULER !! \n",
+			initialize_dm_policy, get_local_queue_dm },
+		{ "dmda", "USE DATA AWARE MODEL SCHEDULER !! \n",
+			initialize_dmda_policy, get_local_queue_dmda },
+		{ "random", "USE RANDOM SCHEDULER !! \n",
+			initialize_random_policy, get_local_queue_random },
+	};
+
+	/* eager policy is taken by default */
+	const char *banner __attribute__ ((unused)) = "USE EAGER SCHEDULER !! \n";
+	policy.init_sched = initialize_eager_center_policy;
+	policy.get_local_queue = get_local_queue_eager;
+
+	char *sched_env = getenv("SCHED");
+	if (sched_env) {
+		unsigned i;
+		for (i = 0; i < sizeof(predefined)/sizeof(predefined[0]); i++)
+		{
+			if (strcmp(sched_env, predefined[i].name) == 0)
+			{
+				banner = predefined[i].banner;
+				policy.init_sched = predefined[i].init_sched;
+				policy.get_local_queue = predefined[i].get_local_queue;
+				break;
+			}
+		}
+	}
+
+#ifdef VERBOSE
+	/* never pass a non-literal string as a format string */
+	fprintf(stderr, "%s", banner);
+#endif
+
+	pthread_cond_init(&policy.sched_activity_cond, NULL);
+	pthread_mutex_init(&policy.sched_activity_mutex, NULL);
+	pthread_key_create(&policy.local_queue_key, NULL);
+
+	policy.init_sched(config, &policy);
+}
+
+/* Generic entry point: hand the job over to the queue selected by the
+ * current scheduling policy, using that queue's own push method. */
+int push_task(job_t task)
+{
+	struct jobq_s *local_queue;
+
+	local_queue = policy.get_local_queue(&policy);
+	STARPU_ASSERT(local_queue->push_task);
+
+	return local_queue->push_task(local_queue, task);
+}
+
+/* Pop one job from the given queue through its pop_task method. */
+struct job_s * pop_task_from_queue(struct jobq_s *queue)
+{
+	STARPU_ASSERT(queue->pop_task);
+	return queue->pop_task(queue);
+}
+
+/* Pop one job from the calling thread's local queue. */
+struct job_s * pop_task(void)
+{
+	struct jobq_s *local_queue = policy.get_local_queue(&policy);
+	return pop_task_from_queue(local_queue);
+}
+
+/* Drain the given queue, returning every pending job as a list. */
+struct job_list_s * pop_every_task_from_queue(struct jobq_s *queue)
+{
+	STARPU_ASSERT(queue->pop_every_task);
+	return queue->pop_every_task(queue);
+}
+
+/* Drain the calling thread's local queue. */
+struct job_list_s *pop_every_task(void)
+{
+	struct jobq_s *local_queue = policy.get_local_queue(&policy);
+	return pop_every_task_from_queue(local_queue);
+}
+
+/* Block the calling worker on its own queue's condition variable until
+ * some scheduling activity is signalled on it.
+ * NOTE(review): a single pthread_cond_wait with no predicate loop —
+ * spurious wakeups will make this return early; callers presumably just
+ * re-check for work and call again — confirm. */
+void wait_on_sched_event(void)
+{
+	struct jobq_s *q = policy.get_local_queue(&policy);
+
+	pthread_mutex_lock(&q->activity_mutex);
+	pthread_cond_wait(&q->activity_cond, &q->activity_mutex);
+	pthread_mutex_unlock(&q->activity_mutex);
+}

+ 56 - 0
src/core/policies/sched_policy.h

@@ -0,0 +1,56 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __SCHED_POLICY_H__
+#define __SCHED_POLICY_H__
+
+#include <core/mechanisms/queues.h>
+//#include <core/mechanisms/work_stealing_queues.h>
+//#include <core/mechanisms/central_queues.h>
+//#include <core/mechanisms/central_queues_priorities.h>
+
+#include <core/workers.h>
+
+struct machine_config_s;
+
+/* Description of a scheduling policy: how to create its queues, and how
+ * each thread retrieves the queue it must push to / pop from. */
+struct sched_policy_s {
+	/* create all the queues */
+	void (*init_sched)(struct machine_config_s *, struct sched_policy_s *);
+
+	/* anyone can request which queue it is associated to */
+	struct jobq_s *(*get_local_queue)(struct sched_policy_s *);
+
+	/* some worker may block until some activity happens in the machine */
+	pthread_cond_t sched_activity_cond;
+	pthread_mutex_t sched_activity_mutex;
+
+	/* thread-specific key under which each worker thread stores the
+	 * jobq_s it is attached to (read by get_local_queue) */
+	pthread_key_t local_queue_key;
+};
+
+struct sched_policy_s *get_sched_policy(void);
+
+void init_sched_policy(struct machine_config_s *config);
+//void set_local_queue(struct jobq_s *jobq);
+
+int push_task(job_t task);
+struct job_s *pop_task(void);
+struct job_s *pop_task_from_queue(struct jobq_s *queue);
+struct job_list_s *pop_every_task(void);
+struct job_list_s * pop_every_task_from_queue(struct jobq_s *queue);
+
+void wait_on_sched_event(void);
+
+#endif // __SCHED_POLICY_H__

+ 201 - 0
src/core/policies/work-stealing-policy.c

@@ -0,0 +1,201 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/policies/work-stealing-policy.h>
+
+/* save the general machine configuration */
+//static struct machine_config_s *machineconfig;
+
+/* XXX 32 is set randomly */
+static unsigned nworkers;
+static unsigned rr_worker;
+static struct jobq_s *queue_array[32];
+
+/* keep track of the work performed from the beginning of the algorithm to make
+ * better decisions about which queue to select when stealing or deferring work
+ */
+static unsigned performed_total = 0;
+//static unsigned performed_local[16];
+
+#ifdef USE_OVERLOAD
+/* Estimate how overloaded queue `id` is: the difference between its
+ * current backlog ratio (jobs pending / jobs processed by this queue)
+ * and its share of the total processed work. Positive means the queue
+ * has proportionally more work than it has been getting done.
+ * BUGFIX: both divisions were unsigned integer divisions, so the
+ * "ratios" were truncated (usually to 0) before being stored in a
+ * float; cast the numerator to float to get a real ratio. */
+static float overload_metric(unsigned id)
+{
+	unsigned nprocessed = get_deque_nprocessed(queue_array[id]);
+	unsigned njobs = get_deque_njobs(queue_array[id]);
+
+	float execution_ratio = 0.0f;
+	if (performed_total > 0) {
+		execution_ratio = (float)nprocessed/performed_total;
+	}
+
+	float current_ratio = 0.0f;
+	if (nprocessed > 0) {
+		current_ratio = (float)njobs/nprocessed;
+	}
+	
+	return (current_ratio - execution_ratio);
+}
+
+/* who to steal work to ? */
+/* Scan the workers round-robin style starting at rr_worker and return
+ * the first queue whose overload metric is positive. If none qualifies,
+ * fall back to plain round-robin so the caller always gets a queue.
+ * NOTE(review): the do/while with post-decrement probes nworkers+1
+ * queues — presumably harmless, but confirm it is intentional. */
+static struct jobq_s *select_victimq(void)
+{
+	struct jobq_s *q;
+
+	unsigned attempts = nworkers;
+
+	unsigned worker = rr_worker;
+	do {
+		if (overload_metric(worker) > 0.0f)
+		{
+			q = queue_array[worker];
+			return q;
+		}
+		else {
+			worker = (worker + 1)%nworkers;
+		}
+	} while(attempts-- > 0);
+
+	/* take one anyway ... */
+	q = queue_array[rr_worker];
+	rr_worker = (rr_worker + 1 )%nworkers;
+
+	return q;
+}
+
+/* Symmetric to select_victimq: pick a queue for depositing new work.
+ * Prefer the first *underloaded* queue (negative metric) found while
+ * scanning round-robin from rr_worker; fall back to round-robin. */
+static struct jobq_s *select_workerq(void)
+{
+	struct jobq_s *q;
+
+	unsigned attempts = nworkers;
+
+	unsigned worker = rr_worker;
+	do {
+		if (overload_metric(worker) < 0.0f)
+		{
+			q = queue_array[worker];
+			return q;
+		}
+		else {
+			worker = (worker + 1)%nworkers;
+		}
+	} while(attempts-- > 0);
+
+	/* take one anyway ... */
+	q = queue_array[rr_worker];
+	rr_worker = (rr_worker + 1 )%nworkers;
+
+	return q;
+}
+
+#else
+
+/* who to steal work to ? */
+/* No overload accounting compiled in: just rotate over the deques. */
+static struct jobq_s *select_victimq(void)
+{
+	struct jobq_s *chosen = queue_array[rr_worker];
+
+	rr_worker = (rr_worker + 1) % nworkers;
+
+	return chosen;
+}
+
+
+/* when anonymous threads submit tasks, 
+ * we need to select a queue where to dispose them */
+/* No overload accounting compiled in: just rotate over the deques. */
+static struct jobq_s *select_workerq(void)
+{
+	struct jobq_s *chosen = queue_array[rr_worker];
+
+	rr_worker = (rr_worker + 1) % nworkers;
+
+	return chosen;
+}
+
+#endif
+
+/* Pop a job for the worker owning queue q: first try the local deque;
+ * if it is empty, steal from a victim queue. Returns NULL when no job
+ * could be obtained at all.
+ * NOTE(review): performed_total is incremented without any lock or
+ * atomic while all workers run this concurrently, so counts may be
+ * lost — confirm approximate statistics are acceptable here. */
+static job_t ws_pop_task(struct jobq_s *q)
+{
+	job_t j;
+
+	j = deque_non_blocking_pop_task(q);
+	if (j) {
+		/* there was a local task */
+		performed_total++;
+		return j;
+	}
+	
+	/* we need to steal someone's job */
+	struct jobq_s *victimq;
+	victimq = select_victimq();
+
+	j = deque_non_blocking_pop_task_if_job_exists(victimq);
+
+	if (j)
+	{
+		TRACE_WORK_STEALING(q, j);
+		performed_total++;
+	}
+
+	return j;
+}
+
+/* Create the deque of one worker and register it into the global
+ * queue_array used by the work-stealing heuristics. */
+static struct jobq_s *init_ws_deque(void)
+{
+	struct jobq_s *q;
+
+	q = create_deque();
+
+	q->push_task = deque_push_task; 
+	q->push_prio_task = deque_push_prio_task; 
+	q->pop_task = ws_pop_task;
+	q->who = 0;
+
+	/* queue_array has a fixed size ("XXX 32 is set randomly" above):
+	 * guard against silently overflowing it */
+	STARPU_ASSERT(nworkers < sizeof(queue_array)/sizeof(queue_array[0]));
+	queue_array[nworkers++] = q;
+
+	return q;
+}
+
+/* Entry point of the work-stealing policy: create one deque per worker
+ * and reset the round-robin cursor used by the selection helpers. */
+void initialize_ws_policy(struct machine_config_s *config, 
+			__attribute__ ((unused))	struct sched_policy_s *_policy) 
+{
+	nworkers = 0;
+	rr_worker = 0;
+
+	//machineconfig = config;
+
+	setup_queues(init_deque_queues_mechanisms, init_ws_deque, config);
+}
+
+/* Return the deque attached to the calling thread; anonymous threads
+ * (with no thread-specific queue) get one chosen by select_workerq().
+ * Note: the parameter was tagged __attribute__((unused)) although it is
+ * dereferenced below — the bogus annotation is removed (the header
+ * prototype already declares the parameter without it). */
+struct jobq_s *get_local_queue_ws(struct sched_policy_s *policy)
+{
+	struct jobq_s *queue;
+	queue = pthread_getspecific(policy->local_queue_key);
+
+	if (!queue) {
+		queue = select_workerq();
+	}
+
+	STARPU_ASSERT(queue);
+
+	return queue;
+}

+ 26 - 0
src/core/policies/work-stealing-policy.h

@@ -0,0 +1,26 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __WORK_STEALING_POLICY_H__
+#define __WORK_STEALING_POLICY_H__
+
+#include <core/workers.h>
+#include <core/mechanisms/deque_queues.h>
+
+void initialize_ws_policy(struct machine_config_s *config, struct sched_policy_s *policy);
+struct jobq_s *get_local_queue_ws(struct sched_policy_s *policy);
+
+#endif // __WORK_STEALING_POLICY_H__

+ 392 - 0
src/core/workers.c

@@ -0,0 +1,392 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/workers.h>
+
+/* XXX quick and dirty implementation for now ... */
+pthread_key_t local_workers_key;
+
+static struct machine_config_s config;
+
+/* in case a task is submitted, we may check whether there exists a worker
+   that may execute the task or not */
+static uint32_t worker_mask = 0;
+
+/* Non-zero iff at least one detected worker matches task_mask (a
+ * bitmask of the CORE/CUDA/... capability flags).
+ * NOTE(review): `inline` without `static` at file scope relies on the
+ * GNU89 inline semantics — confirm before building with -std=c99. */
+inline uint32_t worker_exists(uint32_t task_mask)
+{
+	return (task_mask & worker_mask);
+} 
+
+/* Non-zero iff a worker able to execute CUDA (or CUBLAS) tasks exists. */
+inline uint32_t may_submit_cuda_task(void)
+{
+	return ((CUDA|CUBLAS) & worker_mask);
+}
+
+/* Non-zero iff a CPU core worker exists. */
+inline uint32_t may_submit_core_task(void)
+{
+	return (CORE & worker_mask);
+}
+
+#ifdef USE_CPUS
+static unsigned ncores;
+#endif
+#ifdef USE_CUDA
+static unsigned ncudagpus;
+#endif
+#ifdef USE_GORDON
+static unsigned ngordon_spus;
+#endif
+
+/*
+ * Runtime initialization methods
+ */
+
+#ifdef USE_CUDA
+extern unsigned get_cuda_device_count(void);
+#endif
+
+/* Detect the number of workers of each kind — from the NCUDA / NGORDON
+ * / NCPUS environment variables when set, by probing the hardware
+ * otherwise — and fill config->workers accordingly. Accelerators are
+ * registered first; when at least one is present, one CPU core is kept
+ * aside to drive it. Aborts when no worker at all is available. */
+static void init_machine_config(struct machine_config_s *config)
+{
+	int envval __attribute__((unused));
+	unsigned use_accelerator = 0;
+
+	config->nworkers = 0;
+
+#ifdef USE_CUDA
+	/* we need to initialize CUDA early to count the number of devices */
+	init_cuda();
+
+	/* a negative return means the variable is unset: probe instead */
+	envval = starpu_get_env_number("NCUDA");
+	if (envval < 0) {
+		ncudagpus = STARPU_MIN(get_cuda_device_count(), MAXCUDADEVS);
+	} else {
+		/* use the specified value */
+		ncudagpus = (unsigned)envval;
+		STARPU_ASSERT(ncudagpus <= MAXCUDADEVS);
+	}
+	STARPU_ASSERT(ncudagpus + config->nworkers <= NMAXWORKERS);
+
+	if (ncudagpus > 0)
+		use_accelerator = 1;
+
+	unsigned cudagpu;
+	for (cudagpu = 0; cudagpu < ncudagpus; cudagpu++)
+	{
+		config->workers[config->nworkers + cudagpu].arch = CUDA_WORKER;
+		config->workers[config->nworkers + cudagpu].perf_arch = STARPU_CUDA_DEFAULT;
+		config->workers[config->nworkers + cudagpu].id = cudagpu;
+		worker_mask |= (CUDA|CUBLAS);
+	}
+
+	config->nworkers += ncudagpus;
+#endif
+	
+#ifdef USE_GORDON
+	envval = starpu_get_env_number("NGORDON");
+	if (envval < 0) {
+		ngordon_spus = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1);
+	} else {
+		/* use the specified value */
+		ngordon_spus = (unsigned)envval;
+		STARPU_ASSERT(ngordon_spus <= NMAXGORDONSPUS);
+	}
+	STARPU_ASSERT(ngordon_spus + config->nworkers <= NMAXWORKERS);
+
+	if (ngordon_spus > 0)
+		use_accelerator = 1;
+
+	unsigned spu;
+	for (spu = 0; spu < ngordon_spus; spu++)
+	{
+		config->workers[config->nworkers + spu].arch = GORDON_WORKER;
+		config->workers[config->nworkers + spu].perf_arch = STARPU_GORDON_DEFAULT;
+		config->workers[config->nworkers + spu].id = spu;
+		config->workers[config->nworkers + spu].worker_is_running = 0;
+		worker_mask |= GORDON;
+	}
+
+	config->nworkers += ngordon_spus;
+#endif
+
+/* we put the CPU section after the accelerator : in case there was an
+ * accelerator found, we devote one core */
+#ifdef USE_CPUS
+	envval = starpu_get_env_number("NCPUS");
+	if (envval < 0) {
+		long avail_cores = sysconf(_SC_NPROCESSORS_ONLN) 
+						- (use_accelerator?1:0);
+		ncores = STARPU_MIN(avail_cores, NMAXCORES);
+	} else {
+		/* use the specified value */
+		ncores = (unsigned)envval;
+		STARPU_ASSERT(ncores <= NMAXCORES);
+	}
+	STARPU_ASSERT(ncores + config->nworkers <= NMAXWORKERS);
+
+	unsigned core;
+	for (core = 0; core < ncores; core++)
+	{
+		config->workers[config->nworkers + core].arch = CORE_WORKER;
+		config->workers[config->nworkers + core].perf_arch = STARPU_CORE_DEFAULT;
+		config->workers[config->nworkers + core].id = core;
+		worker_mask |= CORE;
+	}
+
+	config->nworkers += ncores;
+#endif
+
+
+	if (config->nworkers == 0)
+	{
+		fprintf(stderr, "No worker found, aborting ...\n");
+		exit(-1);
+	}
+}
+
+/* Decide, for each worker, which CPU core its driver thread is bound to
+ * and which memory node holds its data. CPU cores are assigned
+ * round-robin over the machine's cores; all accelerator drivers share a
+ * single dedicated core (accelerator_bindid, chosen on first use). */
+static void init_workers_binding(struct machine_config_s *config)
+{
+	/* launch one thread per CPU */
+	unsigned ram_memory_node;
+
+	int current_bindid = 0;
+
+	/* a single core is dedicated for the accelerators */
+	int accelerator_bindid = -1;
+
+	/* note that even if the CPU core are not used, we always have a RAM node */
+	/* TODO : support NUMA  ;) */
+	ram_memory_node = register_memory_node(RAM);
+
+	unsigned worker;
+	for (worker = 0; worker < config->nworkers; worker++)
+	{
+		unsigned memory_node = -1;
+		unsigned is_an_accelerator = 0;
+		struct worker_s *workerarg = &config->workers[worker];
+		
+		/* select the memory node that contains worker's memory */
+		switch (workerarg->arch) {
+			case CORE_WORKER:
+			/* "dedicate" a cpu core to that worker */
+				is_an_accelerator = 0;
+				memory_node = ram_memory_node;
+				break;
+#ifdef USE_GORDON
+			case GORDON_WORKER:
+				/* SPUs work on main memory through DMA */
+				is_an_accelerator = 1;
+				memory_node = ram_memory_node;
+				break;
+#endif
+#ifdef USE_CUDA
+			case CUDA_WORKER:
+				/* each GPU gets its own memory node */
+				is_an_accelerator = 1;
+				memory_node = register_memory_node(CUDA_RAM);
+				break;
+#endif
+			default:
+				STARPU_ASSERT(0);
+		}
+
+		if (is_an_accelerator) {
+			/* lazily pick the shared accelerator core once */
+			if (accelerator_bindid == -1)
+				accelerator_bindid = (current_bindid++) % (sysconf(_SC_NPROCESSORS_ONLN));
+			workerarg->bindid = accelerator_bindid;
+		}
+		else {
+			workerarg->bindid = (current_bindid++) % (sysconf(_SC_NPROCESSORS_ONLN));
+		}
+
+		workerarg->memory_node = memory_node;
+	}
+}
+
+#ifdef USE_GORDON
+unsigned gordon_inited = 0;	
+struct worker_set_s gordon_worker_set;
+#endif
+
+/* Launch the driver thread of every worker described in config. CPU and
+ * CUDA workers each get their own thread; all Gordon SPUs are driven by
+ * one shared thread (gordon_worker_set), launched only once. Each
+ * launch blocks on ready_sem so that workers are fully initialized by
+ * the time this function returns. */
+static void init_workers(struct machine_config_s *config)
+{
+	config->running = 1;
+
+	pthread_key_create(&local_workers_key, NULL);
+
+	unsigned worker;
+	for (worker = 0; worker < config->nworkers; worker++)
+	{
+		struct worker_s *workerarg = &config->workers[worker];
+
+		sem_init(&workerarg->ready_sem, 0, 0);
+
+		/* if some codelet's termination cannot be handled directly :
+		 * for instance in the Gordon driver, Gordon tasks' callbacks
+		 * may be executed by another thread than that of the Gordon
+		 * driver so that we cannot call the push_codelet_output method
+		 * directly */
+		workerarg->terminated_jobs = job_list_new();
+	
+		switch (workerarg->arch) {
+#ifdef USE_CPUS
+			case CORE_WORKER:
+				workerarg->set = NULL;
+				pthread_create(&workerarg->worker_thread, 
+						NULL, core_worker, workerarg);
+				sem_wait(&workerarg->ready_sem);
+				break;
+#endif
+#ifdef USE_CUDA
+			case CUDA_WORKER:
+				workerarg->set = NULL;
+				pthread_create(&workerarg->worker_thread, 
+						NULL, cuda_worker, workerarg);
+				sem_wait(&workerarg->ready_sem);
+				break;
+#endif
+#ifdef USE_GORDON
+			case GORDON_WORKER:
+				/* we will only launch gordon once, but it will handle 
+				 * the different SPU workers */
+				if (!gordon_inited)
+				{
+					gordon_worker_set.nworkers = ngordon_spus; 
+					gordon_worker_set.workers = &config->workers[worker];
+
+					pthread_create(&gordon_worker_set.worker_thread, NULL, 
+							gordon_worker, &gordon_worker_set);
+					sem_wait(&gordon_worker_set.ready_sem);
+
+					gordon_inited = 1;
+				}
+				
+				workerarg->set = &gordon_worker_set;
+				gordon_worker_set.joined = 0;
+				workerarg->worker_is_running = 1;
+
+				break;
+#endif
+			default:
+				STARPU_ASSERT(0);
+		}
+	}
+}
+
+/* Initialize the StarPU runtime: detect the available workers, set up
+ * the memory nodes and the scheduling policy, then launch one driver
+ * thread per worker (or worker set). Returns once all workers are up. */
+void starpu_init(void)
+{
+	/* fixed seed so that runs are reproducible */
+	srand(2008);
+
+#ifdef USE_FXT
+	start_fxt_profiling();
+#endif
+
+	timing_init();
+
+	init_machine_config(&config);
+
+	/* for the data wizard */
+	init_memory_nodes();
+
+	init_workers_binding(&config);
+
+	/* initialize the scheduler */
+
+	/* initialize the queue containing the jobs */
+	init_sched_policy(&config);
+
+	init_workers(&config);
+}
+
+/*
+ * Handle runtime termination 
+ */
+
+/* Join every worker thread (or worker-set thread) so StarPU can shut
+ * down cleanly. May be called from a worker's own callback, so we must
+ * never join the calling thread itself.
+ * Fixes: workerid is unsigned, so print it with %u; pthread_t values
+ * must be compared with pthread_equal(), not != (pthread_t is an opaque
+ * type in POSIX); `status` is tagged unused so that non-VERBOSE builds
+ * do not warn about a set-but-unused variable. */
+void terminate_workers(struct machine_config_s *config)
+{
+	int status __attribute__((unused));
+	unsigned workerid;
+
+	for (workerid = 0; workerid < config->nworkers; workerid++)
+	{
+		wake_all_blocked_workers();
+		
+#ifdef VERBOSE
+		fprintf(stderr, "wait for worker %u\n", workerid);
+#endif
+
+		struct worker_set_s *set = config->workers[workerid].set;
+
+		/* in case StarPU termination code is called from a callback,
+ 		 * we have to check if pthread_self() is the worker itself */
+		if (set){ 
+			if (!set->joined) {
+				if (!pthread_equal(pthread_self(), set->worker_thread))
+				{
+					status = pthread_join(set->worker_thread, NULL);
+#ifdef VERBOSE
+					if (status)
+						fprintf(stderr, "pthread_join -> %d\n", status);
+#endif
+				}
+
+				/* only the first caller actually joins the set */
+				set->joined = 1;
+			}
+		}
+		else {
+			struct worker_s *worker = &config->workers[workerid];
+			if (!pthread_equal(pthread_self(), worker->worker_thread))
+			{
+				status = pthread_join(worker->worker_thread, NULL);
+#ifdef VERBOSE
+				if (status)
+					fprintf(stderr, "pthread_join -> %d\n", status);
+#endif
+			}
+		}
+	}
+}
+
+/* Non-zero while the runtime is alive; cleared by kill_all_workers(). */
+unsigned machine_is_running(void)
+{
+	return config.running;
+}
+
+/* Request termination: clear the running flag (polled by the workers
+ * through machine_is_running) and wake any blocked worker so that it
+ * can observe the flag and exit. */
+void kill_all_workers(struct machine_config_s *config)
+{
+	/* set the flag which will tell workers to stop */
+	config->running = 0;
+
+	/* in case some workers are waiting on some event 
+	   wake them up ... */
+	wake_all_blocked_workers();
+}
+
+/* Shut StarPU down: display statistics, ask every worker to stop, dump
+ * calibrated performance models when requested, and join the workers. */
+void starpu_shutdown(void)
+{
+	display_msi_stats();
+	display_alloc_cache_stats();
+
+	/* tell all workers to shutdown */
+	kill_all_workers(&config);
+
+#ifdef DATA_STATS
+	display_comm_ammounts();
+#endif
+
+	/* CALIBRATE set in the environment means performance models were
+	 * being calibrated: save them to disk before exiting */
+	if (starpu_get_env_number("CALIBRATE") != -1)
+		dump_registered_models();
+
+	/* wait for their termination */
+	terminate_workers(&config);
+}

+ 110 - 0
src/core/workers.h

@@ -0,0 +1,110 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __WORKERS_H__
+#define __WORKERS_H__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <string.h>
+#include <common/config.h>
+#include <pthread.h>
+#include <common/timing.h>
+#include <common/fxt.h>
+#include <core/jobs.h>
+#include <core/perfmodel/perfmodel.h>
+#include <core/policies/sched_policy.h>
+
+#include <starpu.h>
+
+#ifdef USE_CUDA
+#include <drivers/cuda/driver_cuda.h>
+#endif
+
+#ifdef USE_GORDON
+#include <drivers/gordon/driver_gordon.h>
+#endif
+
+#include <drivers/core/driver_core.h>
+
+#include <datawizard/datawizard.h>
+
+#define CORE_ALPHA	1.0f
+#define CUDA_ALPHA	13.33f
+#define GORDON_ALPHA	6.0f /* XXX this is a random value ... */
+
+#define NMAXWORKERS	16
+
+#ifdef DATA_STATS
+#define BENCHMARK_COMM	1
+#else
+#define BENCHMARK_COMM	0
+#endif
+
+/* The kinds of processing unit a worker may drive. */
+enum archtype {
+	CORE_WORKER,
+	CUDA_WORKER,
+	GORDON_WORKER
+};
+
+/* Per-worker state: one instance per processing unit, filled in by
+ * init_machine_config()/init_workers_binding() and owned by the driver
+ * thread afterwards. */
+struct worker_s {
+	enum archtype arch; /* what is the type of worker ? */
+	enum starpu_perf_archtype perf_arch; /* in case there are different models of the same arch */
+	pthread_t worker_thread; /* the thread which runs the worker */
+	int id; /* which core/gpu/etc is controlled by the workker ? */
+        sem_t ready_sem; /* indicate when the worker is ready */
+	int bindid; /* which core is the driver bound to ? */
+	unsigned memory_node; /* which memory node is associated that worker to ? */
+	struct jobq_s *jobq; /* in which queue will that worker get/put tasks ? */
+	struct worker_set_s *set; /* in case this worker belongs to a set */
+	struct job_list_s *terminated_jobs; /* list of pending jobs which were executed */
+	unsigned worker_is_running;
+};
+
+/* in case a single CPU worker may control multiple 
+ * accelerators (eg. Gordon for n SPUs) */
+struct worker_set_s {
+	pthread_t worker_thread; /* the thread which runs the worker */
+	unsigned nworkers;
+	unsigned joined; /* only one thread may call pthread_join*/
+	void *retval;
+	struct worker_s *workers;
+        sem_t ready_sem; /* indicate when the worker is ready */
+};
+
+/* Global description of the machine: the detected workers plus the
+ * runtime liveness flag. */
+struct machine_config_s {
+	unsigned nworkers;
+
+	struct worker_s workers[NMAXWORKERS];
+
+	/* this flag is set until the runtime is stopped */
+	unsigned running;
+};
+
+void terminate_workers(struct machine_config_s *config);
+void kill_all_workers(struct machine_config_s *config);
+void display_general_stats(void);
+
+unsigned machine_is_running(void);
+
+inline uint32_t worker_exists(uint32_t task_mask);
+inline uint32_t may_submit_cuda_task(void);
+inline uint32_t may_submit_core_task(void);
+
+
+#endif // __WORKERS_H__

+ 68 - 0
src/datawizard/Makefile

@@ -0,0 +1,68 @@
+#
+# StarPU
+# Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+.PHONY: interfaces common
+
+OBJS := write_back.o coherency.o data_request.o progress.o copy-driver.o hierarchy.o memalloc.o footprint.o datastats.o
+
+DWOBJDEPS += ../common/hash.o
+DWOBJDEPS += ../common/timing.o 
+DWOBJDEPS += ../common/htable32.o 
+DWOBJDEPS += ../common/mutex.o 
+DWOBJDEPS += ../common/rwlock.o 
+DWOBJDEPS += progress.o
+DWOBJDEPS += write_back.o
+DWOBJDEPS += copy-driver.o
+DWOBJDEPS += data_request.o
+DWOBJDEPS += coherency.o 
+DWOBJDEPS += hierarchy.o 
+DWOBJDEPS += memalloc.o
+DWOBJDEPS += footprint.o
+DWOBJDEPS += interfaces/blas_filters.o
+DWOBJDEPS += interfaces/csr_filters.o
+DWOBJDEPS += interfaces/bcsr_filters.o
+DWOBJDEPS += interfaces/vector_filters.o
+DWOBJDEPS += interfaces/blas_interface.o
+DWOBJDEPS += interfaces/csr_interface.o
+DWOBJDEPS += interfaces/bcsr_interface.o
+DWOBJDEPS += interfaces/vector_interface.o
+
+
+all: datawizard.a interfaces $(SPE_TARGET) $(OBJS)
+
+# link with $(CC) rather than a hard-coded compiler
+datawizard.so: common interfaces $(SPE_TARGET) $(OBJS) 
+	$(CC) --shared -o datawizard.so $(DWOBJDEPS)
+
+datawizard.a: common interfaces $(SPE_TARGET) $(OBJS)
+	$(AR) rcs $@ $(DWOBJDEPS)
+
+# recursive invocations must use $(MAKE) so that command-line options
+# and the -j jobserver are propagated to sub-makes
+common:
+	@$(MAKE) -C ../common/
+
+interfaces:
+	@$(MAKE) -C interfaces
+
+ifeq ($(filter ${MAKECMDGOALS},clean distclean),)
+%.d: %.c
+	$(CC) $(CFLAGS) $< -MM -o $*.d
+
+-include $(OBJS:.o=.d)
+endif
+
+clean:
+	@$(MAKE) -C interfaces clean
+	@rm -f *.o *.d *.gcno *.gcda
+	@rm -f *.a *.so

+ 395 - 0
src/datawizard/coherency.c

@@ -0,0 +1,395 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <common/config.h>
+#include <datawizard/coherency.h>
+#include <datawizard/copy-driver.h>
+#include <datawizard/write_back.h>
+#include <core/dependencies/data-concurrency.h>
+
+/* this function will actually copy a valid data into the requesting node */
+static int __attribute__((warn_unused_result)) copy_data_to_node(data_state *state, uint32_t requesting_node, 
+						 unsigned donotread)
+{
+	/* first find a valid copy, either a OWNER or a SHARED */
+	int ret;
+	uint32_t node;
+	uint32_t src_node_mask = 0;
+	for (node = 0; node < MAXNODES; node++)
+	{
+		if (state->per_node[node].state != INVALID) {
+			/* we found a copy ! */
+			src_node_mask |= (1<<node);
+		}
+	}
+
+	/* we should have found at least one copy ! */
+	STARPU_ASSERT(src_node_mask != 0);
+
+	ret = driver_copy_data(state, src_node_mask, requesting_node, donotread);
+
+	return ret;
+}
+
+/* this may be called once the data is fetched with header and RW-lock hold */
+static void update_data_state(data_state *state, uint32_t requesting_node,
+				uint8_t write)
+{
+	/* the data is present now */
+	state->per_node[requesting_node].requested = 0;
+
+	if (write) {
+		/* the requesting node now has the only valid copy */
+		uint32_t node;
+		for (node = 0; node < MAXNODES; node++)
+		{
+			state->per_node[node].state = INVALID;
+		}
+		state->per_node[requesting_node].state = OWNER;
+	}
+	else { /* read only */
+		/* there was at least another copy of the data */
+		uint32_t node;
+		for (node = 0; node < MAXNODES; node++)
+		{
+			if (state->per_node[node].state != INVALID)
+				state->per_node[node].state = SHARED;
+		}
+		state->per_node[requesting_node].state = SHARED;
+	}
+}
+
+
+/*
+ * This function is called when the data is needed on the local node, this
+ * returns a pointer to the local copy 
+ *
+ *			R 	W 	RW
+ *	Owner		OK	OK	OK
+ *	Shared		OK	1	1
+ *	Invalid		2	3	4
+ *
+ * case 1 : shared + (read)write : 
+ * 	no data copy but shared->Invalid/Owner
+ * case 2 : invalid + read : 
+ * 	data copy + invalid->shared + owner->shared (STARPU_ASSERT(there is a valid))
+ * case 3 : invalid + write : 
+ * 	no data copy + invalid->owner + (owner,shared)->invalid
+ * case 4 : invalid + R/W : 
+ * 	data copy + if (W) (invalid->owner + owner->invalid) 
+ * 		    else (invalid,owner->shared)
+ */
+
+/* Fetch a valid copy of the data into requesting_node and update the MSI
+ * states.  The caller must already hold the data for itself (RW-lock or
+ * pending-request list) and have pinned the local replicate; the header
+ * lock is (busy-)taken here.
+ * Returns 0 on success, -ENOMEM if no local buffer could be allocated. */
+int _fetch_data(data_state *state, uint32_t requesting_node,
+			uint8_t read, uint8_t write)
+{
+	/* busy-wait on the header lock but keep serving asynchronous data
+	 * requests, so workers waiting on us cannot deadlock us */
+	while (take_mutex_try(&state->header_lock)) {
+		datawizard_progress(requesting_node);
+	}
+
+	cache_state local_state;
+	local_state = state->per_node[requesting_node].state;
+
+	/* we handle that case first to optimize the OWNER path */
+	if ((local_state == OWNER) || (local_state == SHARED && !write))
+	{
+		/* the local node already got its data */
+		release_mutex(&state->header_lock);
+		msi_cache_hit(requesting_node);
+		return 0;
+	}
+
+	if ((local_state == SHARED) && write) {
+		/* local node already has the data but it must invalidate 
+		 * other copies */
+		uint32_t node;
+		for (node = 0; node < MAXNODES; node++)
+		{
+			if (state->per_node[node].state == SHARED) 
+			{
+				state->per_node[node].state =
+					(node == requesting_node ? OWNER:INVALID);
+			}
+
+		}
+		
+		release_mutex(&state->header_lock);
+		msi_cache_hit(requesting_node);
+		return 0;
+	}
+
+	/* the only remaining situation is that the local copy was invalid */
+	STARPU_ASSERT(state->per_node[requesting_node].state == INVALID);
+
+	msi_cache_miss(requesting_node);
+
+	/* we need the data from either the owner or one of the sharer */
+	/* !read means the data will be completely overwritten: allocate only */
+	int ret;
+	ret = copy_data_to_node(state, requesting_node, !read);
+	if (ret != 0)
+	switch (ret) {
+		case -ENOMEM:
+			goto enomem;
+		
+		default:
+			STARPU_ASSERT(0);
+	}
+
+	update_data_state(state, requesting_node, write);
+
+	release_mutex(&state->header_lock);
+
+	return 0;
+
+enomem:
+	/* there was not enough local memory to fetch the data */
+	release_mutex(&state->header_lock);
+	return -ENOMEM;
+}
+
+/* Acquire the data on the local memory node in the given access mode:
+ * take the data lock, pin the local replicate (refcnt) and fetch a valid
+ * copy.  Returns 0 on success, -1 if the fetch failed (out of memory). */
+int fetch_data(data_state *state, starpu_access_mode mode)
+{
+	int ret;
+	uint32_t requesting_node = get_local_memory_node(); 
+
+	uint8_t read, write;
+	read = (mode != W); /* then R or RW */
+	write = (mode != R); /* then W or RW */
+
+#ifndef NO_DATA_RW_LOCK
+	/* busy-wait on the lock while keeping the driver progressing, to
+	 * avoid deadlocking with workers that are waiting for us */
+	if (write) {
+//		take_rw_lock_write(&state->data_lock);
+		while (take_rw_lock_write_try(&state->data_lock))
+			datawizard_progress(requesting_node);
+	} else {
+//		take_rw_lock_read(&state->data_lock);
+		while (take_rw_lock_read_try(&state->data_lock))
+			datawizard_progress(requesting_node);
+	}
+#endif
+
+	while (take_mutex_try(&state->header_lock))
+		datawizard_progress(requesting_node);
+
+	/* pin the local replicate so it cannot be evicted while in use */
+	state->per_node[requesting_node].refcnt++;
+	release_mutex(&state->header_lock);
+
+	ret = _fetch_data(state, requesting_node, read, write);
+	if (ret != 0)
+		goto enomem;
+
+	return 0;
+enomem:
+	/* we did not get the data so remove the lock anyway */
+	while (take_mutex_try(&state->header_lock))
+		datawizard_progress(requesting_node);
+
+	state->per_node[requesting_node].refcnt--;
+	release_mutex(&state->header_lock);
+
+#ifndef NO_DATA_RW_LOCK
+	release_rw_lock(&state->data_lock);
+#endif
+
+	return -1;
+}
+
+uint32_t get_data_refcnt(data_state *state, uint32_t node)
+{
+	return state->per_node[node].refcnt;
+}
+
+/* in case the data was accessed on a write mode, do not forget to 
+ * make it accessible again once it is possible ! */
+/* Unpin the local replicate, perform write-through if the (combined) mask
+ * requires it, then let other accessors in again. */
+static void release_data(data_state *state, uint32_t default_wb_mask)
+{
+	uint32_t wb_mask;
+
+	/* normally, the requesting node should have the data in an exclusive manner */
+	uint32_t requesting_node = get_local_memory_node();
+	STARPU_ASSERT(state->per_node[requesting_node].state != INVALID);
+
+	/* the per-data mask is merged with the caller-supplied default */
+	wb_mask = default_wb_mask | state->wb_mask;
+
+	/* are we doing write-through or just some normal write-back ? */
+	if (wb_mask & ~(1<<requesting_node)) {
+		write_through_data(state, requesting_node, wb_mask);
+	}
+
+	while (take_mutex_try(&state->header_lock))
+		datawizard_progress(requesting_node);
+
+	state->per_node[requesting_node].refcnt--;
+	release_mutex(&state->header_lock);
+
+#ifndef NO_DATA_RW_LOCK
+	/* this is intended to make data accessible again */
+	release_rw_lock(&state->data_lock);
+#else
+	notify_data_dependencies(state);
+#endif
+}
+
+int fetch_codelet_input(starpu_buffer_descr *descrs, starpu_data_interface_t *interface, unsigned nbuffers, uint32_t mask)
+{
+	TRACE_START_FETCH_INPUT(NULL);
+
+	uint32_t local_memory_node = get_local_memory_node();
+
+	unsigned index;
+	for (index = 0; index < nbuffers; index++)
+	{
+		int ret;
+		starpu_buffer_descr *descr;
+		data_state *state;
+
+		descr = &descrs[index];
+
+		state = descr->state;
+
+		ret = fetch_data(state, descr->mode);
+		if (STARPU_UNLIKELY(ret))
+			goto enomem;
+
+		memcpy(&interface[index], &state->interface[local_memory_node], 
+				sizeof(starpu_data_interface_t));
+	}
+
+	TRACE_END_FETCH_INPUT(NULL);
+
+	return 0;
+
+enomem:
+	/* try to unreference all the input that were successfully taken */
+	fprintf(stderr, "something went wrong with buffer %d\n", index);
+	push_codelet_output(descrs, index, mask);
+	return -1;
+}
+
+void push_codelet_output(starpu_buffer_descr *descrs, unsigned nbuffers, uint32_t mask)
+{
+	TRACE_START_PUSH_OUTPUT(NULL);
+
+	unsigned index;
+	for (index = 0; index < nbuffers; index++)
+	{
+		release_data(descrs[index].state, mask);
+	}
+
+	TRACE_END_PUSH_OUTPUT(NULL);
+}
+
+int request_data_allocation(data_state *state, uint32_t node)
+{
+	take_mutex(&state->header_lock);
+
+	int ret;
+	ret = allocate_per_node_buffer(state, node);
+	STARPU_ASSERT(ret == 0);
+
+	/* XXX quick and dirty hack */
+	state->per_node[node].automatically_allocated = 0;	
+
+	release_mutex(&state->header_lock);
+
+	return 0;
+}
+
+#ifdef NO_DATA_RW_LOCK
+/* put the current value of the data into RAM */
+/* Continuation run once the posted data request becomes ready: fetch the
+ * data (read mode) to the local node.  _state is the data_state pointer
+ * passed as the request's callback argument. */
+static void _starpu_sync_data_with_mem_continuation(void *_state)
+{
+	int ret;
+	data_state *state = _state;
+
+	ret = fetch_data(state, R);
+	
+	STARPU_ASSERT(!ret);
+}
+#endif
+
+/* Bring the current value of the data back into main memory so that the
+ * application can read it. */
+void starpu_sync_data_with_mem(data_state *state)
+{
+	int ret;
+
+#ifdef NO_DATA_RW_LOCK
+	/* we try to get the data, if we do not succeed immediately, we set a
+ 	* callback function that will be executed automatically when the data is
+ 	* available again, otherwise we fetch the data directly */
+	if (!attempt_to_submit_data_request_from_apps(state, R, _starpu_sync_data_with_mem_continuation, state))
+	{
+		ret = fetch_data(state, R);
+		STARPU_ASSERT(!ret);
+	}
+#else
+	ret = fetch_data(state, R);
+	STARPU_ASSERT(!ret);
+#endif
+}
+
+/* in case the application did modify the data ... invalidate all other copies  */
+void notify_data_modification(data_state *state, uint32_t modifying_node)
+{
+	/* this may block .. XXX */
+#ifndef NO_DATA_RW_LOCK
+	take_rw_lock_write(&state->data_lock);
+#else
+#warning notify_data_modification is not supported with NO_DATA_RW_LOCK yet
+#endif
+
+	take_mutex(&state->header_lock);
+
+	/* the modifying node becomes the single OWNER, everyone else INVALID */
+	unsigned node = 0;
+	for (node = 0; node < MAXNODES; node++)
+	{
+		state->per_node[node].state =
+			(node == modifying_node?OWNER:INVALID);
+	}
+
+	release_mutex(&state->header_lock);
+#ifndef NO_DATA_RW_LOCK
+	release_rw_lock(&state->data_lock);
+#endif
+}
+
+/* NB : this value can only be an indication of the status of a data
+	at some point, but there is no strong garantee ! */
+unsigned is_data_present_or_requested(data_state *state, uint32_t node)
+{
+	unsigned ret = 0;
+
+// XXX : this is just a hint, so we don't take the lock ...
+//	take_mutex(&state->header_lock);
+
+	if (state->per_node[node].state != INVALID 
+		|| state->per_node[node].requested)
+		ret = 1;
+
+//	release_mutex(&state->header_lock);
+
+	return ret;
+}
+
+inline void set_data_requested_flag_if_needed(data_state *state, uint32_t node)
+{
+// XXX : this is just a hint, so we don't take the lock ...
+//	take_mutex(&state->header_lock);
+
+	if (state->per_node[node].state == INVALID) 
+		state->per_node[node].requested = 1;
+
+//	release_mutex(&state->header_lock);
+}

+ 160 - 0
src/datawizard/coherency.h

@@ -0,0 +1,160 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __COHERENCY__H__
+#define __COHERENCY__H__
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <errno.h>
+
+#include <starpu.h>
+
+#include <starpu-mutex.h>
+#include <common/rwlock.h>
+#include <common/timing.h>
+#include <common/fxt.h>
+#include <common/list.h>
+
+#include <datawizard/data_parameters.h>
+#include <datawizard/data_request.h>
+#include <datawizard/interfaces/data_interface.h>
+#include <datawizard/progress.h>
+#include <datawizard/datastats.h>
+
+/* MSI-like coherency state of one per-node replicate of a piece of data */
+typedef enum {
+//	MODIFIED,
+	OWNER,
+	SHARED,
+	INVALID
+} cache_state;
+
+/* this should contain the information relative to a given node */
+typedef struct local_data_state_t {
+	/* describes the state of the local data in term of coherency */
+	cache_state	state; 
+
+	/* number of ongoing accesses pinning this replicate */
+	uint32_t refcnt;
+
+	/* is the data locally allocated ? */
+	uint8_t allocated; 
+	/* was it automatically allocated ? */
+	/* perhaps the allocation was perform higher in the hiearchy 
+	 * for now this is just translated into !automatically_allocated
+	 * */
+	uint8_t automatically_allocated;
+
+	/* To help the scheduling policies to make some decision, we
+	   may keep a track of the tasks that are likely to request 
+	   this data on the current node.
+	   It is the responsability of the scheduling _policy_ to set that
+	   flag when it assigns a task to a queue, policies which do not
+	   use this hint can simply ignore it.
+	 */
+	uint8_t requested;
+} local_data_state;
+
+#ifdef NO_DATA_RW_LOCK
+/* Everyone that wants to access some piece of data will post a request.
+ * Not only StarPU internals, but also the application may put such requests */
+
+LIST_TYPE(data_requester,
+	/* what kind of access is requested ? */
+	starpu_access_mode mode;
+
+	unsigned is_requested_by_codelet;
+
+	/* in case this is a codelet that will do the access */
+	struct job_s *j;
+	unsigned buffer_index;
+
+	/* if this is more complicated ... (eg. application request) 
+	 * NB: this callback is not called with the lock taken !
+	 */
+	void (*ready_data_callback)(void *argcb);
+	void *argcb;
+);
+
+#endif
+
+/* per-data descriptor: coherency states, per-node replicates and layout */
+typedef struct starpu_data_state_t {
+#ifdef NO_DATA_RW_LOCK
+	data_requester_list_t req_list;
+	/* the number of requests currently in the scheduling engine
+	 * (not in the req_list anymore) */
+	unsigned refcnt;
+	starpu_access_mode current_mode;
+#else
+	/* protect the data itself */
+	rw_lock	data_lock;
+#endif
+	/* protect meta data */
+	starpu_mutex header_lock;
+
+	uint32_t nnodes; /* the number of memory nodes that may use it */
+	struct starpu_data_state_t *children;
+	int nchildren;
+
+	/* describe the state of the data in term of coherency */
+	local_data_state per_node[MAXNODES];
+
+	/* describe the actual data layout */
+	starpu_data_interface_t interface[MAXNODES];
+
+	struct data_interface_ops_t *ops;
+
+	/* where is the data home ? -1 if none yet */
+	int data_home;
+
+	/* what is the default write-back mask for that data ? */
+	uint32_t wb_mask;
+
+	/* allows special optimization */
+	uint8_t is_readonly;
+
+	/* in some case, the application may explicitly tell StarPU that a
+ 	 * piece of data is not likely to be used soon again */
+	unsigned is_not_important;
+} data_state;
+
+void display_msi_stats(void);
+
+__attribute__((warn_unused_result))
+int fetch_data(data_state *state, starpu_access_mode mode);
+//void release_data(data_state *state, uint32_t write_through_mask);
+
+__attribute__((warn_unused_result))
+int _fetch_data(data_state *state, uint32_t requesting_node, uint8_t read, uint8_t write);
+
+uint32_t get_data_refcnt(data_state *state, uint32_t node);
+
+void push_codelet_output(starpu_buffer_descr *descrs, unsigned nbuffers, uint32_t mask);
+
+__attribute__((warn_unused_result))
+int fetch_codelet_input(starpu_buffer_descr *descrs, starpu_data_interface_t *interface, unsigned nbuffers, uint32_t mask);
+
+void notify_data_modification(data_state *state, uint32_t modifying_node);
+
+int request_data_allocation(data_state *state, uint32_t node);
+
+unsigned is_data_present_or_requested(data_state *state, uint32_t node);
+
+inline void set_data_requested_flag_if_needed(data_state *state, uint32_t node);
+
+#endif // __COHERENCY__H__

+ 230 - 0
src/datawizard/copy-driver.c

@@ -0,0 +1,230 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <core/policies/sched_policy.h>
+#include <datawizard/datastats.h>
+#include <common/fxt.h>
+#include "copy-driver.h"
+#include "memalloc.h"
+
+mem_node_descr descr;
+static pthread_key_t memory_node_key;
+
+/* Register a new memory node of the given kind and return its identifier.
+ * NOTE(review): only the id allocation is atomic; descr.nodes[] and
+ * descr.queues_count[] are written without a lock — confirm that callers
+ * (driver init) are serialized. */
+unsigned register_memory_node(node_kind kind)
+{
+	unsigned nnodes;
+	/* ATOMIC_ADD returns the new value ... */
+	nnodes = STARPU_ATOMIC_ADD(&descr.nnodes, 1);
+
+	descr.nodes[nnodes-1] = kind;
+	TRACE_NEW_MEM_NODE(nnodes-1);
+
+	/* for now, there is no queue related to that newly created node */
+	descr.queues_count[nnodes-1] = 0;
+
+	return (nnodes-1);
+}
+
+
+/* TODO move in a more appropriate file */
+/* attach a queue to a memory node */
+void memory_node_attach_queue(struct jobq_s *q, unsigned nodeid)
+{
+	unsigned nqueues;
+	nqueues = STARPU_ATOMIC_ADD(&descr.queues_count[nodeid], 1);
+
+	descr.attached_queues[nodeid][nqueues-1] = q;
+}
+
+void wake_all_blocked_workers_on_node(unsigned nodeid)
+{
+	/* wake up all queues on that node */
+	unsigned q_id;
+	unsigned nqueues = descr.queues_count[nodeid];
+	for (q_id = 0; q_id < nqueues; q_id++)
+	{
+		struct jobq_s *q;
+		q  = descr.attached_queues[nodeid][q_id];
+
+		/* wake anybody waiting on that queue */
+		pthread_mutex_lock(&q->activity_mutex);
+		pthread_cond_broadcast(&q->activity_cond);
+		pthread_mutex_unlock(&q->activity_mutex);
+	}
+}
+
+void wake_all_blocked_workers(void)
+{
+	/* workers may be blocked on the policy's global condition */
+	struct sched_policy_s *sched = get_sched_policy();
+	pthread_cond_t *sched_cond = &sched->sched_activity_cond;
+	pthread_mutex_t *sched_mutex = &sched->sched_activity_mutex;
+
+	pthread_mutex_lock(sched_mutex);
+	pthread_cond_broadcast(sched_cond);
+	pthread_mutex_unlock(sched_mutex);
+
+	/* workers may be blocked on the various queues' conditions */
+	unsigned node;
+	for (node = 0; node < descr.nnodes; node++)
+	{
+		wake_all_blocked_workers_on_node(node);
+	}
+}
+
+void init_memory_nodes()
+{
+	/* there is no node yet, subsequent nodes will be 
+	 * added using register_memory_node */
+	descr.nnodes = 0;
+
+	pthread_key_create(&memory_node_key, NULL);
+
+	unsigned i;
+	for (i = 0; i < MAXNODES; i++) 
+	{
+		descr.nodes[i] = UNUSED; 
+	}
+
+	init_mem_chunk_lists();
+	init_data_request_lists();
+}
+
+void set_local_memory_node_key(unsigned *node)
+{
+	pthread_setspecific(memory_node_key, node);
+}
+
+unsigned get_local_memory_node(void)
+{
+	unsigned *memory_node;
+	memory_node = pthread_getspecific(memory_node_key);
+	
+	/* in case this is called by the programmer, we assume the RAM node 
+	   is the appropriate memory node ... so we return 0 XXX */
+	if (STARPU_UNLIKELY(!memory_node))
+		return 0;
+
+	return *memory_node;
+}
+
+/* Kind of memory (RAM / CUDA_RAM / ...) embodied by the given node.
+ * NOTE(review): plain "inline" (no static/extern) relies on gnu89 inline
+ * semantics; under strict C99 the external definition is only provided
+ * because copy-driver.h declares the function without inline — confirm
+ * the build dialect. */
+inline node_kind get_node_kind(uint32_t node)
+{
+	return descr.nodes[node];
+}
+
+int allocate_per_node_buffer(data_state *state, uint32_t node)
+{
+	int ret;
+
+	if (!state->per_node[node].allocated) {
+		/* there is no room available for the data yet */
+		ret = allocate_memory_on_node(state, node);
+		if (STARPU_UNLIKELY(ret == -ENOMEM))
+			goto nomem;
+	}
+
+	return 0;
+nomem:
+	/* there was not enough memory to allocate the buffer */
+	return -ENOMEM;
+}
+
+#ifdef USE_FXT
+/* we need to identify each communication so that we can match the beginning
+ * and the end of a communication in the trace, so we use a unique identifier
+ * per communication */
+static unsigned communication_cnt = 0;
+#endif
+
+/* Copy the data from src_node to dst_node, making sure the destination
+ * buffer is allocated first.  When donotread is set (the content is about
+ * to be completely overwritten) only the allocation is performed.
+ * Returns 0 on success, -ENOMEM if the destination allocation failed,
+ * otherwise the interface's copy method return code. */
+int __attribute__((warn_unused_result)) driver_copy_data_1_to_1(data_state *state, uint32_t src_node, 
+				uint32_t dst_node, unsigned donotread)
+{
+	int ret_alloc, ret_copy;
+	unsigned __attribute__((unused)) com_id = 0;
+
+	/* first make sure the destination has an allocated buffer */
+	ret_alloc = allocate_per_node_buffer(state, dst_node);
+	if (ret_alloc)
+		goto nomem;
+
+	/* if there is no need to actually read the data, 
+	 * we do not perform any transfer */
+	if (!donotread) {
+		STARPU_ASSERT(state->ops);
+		STARPU_ASSERT(state->ops->copy_data_1_to_1);
+
+#ifdef DATA_STATS
+		size_t size = state->ops->get_size(state);
+		update_comm_ammount(src_node, dst_node, size);
+#endif
+		
+#ifdef USE_FXT
+		com_id = STARPU_ATOMIC_ADD(&communication_cnt, 1);
+#endif
+
+		/* for now we set the size to 0 in the FxT trace XXX */
+		TRACE_START_DRIVER_COPY(src_node, dst_node, 0, com_id);
+		ret_copy = state->ops->copy_data_1_to_1(state, src_node, dst_node);
+		TRACE_END_DRIVER_COPY(src_node, dst_node, 0, com_id);
+
+		return ret_copy;
+	}
+
+	return 0;
+
+nomem:
+	return -ENOMEM;
+}
+
+static uint32_t choose_src_node(uint32_t src_node_mask)
+{
+	unsigned src_node = 0;
+	unsigned i;
+
+	/* first find the node that will be the actual source */
+	for (i = 0; i < MAXNODES; i++)
+	{
+		if (src_node_mask & (1<<i))
+		{
+			/* this is a potential candidate */
+			src_node = i;
+
+			/* however GPU are expensive sources, really !
+			 * 	other should be ok */
+			if (descr.nodes[i] != CUDA_RAM)
+				break;
+
+			/* XXX do a better algorithm to distribute the memory copies */
+		}
+	}
+
+	return src_node;
+}
+
+__attribute__((warn_unused_result))
+int driver_copy_data(data_state *state, uint32_t src_node_mask,
+			 uint32_t dst_node, unsigned donotread)
+{
+	int ret;
+	uint32_t src_node = choose_src_node(src_node_mask);
+
+	/* possibly returns -1 if there was no memory left */
+	ret = driver_copy_data_1_to_1(state, src_node, dst_node, donotread);
+
+	return ret;
+}

+ 67 - 0
src/datawizard/copy-driver.h

@@ -0,0 +1,67 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __COPY_DRIVER_H__
+#define __COPY_DRIVER_H__
+
+#include "coherency.h"
+#include "memalloc.h"
+
+#ifdef USE_CUDA
+#include <cublas.h>
+#endif
+
+
+/* kind of memory embodied by a node */
+typedef enum {
+	UNUSED,
+	SPU_LS,
+	RAM,
+	CUDA_RAM
+} node_kind;
+
+/* global table describing every registered memory node */
+typedef struct {
+	unsigned nnodes;
+	node_kind nodes[MAXNODES];
+
+	/* the list of queues that are attached to a given node */
+	// XXX 32 is set randomly !
+	struct jobq_s *attached_queues[MAXNODES][32];
+	/* the number of queues attached to each node */
+	unsigned queues_count[MAXNODES];
+} mem_node_descr;
+
+struct starpu_data_state_t;
+
+__attribute__((warn_unused_result))
+int driver_copy_data(struct starpu_data_state_t *state, uint32_t src_node_mask, uint32_t dst_node, unsigned donotread);
+
+void init_memory_nodes(void);
+void set_local_memory_node_key(unsigned *node);
+unsigned get_local_memory_node(void);
+unsigned register_memory_node(node_kind kind);
+void memory_node_attach_queue(struct jobq_s *q, unsigned nodeid);
+void wake_all_blocked_workers(void);
+void wake_all_blocked_workers_on_node(unsigned nodeid);
+
+node_kind get_node_kind(uint32_t node);
+
+__attribute__((warn_unused_result))
+int driver_copy_data_1_to_1(struct starpu_data_state_t *state, uint32_t node, 
+				uint32_t requesting_node, unsigned donotread);
+
+int allocate_per_node_buffer(struct starpu_data_state_t *state, uint32_t node);
+
+#endif // __COPY_DRIVER_H__

+ 22 - 0
src/datawizard/data_parameters.h

@@ -0,0 +1,22 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DATA_PARAMETERS_H__
+#define __DATA_PARAMETERS_H__
+
+#define MAXNODES	6
+
+#endif // __DATA_PARAMETERS_H__

+ 111 - 0
src/datawizard/data_request.c

@@ -0,0 +1,111 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <datawizard/data_request.h>
+
+/* one pending-request list per memory node, each guarded by its own mutex */
+static data_request_list_t data_requests[MAXNODES];
+static starpu_mutex data_requests_mutex[MAXNODES];
+
+/* Create the (empty) per-node request lists; called once at startup. */
+void init_data_request_lists(void)
+{
+	unsigned i;
+	for (i = 0; i < MAXNODES; i++)
+	{
+		data_requests[i] = data_request_list_new();
+		init_mutex(&data_requests_mutex[i]);
+	}
+}
+
+/* Post a copy request (src_node -> dst_node) on src_node's list, then wait
+ * for a worker driving that node to perform the transfer, while keeping
+ * the local node progressing to avoid deadlocks.
+ * Returns the transfer's return code.
+ * NB: called with the data's header lock held (see the RW-lock-less path:
+ * it is dropped around the wait and re-taken afterwards). */
+int post_data_request(data_state *state, uint32_t src_node, uint32_t dst_node)
+{
+	int retvalue;
+
+	data_request_t r = data_request_new();
+
+	r->state = state;
+	r->src_node = src_node;
+	r->dst_node = dst_node;
+	sem_init(&r->sem, 0, 0);
+
+	/* insert the request in the proper list */
+	take_mutex(&data_requests_mutex[src_node]);
+	data_request_list_push_front(data_requests[src_node], r);
+	release_mutex(&data_requests_mutex[src_node]);
+
+	/* wake the threads that could perform that operation */
+	wake_all_blocked_workers_on_node(src_node);
+
+	/* wait for the request to be performed */
+	//sem_wait(&r->sem);
+	//while(sem_trywait(&r->sem) == -1)
+	//	wake_all_blocked_workers_on_node(src_node);
+
+#ifdef NO_DATA_RW_LOCK
+	/* XXX: since there is no concurrency on this data (we don't use the
+	 * rw-lock) we can assume that the data on the source node should not
+	 * be invalidated.
+	 * TODO: handle the situation of a possible invalidation caused by
+	 * memory eviction mechanism. This could be done by the means of a
+	 * specific state (or flag) in the MSI protocol. */
+	release_mutex(&state->header_lock);
+#endif
+
+	/* busy-wait on the semaphore, but keep making progress locally so
+	 * that cross-node requests cannot deadlock each other */
+	while(sem_trywait(&r->sem) == -1)
+	{
+		wake_all_blocked_workers_on_node(src_node);
+		datawizard_progress(dst_node);
+	}
+
+#ifdef NO_DATA_RW_LOCK
+	take_mutex(&state->header_lock);
+#endif
+
+
+	retvalue = r->retval;
+	
+	/* the request is useless now */
+	data_request_delete(r);
+
+	return retvalue;	
+}
+
+/* Serve every pending copy request whose source is src_node; meant to be
+ * called by the worker driving that node (from datawizard_progress). */
+void handle_node_data_requests(uint32_t src_node)
+{
+	take_mutex(&data_requests_mutex[src_node]);
+
+	/* for all entries of the list */
+	data_request_list_t l = data_requests[src_node];
+	data_request_t r;
+
+	while (!data_request_list_empty(l))
+	{
+		r = data_request_list_pop_back(l);		
+		/* drop the list mutex during the (possibly long) transfer so
+		 * that new requests can still be posted meanwhile */
+		release_mutex(&data_requests_mutex[src_node]);
+
+		/* TODO : accounting to see how much time was spent working for other people ... */
+
+		/* perform the transfer */
+		/* the header of the data must be locked by the worker that submitted the request */
+		r->retval = driver_copy_data_1_to_1(r->state, r->src_node, r->dst_node, 0);
+		
+		/* wake the requesting worker up */
+		sem_post(&r->sem);
+
+		take_mutex(&data_requests_mutex[src_node]);
+	}
+
+	release_mutex(&data_requests_mutex[src_node]);
+}

+ 39 - 0
src/datawizard/data_request.h

@@ -0,0 +1,39 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DATA_REQUEST_H__
+#define __DATA_REQUEST_H__
+
+#include <semaphore.h>
+#include <datawizard/coherency.h>
+#include <datawizard/copy-driver.h>
+#include <common/list.h>
+
+struct starpu_data_state_t;
+
+/* a pending asynchronous copy of 'state' from src_node to dst_node */
+LIST_TYPE(data_request,
+	struct starpu_data_state_t *state;
+	uint32_t src_node;
+	uint32_t dst_node;
+	/* posted by the worker that performed the copy, once it is done */
+	sem_t sem;
+	/* return code of the transfer (e.g. -ENOMEM) */
+	int retval;
+);
+
+void init_data_request_lists(void);
+int post_data_request(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
+void handle_node_data_requests(uint32_t src_node);
+
+#endif // __DATA_REQUEST_H__

+ 128 - 0
src/datawizard/datastats.c

@@ -0,0 +1,128 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdio.h>
+#include <datawizard/datastats.h>
+#include <common/config.h>
+#include <starpu.h>
+
+/* measure the cache hit ratio for each node */
+
+#ifdef DATA_STATS
+static unsigned hit_cnt[16];
+static unsigned miss_cnt[16];
+#endif
+
/* Account an MSI cache hit on the node (no-op unless DATA_STATS is set).
 * Not "inline": datastats.h also declares it inline, so under C99 inline
 * semantics no external definition would be emitted and callers in other
 * compilation units would fail to link. */
void msi_cache_hit(unsigned node __attribute__ ((unused)))
{
#ifdef DATA_STATS
	hit_cnt[node]++;
#endif
}
+
/* Account an MSI cache miss on the node (no-op unless DATA_STATS is set).
 * Not "inline": an inline-only C99 definition (the header declares it
 * inline too) provides no external definition for other units. */
void msi_cache_miss(unsigned node __attribute__ ((unused)))
{
#ifdef DATA_STATS
	miss_cnt[node]++;
#endif
}
+
/* Print per-node MSI hit/miss counters to stderr (DATA_STATS builds only). */
void display_msi_stats(void)
{
#ifdef DATA_STATS
	fprintf(stderr, "MSI cache stats :\n");
	unsigned node;
	/* NOTE(review): only the first 4 nodes are displayed although the
	 * counters are sized for 16 — confirm the intended bound */
	for (node = 0; node < 4; node++) 
	{
		if (hit_cnt[node]+miss_cnt[node]) 
		{
			/* "%u" matches the unsigned arguments; a literal '%'
			 * is "%%" — the former "\%%" was an invalid escape
			 * followed by a broken conversion specification */
			fprintf(stderr, "memory node %u\n", node);
			fprintf(stderr, "\thit : %u (%2.2f %%)\n", hit_cnt[node], (100.0f*hit_cnt[node])/(hit_cnt[node]+miss_cnt[node]));
			fprintf(stderr, "\tmiss : %u (%2.2f %%)\n", miss_cnt[node], (100.0f*miss_cnt[node])/(hit_cnt[node]+miss_cnt[node]));
		}
	}
#endif
}
+
+/* measure the efficiency of our allocation cache */
+
+#ifdef DATA_STATS
+static unsigned alloc_cnt[16];
+static unsigned alloc_cache_hit_cnt[16];
+#endif
+
/* Account a hit in the buffer allocation cache (no-op unless DATA_STATS).
 * Not "inline": see msi_cache_hit — C99 inline-only definitions emit no
 * external symbol for callers in other compilation units. */
void allocation_cache_hit(unsigned node __attribute__ ((unused)))
{
#ifdef DATA_STATS
	alloc_cache_hit_cnt[node]++;
#endif
}
+
/* Account one buffer allocation on the node (no-op unless DATA_STATS).
 * Not "inline": an inline-only C99 definition provides no external
 * definition, breaking the link from other compilation units. */
void data_allocation_inc_stats(unsigned node __attribute__ ((unused)))
{
#ifdef DATA_STATS
	alloc_cnt[node]++;
#endif
}
+
/* Print per-node allocation-cache efficiency to stderr (DATA_STATS only). */
void display_alloc_cache_stats(void)
{
#ifdef DATA_STATS
	fprintf(stderr, "Allocation cache stats:\n");
	unsigned node;
	for (node = 0; node < 4; node++) 
	{
		if (alloc_cnt[node]) 
		{
			/* "%u" matches the unsigned arguments; "%%" (not the
			 * former invalid "\%%") prints a literal '%' */
			fprintf(stderr, "memory node %u\n", node);
			fprintf(stderr, "\ttotal alloc : %u\n", alloc_cnt[node]);
			fprintf(stderr, "\tcached alloc: %u (%2.2f %%)\n", 
				alloc_cache_hit_cnt[node], (100.0f*alloc_cache_hit_cnt[node])/(alloc_cnt[node]));
		}
	}
#endif
}
+
/* measure the amount of data transfers between each pair of nodes */
#ifdef DATA_STATS

/* total bytes transferred from [src] to [dst]
 * NOTE(review): hard-coded 8x8 while other tables use MAXNODES — confirm */
static size_t comm_ammount[8][8];

/* Dump the full src x dst transfer matrix (in MB) to stderr. */
void display_comm_ammounts(void)
{
	unsigned src, dst;

	for (dst = 0; dst < 8; dst++)
	for (src = 0; src < 8; src++)
	{
		if (comm_ammount[src][dst])
			/* "%u" matches the unsigned node ids and the casted
			 * byte count (the former "%d" was a type mismatch) */
			fprintf(stderr, "Total comm from %u to %u \t%uMB\n", src, dst, ((unsigned)comm_ammount[src][dst])/(1024*1024));
	}
}

/* Account 'size' bytes transferred from src_node to dst_node.
 * Not "inline": the header declares it inline as well, so a C99
 * inline-only definition would emit no external symbol for callers
 * in other compilation units. */
void update_comm_ammount(uint32_t src_node, uint32_t dst_node, size_t size)
{
	comm_ammount[src_node][dst_node] += size;
}

#else

/* stub when statistics are disabled */
void display_comm_ammounts(void)
{
}

#endif
+

+ 40 - 0
src/datawizard/datastats.h

@@ -0,0 +1,40 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DATASTATS_H__
+#define __DATASTATS_H__
+
+#include <stdint.h>
+#include <stdlib.h>
+
+/* Statistics helpers for the data management layer: MSI protocol cache
+ * hits/misses, allocation-cache efficiency, and the amount of data moved
+ * between memory nodes.  The counters behind these calls are only compiled
+ * in when DATA_STATS is defined; otherwise the calls are no-ops.
+ * NOTE(review): declaring functions "inline" in a header without a visible
+ * definition is fragile under C99 inline semantics -- confirm this links
+ * correctly with the project's compiler flags. */
+
+/* MSI cache counters */
+inline void msi_cache_hit(unsigned node);
+inline void msi_cache_miss(unsigned node);
+
+void display_msi_stats(void);
+
+/* allocation cache counters */
+inline void allocation_cache_hit(unsigned node __attribute__ ((unused)));
+inline void data_allocation_inc_stats(unsigned node __attribute__ ((unused)));
+
+
+/* print the statistics gathered so far on stderr */
+void display_comm_ammounts(void);
+void display_alloc_cache_stats(void);
+
+#ifdef DATA_STATS
+/* record a "size"-byte transfer from src_node to dst_node */
+inline void update_comm_ammount(uint32_t src_node, uint32_t dst_node, size_t size);
+#endif
+
+#endif

+ 41 - 0
src/datawizard/datawizard.h

@@ -0,0 +1,41 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DATAWIZARD_H__
+#define __DATAWIZARD_H__
+
+#include <datawizard/coherency.h>
+#include <datawizard/hierarchy.h>
+#include <datawizard/copy-driver.h>
+#include <datawizard/footprint.h>
+
+#include <datawizard/progress.h>
+#include <datawizard/data_request.h>
+
+#include <datawizard/interfaces/data_interface.h>
+
+#include <datawizard/interfaces/blas_interface.h>
+#include <datawizard/interfaces/vector_interface.h>
+#include <datawizard/interfaces/csr_interface.h>
+#include <datawizard/interfaces/csc_interface.h>
+#include <datawizard/interfaces/bcsr_interface.h>
+
+#include <datawizard/interfaces/blas_filters.h>
+#include <datawizard/interfaces/vector_filters.h>
+#include <datawizard/interfaces/csr_filters.h>
+#include <datawizard/interfaces/bcsr_filters.h>
+
+#endif // __DATAWIZARD_H__

+ 45 - 0
src/datawizard/footprint.c

@@ -0,0 +1,45 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <datawizard/footprint.h>
+
+/* Fold the footprint of every buffer accessed by job "j" into one 32-bit
+ * hash and cache it on the job; footprint_is_computed is raised so the
+ * value is not recomputed later. */
+void compute_buffers_footprint(job_t j)
+{
+	struct starpu_task *task = j->task;
+	uint32_t hash = 0;
+	unsigned i;
+
+	for (i = 0; i < task->cl->nbuffers; i++)
+	{
+		data_state *state = task->buffers[i].state;
+
+		/* every registered interface must provide a footprint method */
+		STARPU_ASSERT(state->ops);
+		STARPU_ASSERT(state->ops->footprint);
+
+		hash = state->ops->footprint(state, hash);
+	}
+
+	j->footprint = hash;
+	j->footprint_is_computed = 1;
+}
+
+/* Compute the footprint of a single piece of data, seeding the hash with
+ * the id of its interface.
+ * NOTE(review): compute_buffers_footprint() seeds with 0 while this seeds
+ * with interfaceid -- confirm the asymmetry is intentional. */
+inline uint32_t compute_data_footprint(data_state *state)
+{
+	uint32_t interfaceid = state->ops->interfaceid;
+
+	return state->ops->footprint(state, interfaceid);
+}

+ 27 - 0
src/datawizard/footprint.h

@@ -0,0 +1,27 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __FOOTPRINT_H__
+#define __FOOTPRINT_H__
+
+#include <core/jobs.h>
+
+struct job_s;
+
+void compute_buffers_footprint(struct job_s *j);
+inline uint32_t compute_data_footprint(data_state *state);
+
+#endif // __FOOTPRINT_H__

+ 327 - 0
src/datawizard/hierarchy.c

@@ -0,0 +1,327 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "hierarchy.h"
+
+/* 
+ * Stop monitoring a data
+ */
+/* TODO : move in a more appropriate file */
+/* Release the resources attached to a monitored piece of data: request a
+ * lazy removal of every local copy the runtime allocated itself and, in
+ * NO_DATA_RW_LOCK mode, destroy the pending-request list.
+ * NOTE(review): the data_state structure itself is not freed here --
+ * confirm that the caller owns that memory. */
+void starpu_delete_data(data_state *state)
+{
+	unsigned node;
+
+	STARPU_ASSERT(state);
+	for (node = 0; node < MAXNODES; node++)
+	{
+		local_data_state *local = &state->per_node[node];
+
+		if (local->allocated && local->automatically_allocated){
+			/* free the data copy in a lazy fashion */
+			request_mem_chunk_removal(state, node);
+		}
+	}
+
+#ifdef NO_DATA_RW_LOCK
+	data_requester_list_delete(state->req_list);
+#endif
+}
+
+/* Start monitoring a piece of data: initialize its locks and per-node MSI
+ * state.  "home_node" is the only node that initially holds a valid copy
+ * (marked OWNER, every other node INVALID); "wb_mask" selects the nodes the
+ * data must be written back to.  The header lock is held while the fields
+ * are set up so no one can observe a half-initialized descriptor. */
+void monitor_new_data(data_state *state, uint32_t home_node, uint32_t wb_mask)
+{
+	STARPU_ASSERT(state);
+
+	/* initialize the new lock */
+#ifndef NO_DATA_RW_LOCK
+	init_rw_lock(&state->data_lock);
+#else
+	state->req_list = data_requester_list_new();
+	state->refcnt = 0;
+#endif
+	init_mutex(&state->header_lock);
+
+	/* first take care to properly lock the data */
+	take_mutex(&state->header_lock);
+
+	/* we assume that all nodes may use that data */
+	state->nnodes = MAXNODES;
+
+	/* there is no hierarchy yet */
+	state->nchildren = 0;
+
+	state->is_not_important = 0;
+
+	/* make sure we do have a valid copy */
+	STARPU_ASSERT(home_node < MAXNODES);
+
+	state->wb_mask = wb_mask;
+
+	/* that new data is invalid from all nodes' perspective except for the
+	 * home node */
+	unsigned node;
+	for (node = 0; node < MAXNODES; node++)
+	{
+		if (node == home_node) {
+			/* this is the home node with the only valid copy */
+			state->per_node[node].state = OWNER;
+			state->per_node[node].allocated = 1;
+			state->per_node[node].automatically_allocated = 0;
+			state->per_node[node].refcnt = 0;
+		}
+		else {
+			/* the value is not available here yet */
+			state->per_node[node].state = INVALID;
+			state->per_node[node].allocated = 0;
+			state->per_node[node].refcnt = 0;
+		}
+	}
+
+	/* now the data is available ! */
+	release_mutex(&state->header_lock);
+}
+
+/*
+ * This function applies a starpu_filter on all the elements of a partition
+ */
+/* Recursively walk the partition tree below "root_data" and apply the
+ * starpu_filter "f" to every leaf (i.e. every node without children). */
+static void map_filter(data_state *root_data, starpu_filter *f)
+{
+	if (root_data->nchildren == 0)
+	{
+		/* leaf: apply the filter right here */
+		starpu_partition_data(root_data, f);
+		return;
+	}
+
+	/* inner node: recurse into every child */
+	int i;
+	for (i = 0; i < root_data->nchildren; i++)
+		map_filter(&root_data->children[i], f);
+}
+
+/* Apply "nfilters" filters -- passed as variadic starpu_filter pointers --
+ * to every leaf of the partition tree rooted at "root_data", in the order
+ * they are given. */
+void starpu_map_filters(data_state *root_data, unsigned nfilters, ...)
+{
+	unsigned i;
+	va_list pa;
+	va_start(pa, nfilters);
+	for (i = 0; i < nfilters; i++)
+	{
+		starpu_filter *next_filter;
+		next_filter = va_arg(pa, starpu_filter *);
+
+		STARPU_ASSERT(next_filter);
+
+		map_filter(root_data, next_filter);
+	}
+	va_end(pa);
+}
+
+/*
+ * example get_sub_data(data_state *root_data, 3, 42, 0, 1);
+ */
+/* Descend "depth" levels into the partition tree, one variadic child index
+ * per level, and return the corresponding sub-data.  The number of variadic
+ * arguments must equal "depth"; each index is checked against nchildren. */
+data_state *get_sub_data(data_state *root_data, unsigned depth, ... )
+{
+	STARPU_ASSERT(root_data);
+	data_state *current_data = root_data;
+
+	/* the variable number of argument must correlate the depth in the tree */
+	unsigned i; 
+	va_list pa;
+	va_start(pa, depth);
+	for (i = 0; i < depth; i++)
+	{
+		unsigned next_child;
+		next_child = va_arg(pa, unsigned);
+
+		STARPU_ASSERT((int)next_child < current_data->nchildren);
+
+		current_data = &current_data->children[next_child];
+	}
+	va_end(pa);
+
+	return current_data;
+}
+
+/*
+ * For now, we assume that partitionned_data is already properly allocated;
+ * at least by the starpu_filter function !
+ */
+/* Split "initial_data" into children according to the starpu_filter "f".
+ * The filter function is expected to have allocated/populated the children
+ * array and returns the number of parts; each child then inherits the
+ * parent's per-node coherency state, write-back mask and (unless the filter
+ * overrode them) interface methods.  The parent's header lock is held for
+ * the whole operation. */
+void starpu_partition_data(data_state *initial_data, starpu_filter *f)
+{
+	int nparts;
+	int i;
+
+	/* first take care to properly lock the data header */
+	take_mutex(&initial_data->header_lock);
+
+	/* there should not be multiple filters applied on the same data */
+	STARPU_ASSERT(initial_data->nchildren == 0);
+
+	/* this should update the pointers and size of the chunk */
+	nparts = f->filter_func(f, initial_data);
+	STARPU_ASSERT(nparts > 0);
+
+	initial_data->nchildren = nparts;
+
+	for (i = 0; i < nparts; i++)
+	{
+		data_state *children = &initial_data->children[i];
+
+		STARPU_ASSERT(children);
+
+		children->nchildren = 0;
+
+		children->is_not_important = initial_data->is_not_important;
+
+		/* it is possible that the children does not use the same interface as the parent,
+		 * in that case, the starpu_filter must set the proper methods */
+		if (!children->ops)
+			children->ops = initial_data->ops;
+
+		children->wb_mask = initial_data->wb_mask;
+
+		/* initialize the chunk lock */
+#ifndef NO_DATA_RW_LOCK
+		init_rw_lock(&children->data_lock);
+#else
+		children->req_list = data_requester_list_new();
+		children->refcnt = 0;
+#endif
+		init_mutex(&children->header_lock);
+
+		/* children start with the parent's coherency state on every node */
+		unsigned node;
+		for (node = 0; node < MAXNODES; node++)
+		{
+			children->per_node[node].state = 
+				initial_data->per_node[node].state;
+			children->per_node[node].allocated = 
+				initial_data->per_node[node].allocated;
+			children->per_node[node].automatically_allocated = initial_data->per_node[node].automatically_allocated;
+			children->per_node[node].refcnt = 0;
+		}
+	}
+
+	/* partitioning is done: release the parent's header lock */
+	release_mutex(&initial_data->header_lock);
+}
+
+/* Gather a partitioned piece of data back into a single one: fetch a valid
+ * copy of every child on "gathering_node", recompute the per-node coherency
+ * state of the parent (OWNER if exactly one node still has a valid copy,
+ * SHARED otherwise), then drop the children. */
+void starpu_unpartition_data(data_state *root_data, uint32_t gathering_node)
+{
+	int child;
+	unsigned node;
+
+	take_mutex(&root_data->header_lock);
+
+#ifdef NO_DATA_RW_LOCK
+#warning starpu_unpartition_data is not supported with NO_DATA_RW_LOCK yet ...
+#endif
+
+	/* first take all the children lock (in order !) */
+	for (child = 0; child < root_data->nchildren; child++)
+	{
+		/* make sure the intermediate children is unpartitionned as well */
+		if (root_data->children[child].nchildren > 0)
+			starpu_unpartition_data(&root_data->children[child], gathering_node);
+
+		int ret;
+		ret = _fetch_data(&root_data->children[child], gathering_node, 1, 0);
+		/* for now we pretend that the RAM is almost unlimited and that gathering 
+		 * data should be possible from the node that does the unpartionning ... we
+		 * don't want to have the programming deal with memory shortage at that time,
+		 * really */
+		STARPU_ASSERT(ret == 0); 
+	}
+
+	/* the gathering_node should now have a valid copy of all the children.
+	 * For all nodes, if the node had all copies and none was locally
+	 * allocated then the data is still valid there, else, it's invalidated
+	 * for the gathering node, if we have some locally allocated data, we 
+	 * copy all the children (XXX this should not happen so we just do not
+	 * do anything since this is transparent ?) */
+	unsigned still_valid[MAXNODES];
+
+	/* we do 2 passes : the first pass determines wether the data is still
+	 * valid or not, the second pass is needed to choose between SHARED and
+	 * OWNER */
+
+	unsigned nvalids = 0;
+
+	/* still valid ? */
+	for (node = 0; node < MAXNODES; node++)
+	{
+		/* until an issue is found the data is assumed to be valid */
+		unsigned isvalid = 1;
+
+		for (child = 0; child < root_data->nchildren; child++)
+		{
+			local_data_state *local = &root_data->children[child].per_node[node];
+
+			if (local->state == INVALID) {
+				isvalid = 0; 
+			}
+	
+			if (local->allocated && local->automatically_allocated){
+				/* free the data copy in a lazy fashion */
+				/* NOTE(review): the copy being examined belongs to
+				 * the child, but the removal is requested on the
+				 * parent -- should this be
+				 * &root_data->children[child] ?  confirm. */
+				request_mem_chunk_removal(root_data, node);
+				isvalid = 0; 
+			}
+		}
+
+		/* no problem was found so the node still has a valid copy */
+		still_valid[node] = isvalid;
+
+		/* BUGFIX: only count nodes that actually kept a valid copy;
+		 * the previous unconditional increment made nvalids equal to
+		 * MAXNODES, so the state below could never become OWNER */
+		if (isvalid)
+			nvalids++;
+	}
+
+	/* either shared or owned */
+	STARPU_ASSERT(nvalids > 0);
+
+	cache_state newstate = (nvalids == 1)?OWNER:SHARED;
+
+	for (node = 0; node < MAXNODES; node++)
+	{
+		root_data->per_node[node].state = 
+			still_valid[node]?newstate:INVALID;
+	}
+
+	/* there is no child anymore */
+	root_data->nchildren = 0;
+
+	/* now the parent may be used again so we release the lock */
+	release_mutex(&root_data->header_lock);
+}
+
+/* Advise the runtime whether a piece of data is important (worth saving
+ * before eviction).  The advice is propagated recursively through the
+ * partition tree while the header lock is held.
+ * NOTE(review): only children that are themselves partitioned are visited;
+ * leaf children keep their old is_not_important flag -- confirm intended. */
+void starpu_advise_if_data_is_important(data_state *state, unsigned is_important)
+{
+	take_mutex(&state->header_lock);
+
+	/* propagate the advice into every partitioned child first */
+	int i;
+	for (i = 0; i < state->nchildren; i++)
+	{
+		data_state *child = &state->children[i];
+		if (child->nchildren > 0)
+			starpu_advise_if_data_is_important(child, is_important);
+	}
+
+	state->is_not_important = (is_important == 0);
+
+	/* the data may be used again: release the header lock */
+	release_mutex(&state->header_lock);
+}

+ 28 - 0
src/datawizard/hierarchy.h

@@ -0,0 +1,28 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __HIERARCHY_H__
+#define __HIERARCHY_H__
+
+#include <stdarg.h>
+#include <datawizard/coherency.h>
+#include <datawizard/memalloc.h>
+
+#include <starpu.h>
+
+void monitor_new_data(struct starpu_data_state_t *state, uint32_t home_node, uint32_t wb_mask);
+
+#endif

+ 29 - 0
src/datawizard/interfaces/Makefile

@@ -0,0 +1,29 @@
+#
+# StarPU
+# Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+OBJS := bcsr_interface.o csr_interface.o blas_filters.o blas_interface.o vector_interface.o bcsr_filters.o csr_filters.o vector_filters.o
+
+all: $(OBJS)
+
+ifeq ($(filter ${MAKECMDGOALS},clean distclean),)
+%.d: %.c
+	$(CC) $(CFLAGS) $< -MM -o $*.d
+
+-include $(OBJS:.o=.d)
+endif
+
+clean:
+	@rm -f *.o *.d *.gcno *.gcda

+ 77 - 0
src/datawizard/interfaces/bcsr_filters.c

@@ -0,0 +1,77 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "bcsr_filters.h"
+#include "bcsr_interface.h"
+#include "blas_filters.h"
+#include "blas_interface.h"
+
+extern struct data_interface_ops_t interface_blas_ops;
+
+/* Canonical BCSR filter: create one child per non-zero (r x c) block of the
+ * sparse matrix; each child is exposed as a small dense BLAS matrix.
+ * Returns the number of children created (== nnz). */
+unsigned starpu_canonical_block_filter_bcsr(starpu_filter *f __attribute__((unused)), data_state *root_data)
+{
+	unsigned nchunks;
+
+	uint32_t nnz = root_data->interface[0].bcsr.nnz;
+
+	size_t elemsize = root_data->interface[0].bcsr.elemsize;
+	uint32_t firstentry = root_data->interface[0].bcsr.firstentry;
+
+	/* size of the tiles */
+	uint32_t r = root_data->interface[0].bcsr.r;
+	uint32_t c = root_data->interface[0].bcsr.c;
+
+	/* we create as many subdata as there are blocks ... */
+	nchunks = nnz;
+	
+	/* first allocate the children data_state (zeroed, so any field not set
+	 * below -- e.g. ptr on nodes without a copy -- starts at 0/NULL) */
+	root_data->children = calloc(nchunks, sizeof(data_state));
+	STARPU_ASSERT(root_data->children);
+
+	/* actually create all the chunks */
+
+	/* XXX */
+	STARPU_ASSERT(root_data->per_node[0].allocated);
+
+	/* each chunk becomes a small dense matrix */
+	unsigned chunk;
+	for (chunk = 0; chunk < nchunks; chunk++)
+	{
+		/* byte offset of this block inside the nzval array */
+		uint32_t ptr_offset = c*r*chunk*elemsize;
+
+		unsigned node;
+		for (node = 0; node < MAXNODES; node++)
+		{
+			starpu_blas_interface_t *local = &root_data->children[chunk].interface[node].blas;
+
+			local->nx = c;
+			local->ny = r;
+			local->ld = c;
+			local->elemsize = elemsize;
+
+			/* the data pointer is only meaningful on nodes that
+			 * already hold a copy of the parent */
+			if (root_data->per_node[node].allocated) {
+				uint8_t *nzval = (uint8_t *)(root_data->interface[node].bcsr.nzval);
+				local->ptr = (uintptr_t)&nzval[firstentry + ptr_offset];
+			}
+		}
+
+		/* the children use the dense BLAS interface, not BCSR */
+		struct starpu_data_state_t *state = &root_data->children[chunk];
+		state->ops = &interface_blas_ops;
+	}
+
+	return nchunks;
+
+}

+ 22 - 0
src/datawizard/interfaces/bcsr_filters.h

@@ -0,0 +1,22 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __BCSR_FILTERS_H__
+#define __BCSR_FILTERS_H__
+
+#include <datawizard/hierarchy.h>
+
+#endif // __BCSR_FILTERS_H__

+ 491 - 0
src/datawizard/interfaces/bcsr_interface.c

@@ -0,0 +1,491 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <datawizard/data_parameters.h>
+#include <datawizard/coherency.h>
+#include <datawizard/copy-driver.h>
+#include <datawizard/hierarchy.h>
+#include <starpu.h>
+
+#include <common/hash.h>
+
+#ifdef USE_CUDA
+#include <cuda.h>
+#endif
+
+/*
+ * BCSR : blocked CSR, we use blocks of size (r x c)
+ */
+/* forward declarations of the BCSR interface methods */
+size_t allocate_bcsr_buffer_on_node(struct starpu_data_state_t *state, uint32_t dst_node);
+void liberate_bcsr_buffer_on_node(starpu_data_interface_t *interface, uint32_t node);
+size_t dump_bcsr_interface(starpu_data_interface_t *interface, void *_buffer);
+int do_copy_bcsr_buffer_1_to_1(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
+size_t bcsr_interface_get_size(struct starpu_data_state_t *state);
+uint32_t footprint_bcsr_interface_crc32(data_state *state, uint32_t hstate);
+
+/* method table registered for every BCSR-monitored piece of data */
+struct data_interface_ops_t interface_bcsr_ops = {
+	.allocate_data_on_node = allocate_bcsr_buffer_on_node,
+	.liberate_data_on_node = liberate_bcsr_buffer_on_node,
+	.copy_data_1_to_1 = do_copy_bcsr_buffer_1_to_1,
+	.dump_data_interface = dump_bcsr_interface,
+	.get_size = bcsr_interface_get_size,
+	.interfaceid = BCSR_INTERFACE,
+	.footprint = footprint_bcsr_interface_crc32
+};
+
+/* Register a BCSR (blocked CSR) sparse matrix with the runtime.  The matrix
+ * has "nnz" non-zero (r x c) blocks over "nrow" block rows; nzval/colind/
+ * rowptr are only valid on "home_node".  A new data_state is allocated and
+ * returned through "handle"; the scalar parameters are replicated on every
+ * node's interface, the pointers only on the home node. */
+void starpu_monitor_bcsr_data(struct starpu_data_state_t **handle, uint32_t home_node,
+		uint32_t nnz, uint32_t nrow, uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry,  uint32_t r, uint32_t c, size_t elemsize)
+{
+	struct starpu_data_state_t *state = calloc(1, sizeof(struct starpu_data_state_t));
+	STARPU_ASSERT(state);
+
+	STARPU_ASSERT(handle);
+	*handle = state;
+
+	unsigned node;
+	for (node = 0; node < MAXNODES; node++)
+	{
+		starpu_bcsr_interface_t *local_interface = &state->interface[node].bcsr;
+
+		if (node == home_node) {
+			local_interface->nzval = nzval;
+			local_interface->colind = colind;
+			local_interface->rowptr = rowptr;
+		}
+		else {
+			local_interface->nzval = 0;
+			local_interface->colind = NULL;
+			local_interface->rowptr = NULL;
+		}
+
+		local_interface->nnz = nnz;
+		local_interface->nrow = nrow;
+		local_interface->firstentry = firstentry;
+		local_interface->r = r;
+		local_interface->c = c;
+		local_interface->elemsize = elemsize;
+	}
+
+	state->ops = &interface_bcsr_ops;
+
+	/* wb_mask is 0: no write-back nodes for BCSR data */
+	monitor_new_data(state, home_node, 0);
+}
+
+/* Fold the BCSR shape parameters (nnz, c, r) into the running hash "hstate"
+ * using the supplied hash function.  Only the shape is hashed, not the
+ * matrix contents. */
+static inline uint32_t footprint_bcsr_interface_generic(uint32_t (*hash_func)(uint32_t input, uint32_t hstate), data_state *state, uint32_t hstate)
+{
+	uint32_t hash;
+
+	hash = hstate;
+	hash = hash_func(starpu_get_bcsr_nnz(state), hash);
+	hash = hash_func(starpu_get_bcsr_c(state), hash);
+	hash = hash_func(starpu_get_bcsr_r(state), hash);
+
+	return hash;
+}
+
+/* CRC32-based footprint, the variant registered in interface_bcsr_ops */
+uint32_t footprint_bcsr_interface_crc32(data_state *state, uint32_t hstate)
+{
+	return footprint_bcsr_interface_generic(crc32_be, state, hstate);
+}
+
+
+
+/* packed on-the-wire layout written by dump_bcsr_interface()
+ * NOTE(review): elemsize is stored as uint32_t while the live interface
+ * carries it as size_t -- values above 2^32 would be truncated on 64-bit
+ * hosts; confirm this is acceptable for the dump consumers. */
+struct dumped_bcsr_interface_s {
+	uint32_t nnz;
+	uint32_t nrow;
+	uintptr_t nzval;
+	uint32_t *colind;
+	uint32_t *rowptr;
+	uint32_t firstentry;
+	uint32_t r;
+	uint32_t c;
+	uint32_t elemsize;
+}  __attribute__ ((packed));
+
+/* Serialize the BCSR interface description into "_buffer", field by field,
+ * and return the number of bytes written. */
+size_t dump_bcsr_interface(starpu_data_interface_t *interface, void *_buffer)
+{
+	/* yes, that's DIRTY ... */
+	struct dumped_bcsr_interface_s *dump = _buffer;
+	starpu_bcsr_interface_t *src = &interface->bcsr;
+
+	dump->nnz = src->nnz;
+	dump->nrow = src->nrow;
+	dump->nzval = src->nzval;
+	dump->colind = src->colind;
+	dump->rowptr = src->rowptr;
+	dump->firstentry = src->firstentry;
+	dump->r = src->r;
+	dump->c = src->c;
+	dump->elemsize = src->elemsize;
+
+	return (sizeof(struct dumped_bcsr_interface_s));
+}
+
+/* offer an access to the data parameters */
+/* The scalar parameters below are replicated identically on every node's
+ * interface (see starpu_monitor_bcsr_data), so node 0 is authoritative. */
+uint32_t starpu_get_bcsr_nnz(struct starpu_data_state_t *state)
+{
+	return (state->interface[0].bcsr.nnz);
+}
+
+uint32_t starpu_get_bcsr_nrow(struct starpu_data_state_t *state)
+{
+	return (state->interface[0].bcsr.nrow);
+}
+
+uint32_t starpu_get_bcsr_firstentry(struct starpu_data_state_t *state)
+{
+	return (state->interface[0].bcsr.firstentry);
+}
+
+/* height of one block */
+uint32_t starpu_get_bcsr_r(struct starpu_data_state_t *state)
+{
+	return (state->interface[0].bcsr.r);
+}
+
+/* width of one block */
+uint32_t starpu_get_bcsr_c(struct starpu_data_state_t *state)
+{
+	return (state->interface[0].bcsr.c);
+}
+
+size_t starpu_get_bcsr_elemsize(struct starpu_data_state_t *state)
+{
+	return (state->interface[0].bcsr.elemsize);
+}
+
+/* Return the nzval pointer of the copy held on the calling worker's local
+ * memory node; the copy must already be allocated there. */
+uintptr_t starpu_get_bcsr_local_nzval(struct starpu_data_state_t *state)
+{
+	unsigned node;
+	node = get_local_memory_node();
+
+	STARPU_ASSERT(state->per_node[node].allocated);
+
+	return (state->interface[node].bcsr.nzval);
+}
+
+/* NOTE(review): unlike nzval above, the two accessors below return the
+ * node-0 pointer instead of the local node's (the per-node version is left
+ * commented out, marked XXX) -- confirm whether this shortcut is still
+ * wanted. */
+uint32_t *starpu_get_bcsr_local_colind(struct starpu_data_state_t *state)
+{
+//	unsigned node;
+//	node = get_local_memory_node();
+//
+//	STARPU_ASSERT(state->per_node[node].allocated);
+//
+//	return (state->interface[node].bcsr.colind);
+
+	/* XXX */
+	return (state->interface[0].bcsr.colind);
+}
+
+uint32_t *starpu_get_bcsr_local_rowptr(struct starpu_data_state_t *state)
+{
+//	unsigned node;
+//	node = get_local_memory_node();
+//
+//	STARPU_ASSERT(state->per_node[node].allocated);
+//
+//	return (state->interface[node].bcsr.rowptr);
+	
+	/* XXX */
+	return (state->interface[0].bcsr.rowptr);
+}
+
+
+/* Total memory needed for one copy of the BCSR matrix: the dense (r x c)
+ * non-zero blocks plus the colind and rowptr index arrays. */
+size_t bcsr_interface_get_size(struct starpu_data_state_t *state)
+{
+	uint32_t nnz = starpu_get_bcsr_nnz(state);
+	uint32_t nrow = starpu_get_bcsr_nrow(state);
+	uint32_t r = starpu_get_bcsr_r(state);
+	uint32_t c = starpu_get_bcsr_c(state);
+	size_t elemsize = starpu_get_bcsr_elemsize(state);
+
+	size_t nzval_size = nnz*r*c*elemsize;
+	size_t index_size = nnz*sizeof(uint32_t) + (nrow+1)*sizeof(uint32_t);
+
+	return nzval_size + index_size;
+}
+
+
+/* memory allocation/deallocation primitives for the BLAS interface */
+
+/* returns the size of the allocated area */
+/* Allocate the three BCSR arrays (nzval, colind, rowptr) on "dst_node" with
+ * the allocator matching that node's kind, and record the new addresses in
+ * the node-local interface.  Returns the number of bytes allocated, or 0 on
+ * failure (in which case every partially allocated array is released). */
+size_t allocate_bcsr_buffer_on_node(struct starpu_data_state_t *state, uint32_t dst_node)
+{
+	uintptr_t addr_nzval;
+	uint32_t *addr_colind, *addr_rowptr;
+	size_t allocated_memory;
+
+	/* we need the 3 arrays to be allocated */
+
+	uint32_t nnz = state->interface[dst_node].bcsr.nnz;
+	uint32_t nrow = state->interface[dst_node].bcsr.nrow;
+	size_t elemsize = state->interface[dst_node].bcsr.elemsize;
+
+	uint32_t r = state->interface[dst_node].bcsr.r;
+	uint32_t c = state->interface[dst_node].bcsr.c;
+
+	node_kind kind = get_node_kind(dst_node);
+
+	switch(kind) {
+		case RAM:
+			addr_nzval = (uintptr_t)malloc(nnz*r*c*elemsize);
+			if (!addr_nzval)
+				goto fail_nzval;
+
+			addr_colind = malloc(nnz*sizeof(uint32_t));
+			if (!addr_colind)
+				goto fail_colind;
+
+			addr_rowptr = malloc((nrow+1)*sizeof(uint32_t));
+			if (!addr_rowptr)
+				goto fail_rowptr;
+
+			break;
+#ifdef USE_CUDA
+		case CUDA_RAM:
+			/* NOTE(review): cublasAlloc's status code is ignored
+			 * and the output pointer is tested instead -- the
+			 * pointer is not guaranteed to be NULL on failure;
+			 * consider checking the returned status instead. */
+			cublasAlloc(nnz*r*c, elemsize, (void **)&addr_nzval);
+			if (!addr_nzval)
+				goto fail_nzval;
+
+			cublasAlloc(nnz, sizeof(uint32_t), (void **)&addr_colind);
+			if (!addr_colind)
+				goto fail_colind;
+
+			cublasAlloc((nrow+1), sizeof(uint32_t), (void **)&addr_rowptr);
+			if (!addr_rowptr)
+				goto fail_rowptr;
+
+			break;
+#endif
+		default:
+			assert(0);
+	}
+
+	/* allocation succeeded */
+	allocated_memory = 
+		nnz*r*c*elemsize + nnz*sizeof(uint32_t) + (nrow+1)*sizeof(uint32_t);
+
+	/* update the data properly in consequence */
+	state->interface[dst_node].bcsr.nzval = addr_nzval;
+	state->interface[dst_node].bcsr.colind = addr_colind;
+	state->interface[dst_node].bcsr.rowptr = addr_rowptr;
+	
+	return allocated_memory;
+
+fail_rowptr:
+	/* unwind: release colind with the allocator that created it */
+	switch(kind) {
+		case RAM:
+			free((void *)addr_colind);
+			/* BUGFIX: missing break -- the RAM case used to fall
+			 * through into cublasFree() (or into assert(0) when
+			 * USE_CUDA is not defined) */
+			break;
+#ifdef USE_CUDA
+		case CUDA_RAM:
+			cublasFree((void*)addr_colind);
+			break;
+#endif
+		default:
+			assert(0);
+	}
+
+fail_colind:
+	/* unwind: release nzval with the allocator that created it */
+	switch(kind) {
+		case RAM:
+			free((void *)addr_nzval);
+			/* BUGFIX: missing break (same fall-through as above) */
+			break;
+#ifdef USE_CUDA
+		case CUDA_RAM:
+			cublasFree((void*)addr_nzval);
+			break;
+#endif
+		default:
+			assert(0);
+	}
+
+fail_nzval:
+
+	/* allocation failed */
+	allocated_memory = 0;
+
+	return allocated_memory;
+}
+
+/* Free the three BCSR arrays of the copy held on "node", using the
+ * allocator that matches the node kind (malloc/free for RAM, cublas for
+ * CUDA memory). */
+void liberate_bcsr_buffer_on_node(starpu_data_interface_t *interface, uint32_t node)
+{
+	node_kind kind = get_node_kind(node);
+	switch(kind) {
+		case RAM:
+			free((void*)interface->bcsr.nzval);
+			free((void*)interface->bcsr.colind);
+			free((void*)interface->bcsr.rowptr);
+			break;
+#ifdef USE_CUDA
+		case CUDA_RAM:
+			cublasFree((void*)interface->bcsr.nzval);
+			cublasFree((void*)interface->bcsr.colind);
+			cublasFree((void*)interface->bcsr.rowptr);
+			break;
+#endif
+		default:
+			assert(0);
+	}
+}
+
+#ifdef USE_CUDA
+/* Transfer the three BCSR arrays from CUDA memory (src_node) to host RAM
+ * (dst_node) with cublasGetVector, and trace the amount of data moved.
+ * NOTE(review): per do_copy_bcsr_buffer_1_to_1, this must run on the CUDA
+ * worker thread attached to src_node. */
+static void copy_cublas_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_bcsr_interface_t *src_bcsr;
+	starpu_bcsr_interface_t *dst_bcsr;
+
+	src_bcsr = &state->interface[src_node].bcsr;
+	dst_bcsr = &state->interface[dst_node].bcsr;
+
+	uint32_t nnz = src_bcsr->nnz;
+	uint32_t nrow = src_bcsr->nrow;
+	size_t elemsize = src_bcsr->elemsize;
+
+	uint32_t r = src_bcsr->r;
+	uint32_t c = src_bcsr->c;
+
+	cublasGetVector(nnz*r*c, elemsize, (uint8_t *)src_bcsr->nzval, 1, 
+			 		   (uint8_t *)dst_bcsr->nzval, 1);
+
+	cublasGetVector(nnz, sizeof(uint32_t), (uint8_t *)src_bcsr->colind, 1, 
+						(uint8_t *)dst_bcsr->colind, 1);
+
+	cublasGetVector((nrow+1), sizeof(uint32_t), (uint8_t *)src_bcsr->rowptr, 1, 
+						(uint8_t *)dst_bcsr->rowptr, 1);
+	
+	TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+
+}
+
+/* Transfer the three BCSR arrays from host RAM (src_node) to CUDA memory
+ * (dst_node) with cublasSetVector, and trace the amount of data moved.
+ * NOTE(review): per do_copy_bcsr_buffer_1_to_1, this must run on the CUDA
+ * worker thread attached to dst_node. */
+static void copy_ram_to_cublas(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_bcsr_interface_t *src_bcsr;
+	starpu_bcsr_interface_t *dst_bcsr;
+
+	src_bcsr = &state->interface[src_node].bcsr;
+	dst_bcsr = &state->interface[dst_node].bcsr;
+
+	uint32_t nnz = src_bcsr->nnz;
+	uint32_t nrow = src_bcsr->nrow;
+	size_t elemsize = src_bcsr->elemsize;
+
+	uint32_t r = src_bcsr->r;
+	uint32_t c = src_bcsr->c;
+
+	cublasSetVector(nnz*r*c, elemsize, (uint8_t *)src_bcsr->nzval, 1, 
+					(uint8_t *)dst_bcsr->nzval, 1);
+
+	cublasSetVector(nnz, sizeof(uint32_t), (uint8_t *)src_bcsr->colind, 1, 
+						(uint8_t *)dst_bcsr->colind, 1);
+
+	cublasSetVector((nrow+1), sizeof(uint32_t), (uint8_t *)src_bcsr->rowptr, 1, 
+						(uint8_t *)dst_bcsr->rowptr, 1);
+	
+	TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+}
+#endif // USE_CUDA
+
+/* as not all platform easily have a BLAS lib installed ... */
+/* Plain memcpy fallback for RAM-to-RAM transfers of a BCSR matrix (so that
+ * no BLAS library is required): copy the three arrays and trace the total
+ * number of bytes moved. */
+static void dummy_copy_ram_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_bcsr_interface_t *src = &state->interface[src_node].bcsr;
+	starpu_bcsr_interface_t *dst = &state->interface[dst_node].bcsr;
+
+	uint32_t nnz = src->nnz;
+	uint32_t nrow = src->nrow;
+	size_t elemsize = src->elemsize;
+	uint32_t r = src->r;
+	uint32_t c = src->c;
+
+	size_t nzval_size = nnz*elemsize*r*c;
+	size_t colind_size = nnz*sizeof(uint32_t);
+	size_t rowptr_size = (nrow+1)*sizeof(uint32_t);
+
+	memcpy((void *)dst->nzval, (void *)src->nzval, nzval_size);
+	memcpy((void *)dst->colind, (void *)src->colind, colind_size);
+	memcpy((void *)dst->rowptr, (void *)src->rowptr, rowptr_size);
+
+	TRACE_DATA_COPY(src_node, dst_node, nzval_size + colind_size + rowptr_size);
+}
+
+
+/* Dispatch a one-to-one BCSR copy according to the (src, dst) node kinds.
+ * CUDA transfers may only be initiated by the CUDA worker owning the GPU
+ * node; otherwise the request is posted for that worker to pick up.
+ * Always returns 0 (unsupported combinations assert). */
+int do_copy_bcsr_buffer_1_to_1(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
+{
+	node_kind src_kind = get_node_kind(src_node);
+	node_kind dst_kind = get_node_kind(dst_node);
+
+	switch (dst_kind) {
+	case RAM:
+		switch (src_kind) {
+			case RAM:
+				/* RAM -> RAM */
+				 dummy_copy_ram_to_ram(state, src_node, dst_node);
+				 break;
+#ifdef USE_CUDA
+			case CUDA_RAM:
+				/* CUBLAS_RAM -> RAM */
+				/* only the proper CUBLAS thread can initiate this ! */
+				if (get_local_memory_node() == src_node)
+				{
+					copy_cublas_to_ram(state, src_node, dst_node);
+				}
+				else
+				{
+					/* ask the owning CUDA worker to do it */
+					post_data_request(state, src_node, dst_node);
+				}
+				break;
+#endif
+			case SPU_LS:
+				STARPU_ASSERT(0); // TODO
+				break;
+			case UNUSED:
+				printf("error node %d UNUSED\n", src_node);
+				/* fall through to the assert below */
+			default:
+				assert(0);
+				break;
+		}
+		break;
+#ifdef USE_CUDA
+	case CUDA_RAM:
+		switch (src_kind) {
+			case RAM:
+				/* RAM -> CUBLAS_RAM */
+				/* only the proper CUBLAS thread can initiate this ! */
+				STARPU_ASSERT(get_local_memory_node() == dst_node);
+				copy_ram_to_cublas(state, src_node, dst_node);
+				break;
+			case CUDA_RAM:
+			case SPU_LS:
+				STARPU_ASSERT(0); // TODO 
+				break;
+			case UNUSED:
+			default:
+				STARPU_ASSERT(0);
+				break;
+		}
+		break;
+#endif
+	case SPU_LS:
+		STARPU_ASSERT(0); // TODO
+		break;
+	case UNUSED:
+	default:
+		assert(0);
+		break;
+	}
+
+	return 0;
+}

+ 26 - 0
src/datawizard/interfaces/bcsr_interface.h

@@ -0,0 +1,26 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __BCSR_INTERFACE_H__
+#define __BCSR_INTERFACE_H__
+
+#include <stdint.h>
+
+/* this interface is used for Sparse matrices */
+
+#define BCSR_INTERFACE	0x118504
+
+#endif // __BCSR_INTERFACE_H__

+ 113 - 0
src/datawizard/interfaces/blas_filters.c

@@ -0,0 +1,113 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "blas_filters.h"
+#include "blas_interface.h"
+
+/*
+ * an example of a dummy partition function : blocks ...
+ */
+/* Partition a dense (BLAS) matrix into at most f->filter_arg blocks along
+ * the nx dimension. Child interfaces are filled on every memory node; ptr/ld
+ * are only valid on nodes where the parent replicate is already allocated.
+ * Returns the number of children actually created. */
+unsigned starpu_block_filter_func(starpu_filter *f, data_state *root_data)
+{
+	unsigned nchunks;
+	uint32_t arg = f->filter_arg;
+
+	starpu_blas_interface_t *blas_root = &root_data->interface[0].blas;
+	uint32_t nx = blas_root->nx;
+	uint32_t ny = blas_root->ny;
+	size_t elemsize = blas_root->elemsize;
+
+	/* we will have arg chunks */
+	nchunks = STARPU_MIN(nx, arg);
+
+	/* first allocate the children data_state */
+	root_data->children = calloc(nchunks, sizeof(data_state));
+	STARPU_ASSERT(root_data->children);
+
+	/* actually create all the chunks */
+	unsigned chunk;
+	for (chunk = 0; chunk < nchunks; chunk++)
+	{
+		uint32_t chunk_size = (nx + nchunks - 1)/nchunks;
+		/* NOTE(review): offset advances along x only, so children reuse the
+		 * parent's leading dimension unchanged — confirm layout assumption */
+		size_t offset = chunk*chunk_size*elemsize;
+
+		/* last chunk may be smaller than chunk_size */
+		uint32_t child_nx = 
+			STARPU_MIN(chunk_size, nx - chunk*chunk_size);
+
+		unsigned node;
+		for (node = 0; node < MAXNODES; node++)
+		{
+			starpu_blas_interface_t *local = &root_data->children[chunk].interface[node].blas;
+
+			local->nx = child_nx;
+			local->ny = ny;
+			local->elemsize = elemsize;
+
+			if (root_data->per_node[node].allocated) {
+				local->ptr = root_data->interface[node].blas.ptr + offset;
+				local->ld = root_data->interface[node].blas.ld;
+			}
+		}
+	}
+
+	return nchunks;
+}
+
+/* Partition a dense (BLAS) matrix into at most f->filter_arg blocks along
+ * the ny dimension. The per-node offset is ld-dependent, so it is computed
+ * only where the parent replicate is allocated. Returns the child count. */
+unsigned starpu_vertical_block_filter_func(starpu_filter *f, data_state *root_data)
+{
+	unsigned nchunks;
+	uint32_t arg = f->filter_arg;
+
+	uint32_t nx = root_data->interface[0].blas.nx;
+	uint32_t ny = root_data->interface[0].blas.ny;
+	size_t elemsize = root_data->interface[0].blas.elemsize;
+
+	/* we will have arg chunks */
+	nchunks = STARPU_MIN(ny, arg);
+	
+	/* first allocate the children data_state */
+	root_data->children = calloc(nchunks, sizeof(data_state));
+	STARPU_ASSERT(root_data->children);
+
+	/* actually create all the chunks */
+	unsigned chunk;
+	for (chunk = 0; chunk < nchunks; chunk++)
+	{
+		uint32_t chunk_size = (ny + nchunks - 1)/nchunks;
+
+		/* last chunk may be smaller than chunk_size */
+		uint32_t child_ny = 
+			STARPU_MIN(chunk_size, ny - chunk*chunk_size);
+
+		unsigned node;
+		for (node = 0; node < MAXNODES; node++)
+		{
+			starpu_blas_interface_t *local = &root_data->children[chunk].interface[node].blas;
+
+			local->nx = nx;
+			local->ny = child_ny;
+			local->elemsize = elemsize;
+
+			if (root_data->per_node[node].allocated) {
+				/* skip chunk_size rows of ld elements per chunk */
+				size_t offset = 
+					chunk*chunk_size*root_data->interface[node].blas.ld*elemsize;
+				local->ptr = root_data->interface[node].blas.ptr + offset;
+				local->ld = root_data->interface[node].blas.ld;
+			}
+		}
+	}
+
+	return nchunks;
+}

+ 22 - 0
src/datawizard/interfaces/blas_filters.h

@@ -0,0 +1,22 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __BLAS_FILTERS_H__
+#define __BLAS_FILTERS_H__
+
+#include <datawizard/hierarchy.h>
+
+#endif // __BLAS_FILTERS_H__

+ 413 - 0
src/datawizard/interfaces/blas_interface.c

@@ -0,0 +1,413 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <datawizard/data_parameters.h>
+#include <datawizard/coherency.h>
+#include <datawizard/copy-driver.h>
+#include <datawizard/hierarchy.h>
+
+#include <common/hash.h>
+
+#include <starpu.h>
+
+#ifdef USE_CUDA
+#include <cuda.h>
+#endif
+
+size_t allocate_blas_buffer_on_node(data_state *state, uint32_t dst_node);
+void liberate_blas_buffer_on_node(starpu_data_interface_t *interface, uint32_t node);
+int do_copy_blas_buffer_1_to_1(data_state *state, uint32_t src_node, uint32_t dst_node);
+size_t dump_blas_interface(starpu_data_interface_t *interface, void *buffer);
+size_t blas_interface_get_size(struct starpu_data_state_t *state);
+uint32_t footprint_blas_interface_crc32(data_state *state, uint32_t hstate);
+void display_blas_interface(data_state *state, FILE *f);
+#ifdef USE_GORDON
+int convert_blas_to_gordon(starpu_data_interface_t *interface, uint64_t *ptr, gordon_strideSize_t *ss); 
+#endif
+
+struct data_interface_ops_t interface_blas_ops = {
+	.allocate_data_on_node = allocate_blas_buffer_on_node,
+	.liberate_data_on_node = liberate_blas_buffer_on_node,
+	.copy_data_1_to_1 = do_copy_blas_buffer_1_to_1,
+	.dump_data_interface = dump_blas_interface,
+	.get_size = blas_interface_get_size,
+	.footprint = footprint_blas_interface_crc32,
+#ifdef USE_GORDON
+	.convert_to_gordon = convert_blas_to_gordon,
+#endif
+	.interfaceid = BLAS_INTERFACE, 
+	.display = display_blas_interface
+};
+
+#ifdef USE_GORDON
+/* Translate a BLAS interface into the (ptr, stride descriptor) pair expected
+ * by the Gordon/Cell runtime. NOTE(review): assumes gordon_stride_init takes
+ * (nblocks, block size, stride) in bytes — confirm against Gordon's API. */
+int convert_blas_to_gordon(starpu_data_interface_t *interface, uint64_t *ptr, gordon_strideSize_t *ss) 
+{
+	STARPU_ASSERT(gordon_interface);
+
+	size_t elemsize = (*interface).blas.elemsize;
+	uint32_t nx = (*interface).blas.nx;
+	uint32_t ny = (*interface).blas.ny;
+	uint32_t ld = (*interface).blas.ld;
+
+	*ptr = (*interface).blas.ptr;
+
+	/* The gordon_stride_init function may use a contiguous buffer
+ 	 * in case nx = ld (in that case, (*ss).size = elemsize*nx*ny */
+	*ss = gordon_stride_init(ny, nx*elemsize, ld*elemsize);
+
+	return 0;
+}
+
+/* declare a new data with the BLAS interface */
+/* Register a dense matrix (nx x ny elements of elemsize bytes, leading
+ * dimension ld) as a new StarPU data. Only the home_node replicate gets the
+ * user's ptr/ld; all other nodes start unallocated (ptr = 0). The allocated
+ * state handle is returned through *handle; ownership stays with StarPU. */
+void starpu_monitor_blas_data(struct starpu_data_state_t **handle, uint32_t home_node,
+			uintptr_t ptr, uint32_t ld, uint32_t nx,
+			uint32_t ny, size_t elemsize)
+{
+	struct starpu_data_state_t *state = calloc(1, sizeof(struct starpu_data_state_t));
+	STARPU_ASSERT(state);
+
+	STARPU_ASSERT(handle);
+	*handle = state;
+
+	unsigned node;
+	for (node = 0; node < MAXNODES; node++)
+	{
+		starpu_blas_interface_t *local_interface = &state->interface[node].blas;
+
+		if (node == home_node) {
+			local_interface->ptr = ptr;
+			local_interface->ld  = ld;
+		}
+		else {
+			local_interface->ptr = 0;
+			local_interface->ld  = 0;
+		}
+
+		/* geometry is shared by every replicate */
+		local_interface->nx = nx;
+		local_interface->ny = ny;
+		local_interface->elemsize = elemsize;
+	}
+
+	state->ops = &interface_blas_ops;
+
+	monitor_new_data(state, home_node, 0);
+}
+
+/* Fold the matrix dimensions (nx, ny) into hash state hstate using the
+ * supplied hash function; elemsize and ld do not contribute. */
+static inline uint32_t footprint_blas_interface_generic(uint32_t (*hash_func)(uint32_t input, uint32_t hstate), data_state *state, uint32_t hstate)
+{
+	uint32_t hash;
+
+	hash = hstate;
+	hash = hash_func(starpu_get_blas_nx(state), hash);
+	hash = hash_func(starpu_get_blas_ny(state), hash);
+
+	return hash;
+}
+
+/* CRC32-based footprint of a BLAS data (used as perfmodel hash key). */
+uint32_t footprint_blas_interface_crc32(data_state *state, uint32_t hstate)
+{
+	return footprint_blas_interface_generic(crc32_be, state, hstate);
+}
+
+/* Packed on-the-wire layout written by dump_blas_interface().
+ * NOTE(review): elemsize is not part of the dump — confirm the consumer
+ * does not need it. */
+struct dumped_blas_interface_s {
+	uintptr_t ptr;
+	uint32_t nx;
+	uint32_t ny;
+	uint32_t ld;
+} __attribute__ ((packed));
+
+/* Print the matrix dimensions (tab-separated, trailing tab) to stream f.
+ * NOTE(review): nx/ny are uint32_t but printed with %d — "%u" would match. */
+void display_blas_interface(data_state *state, FILE *f)
+{
+	starpu_blas_interface_t *interface;
+
+	interface = &state->interface[0].blas;
+
+	fprintf(f, "%d\t%d\t", interface->nx, interface->ny);
+}
+
+/* Serialize the BLAS interface into _buffer using the packed
+ * dumped_blas_interface_s layout; returns the number of bytes written. */
+size_t dump_blas_interface(starpu_data_interface_t *interface, void *_buffer)
+{
+	/* yes, that's DIRTY ... */
+	struct dumped_blas_interface_s *buffer = _buffer;
+
+	buffer->ptr = (*interface).blas.ptr;
+	buffer->nx = (*interface).blas.nx;
+	buffer->ny = (*interface).blas.ny;
+	buffer->ld = (*interface).blas.ld;
+
+	return (sizeof(struct dumped_blas_interface_s));
+}
+
+/* Size in bytes of the matrix payload (nx*ny*elemsize, ld excluded). */
+size_t blas_interface_get_size(struct starpu_data_state_t *state)
+{
+	size_t size;
+	starpu_blas_interface_t *interface;
+
+	interface = &state->interface[0].blas;
+
+	size = interface->nx*interface->ny*interface->elemsize; 
+
+	return size;
+}
+
+/* offer an access to the data parameters */
+/* Number of elements along x (identical on every node; node 0 is read). */
+uint32_t starpu_get_blas_nx(data_state *state)
+{
+	return (state->interface[0].blas.nx);
+}
+
+/* Number of elements along y (identical on every node; node 0 is read). */
+uint32_t starpu_get_blas_ny(data_state *state)
+{
+	return (state->interface[0].blas.ny);
+}
+
+/* Leading dimension of the replicate on the calling worker's memory node;
+ * that replicate must already be allocated. */
+uint32_t starpu_get_blas_local_ld(data_state *state)
+{
+	unsigned node;
+	node = get_local_memory_node();
+
+	STARPU_ASSERT(state->per_node[node].allocated);
+
+	return (state->interface[node].blas.ld);
+}
+
+/* Base pointer of the replicate on the calling worker's memory node;
+ * that replicate must already be allocated. */
+uintptr_t starpu_get_blas_local_ptr(data_state *state)
+{
+	unsigned node;
+	node = get_local_memory_node();
+
+	STARPU_ASSERT(state->per_node[node].allocated);
+
+	return (state->interface[node].blas.ptr);
+}
+
+/* memory allocation/deallocation primitives for the BLAS interface */
+
+/* returns the size of the allocated area */
+/* Allocate a contiguous nx*ny buffer for the dst_node replicate (malloc for
+ * RAM, cublasAlloc for CUDA). On success the replicate's ptr is set and ld
+ * becomes nx (buffer is dense); returns the number of bytes allocated, or
+ * 0 if the allocation failed. */
+size_t allocate_blas_buffer_on_node(data_state *state, uint32_t dst_node)
+{
+	uintptr_t addr = 0;
+	unsigned fail = 0;
+	size_t allocated_memory;
+
+#ifdef USE_CUDA
+	cublasStatus status;
+#endif
+	uint32_t nx = state->interface[dst_node].blas.nx;
+	uint32_t ny = state->interface[dst_node].blas.ny;
+	size_t elemsize = state->interface[dst_node].blas.elemsize;
+
+	node_kind kind = get_node_kind(dst_node);
+
+	switch(kind) {
+		case RAM:
+			addr = (uintptr_t)malloc(nx*ny*elemsize);
+			if (!addr) 
+				fail = 1;
+
+			break;
+#ifdef USE_CUDA
+		case CUDA_RAM:
+			status = cublasAlloc(nx*ny, elemsize, (void **)&addr);
+
+			if (!addr || status != CUBLAS_STATUS_SUCCESS)
+			{
+				/* only an out-of-memory condition is tolerated here */
+				STARPU_ASSERT(status != CUBLAS_STATUS_INTERNAL_ERROR);
+				STARPU_ASSERT(status != CUBLAS_STATUS_NOT_INITIALIZED);
+				STARPU_ASSERT(status != CUBLAS_STATUS_INVALID_VALUE);
+				STARPU_ASSERT(status == CUBLAS_STATUS_ALLOC_FAILED);
+				fail = 1;
+			}
+
+			break;
+#endif
+		default:
+			assert(0);
+	}
+
+	if (!fail) {
+		/* allocation succeeded */
+		allocated_memory = nx*ny*elemsize;
+
+		/* update the data properly in consequence */
+		state->interface[dst_node].blas.ptr = addr;
+		state->interface[dst_node].blas.ld = nx;
+	} else {
+		/* allocation failed */
+		allocated_memory = 0;
+	}
+	
+	return allocated_memory;
+}
+
+/* Release the buffer backing a BLAS replicate on the given node, using the
+ * deallocator matching the node kind (free / cublasFree). */
+void liberate_blas_buffer_on_node(starpu_data_interface_t *interface, uint32_t node)
+{
+#ifdef USE_CUDA
+	cublasStatus status;
+#endif
+
+	node_kind kind = get_node_kind(node);
+	switch(kind) {
+		case RAM:
+			free((void*)interface->blas.ptr);
+			break;
+#ifdef USE_CUDA
+		case CUDA_RAM:
+			status = cublasFree((void*)interface->blas.ptr);
+			
+			STARPU_ASSERT(status != CUBLAS_STATUS_INTERNAL_ERROR);
+			STARPU_ASSERT(status == CUBLAS_STATUS_SUCCESS);
+
+			break;
+#endif
+		default:
+			assert(0);
+	}
+}
+
+#ifdef USE_CUDA
+/* Download a matrix replicate from GPU to host with cublasGetMatrix,
+ * honouring both leading dimensions. NOTE(review): the return status of
+ * cublasGetMatrix is not checked — a failed transfer goes unnoticed. */
+static void copy_cublas_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_blas_interface_t *src_blas;
+	starpu_blas_interface_t *dst_blas;
+
+	src_blas = &state->interface[src_node].blas;
+	dst_blas = &state->interface[dst_node].blas;
+
+	cublasGetMatrix(src_blas->nx, src_blas->ny, src_blas->elemsize,
+		(uint8_t *)src_blas->ptr, src_blas->ld,
+		(uint8_t *)dst_blas->ptr, dst_blas->ld);
+
+	TRACE_DATA_COPY(src_node, dst_node, src_blas->nx*src_blas->ny*src_blas->elemsize);
+}
+
+/* Upload a matrix replicate from host to GPU with cublasSetMatrix,
+ * honouring both leading dimensions. NOTE(review): the return status of
+ * cublasSetMatrix is not checked — a failed transfer goes unnoticed. */
+static void copy_ram_to_cublas(data_state *state, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_blas_interface_t *src_blas;
+	starpu_blas_interface_t *dst_blas;
+
+	src_blas = &state->interface[src_node].blas;
+	dst_blas = &state->interface[dst_node].blas;
+
+
+	cublasSetMatrix(src_blas->nx, src_blas->ny, src_blas->elemsize,
+		(uint8_t *)src_blas->ptr, src_blas->ld,
+		(uint8_t *)dst_blas->ptr, dst_blas->ld);
+
+	TRACE_DATA_COPY(src_node, dst_node, src_blas->nx*src_blas->ny*src_blas->elemsize);
+}
+
+/* as not all platform easily have a BLAS lib installed ... */
+/* Row-by-row memcpy fallback for RAM -> RAM matrix copies (used so that no
+ * BLAS library is required). Copies the nx*ny replicate from src_node to
+ * dst_node, honouring the leading dimension of each replicate. */
+static void dummy_copy_ram_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
+{
+	unsigned y;
+	uint32_t nx = state->interface[dst_node].blas.nx;
+	uint32_t ny = state->interface[dst_node].blas.ny;
+	size_t elemsize = state->interface[dst_node].blas.elemsize;
+
+	uint32_t ld_src = state->interface[src_node].blas.ld;
+	uint32_t ld_dst = state->interface[dst_node].blas.ld;
+
+	uintptr_t ptr_src = state->interface[src_node].blas.ptr;
+	uintptr_t ptr_dst = state->interface[dst_node].blas.ptr;
+
+	for (y = 0; y < ny; y++)
+	{
+		/* byte offsets must be size_t: y*ld*elemsize overflows uint32_t
+		 * for large matrices (was previously truncated to 32 bits) */
+		size_t src_offset = y*ld_src*elemsize;
+		size_t dst_offset = y*ld_dst*elemsize;
+
+		memcpy((void *)(ptr_dst + dst_offset), 
+			(void *)(ptr_src + src_offset), nx*elemsize);
+	}
+
+	TRACE_DATA_COPY(src_node, dst_node, nx*ny*elemsize);
+}
+
+
+/* Copy one replicate of a BLAS (dense matrix) data from src_node to
+ * dst_node. The (src kind, dst kind) pair selects the routine; transfers
+ * involving a CUDA node must run on that node's own thread, otherwise the
+ * copy is delegated via post_data_request(). Always returns 0. */
+int do_copy_blas_buffer_1_to_1(data_state *state, uint32_t src_node, uint32_t dst_node)
+{
+	node_kind src_kind = get_node_kind(src_node);
+	node_kind dst_kind = get_node_kind(dst_node);
+
+	switch (dst_kind) {
+	case RAM:
+		switch (src_kind) {
+			case RAM:
+				/* RAM -> RAM */
+				 dummy_copy_ram_to_ram(state, src_node, dst_node);
+				 break;
+#ifdef USE_CUDA
+			case CUDA_RAM:
+				/* CUBLAS_RAM -> RAM */
+				if (get_local_memory_node() == src_node)
+				{
+					/* only the proper CUBLAS thread can initiate this directly ! */
+					copy_cublas_to_ram(state, src_node, dst_node);
+				}
+				else
+				{
+					/* put a request to the corresponding GPU */
+		//			fprintf(stderr, "post_data_request state %p src %d dst %d\n", state, src_node, dst_node);
+					post_data_request(state, src_node, dst_node);
+		//			fprintf(stderr, "post %p OK\n", state);
+				}
+				break;
+#endif
+			case SPU_LS:
+				STARPU_ASSERT(0); // TODO
+				break;
+			case UNUSED:
+				printf("error node %d UNUSED\n", src_node);
+				/* fall through: report the bogus node, then abort */
+			default:
+				assert(0);
+				break;
+		}
+		break;
+#ifdef USE_CUDA
+	case CUDA_RAM:
+		switch (src_kind) {
+			case RAM:
+				/* RAM -> CUBLAS_RAM */
+				/* only the proper CUBLAS thread can initiate this ! */
+				STARPU_ASSERT(get_local_memory_node() == dst_node);
+				copy_ram_to_cublas(state, src_node, dst_node);
+				break;
+			case CUDA_RAM:
+			case SPU_LS:
+				STARPU_ASSERT(0); // TODO 
+				break;
+			case UNUSED:
+			default:
+				STARPU_ASSERT(0);
+				break;
+		}
+		break;
+#endif
+	case SPU_LS:
+		STARPU_ASSERT(0); // TODO
+		break;
+	case UNUSED:
+	default:
+		assert(0);
+		break;
+	}
+
+	return 0;
+}
+

+ 24 - 0
src/datawizard/interfaces/blas_interface.h

@@ -0,0 +1,24 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __BLAS_INTERFACE_H__
+#define __BLAS_INTERFACE_H__
+
+#include <stdint.h>
+
+/* this interface is used for dense (BLAS-style) matrices */
+
+#define BLAS_INTERFACE   0x118501
+
+#endif // __BLAS_INTERFACE_H__

+ 24 - 0
src/datawizard/interfaces/csc_interface.h

@@ -0,0 +1,24 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __CSC_INTERFACE_H__
+#define __CSC_INTERFACE_H__
+
+/* this interface is used for Sparse matrices */
+
+#define CSC_INTERFACE	0x118505
+
+#endif // __CSC_INTERFACE_H__

+ 74 - 0
src/datawizard/interfaces/csr_filters.c

@@ -0,0 +1,74 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "csr_filters.h"
+#include "csr_interface.h"
+
+/* Partition a CSR matrix into at most f->filter_arg row blocks. Each child
+ * keeps pointers into the parent's rowptr/colind/nzval arrays (no copy) and
+ * records its own firstentry so row indices stay consistent.
+ * Returns the number of children created. */
+unsigned starpu_vertical_block_filter_func_csr(starpu_filter *f, data_state *root_data)
+{
+	unsigned nchunks;
+	uint32_t arg = f->filter_arg;
+
+	uint32_t nrow = root_data->interface[0].csr.nrow;
+	size_t elemsize = root_data->interface[0].csr.elemsize;
+	uint32_t firstentry = root_data->interface[0].csr.firstentry;
+
+	/* we will have arg chunks */
+	nchunks = STARPU_MIN(nrow, arg);
+	
+	/* first allocate the children data_state */
+	root_data->children = calloc(nchunks, sizeof(data_state));
+	STARPU_ASSERT(root_data->children);
+
+	/* actually create all the chunks */
+	uint32_t chunk_size = (nrow + nchunks - 1)/nchunks;
+
+	/* XXX */
+	STARPU_ASSERT(root_data->per_node[0].allocated);
+	uint32_t *rowptr = root_data->interface[0].csr.rowptr;
+
+	unsigned chunk;
+	for (chunk = 0; chunk < nchunks; chunk++)
+	{
+		/* NOTE(review): first_index subtracts firstentry from a row index —
+		 * verify this offsetting is correct when firstentry != 0 */
+		uint32_t first_index = chunk*chunk_size - firstentry;
+		uint32_t local_firstentry = rowptr[first_index];
+
+		uint32_t child_nrow = 
+			STARPU_MIN(chunk_size, nrow - chunk*chunk_size);
+
+		uint32_t local_nnz = rowptr[first_index + child_nrow] - rowptr[first_index]; 
+
+		unsigned node;
+		for (node = 0; node < MAXNODES; node++)
+		{
+			starpu_csr_interface_t *local = &root_data->children[chunk].interface[node].csr;
+
+			local->nnz = local_nnz;
+			local->nrow = child_nrow;
+			local->firstentry = local_firstentry;
+			local->elemsize = elemsize;
+
+			if (root_data->per_node[node].allocated) {
+				local->rowptr = &root_data->interface[node].csr.rowptr[first_index];
+				local->colind = &root_data->interface[node].csr.colind[local_firstentry];
+				/* NOTE(review): nzval is indexed as float* regardless of
+				 * elemsize — only correct when elemsize == sizeof(float) */
+				float *nzval = (float *)(root_data->interface[node].csr.nzval);
+				local->nzval = (uintptr_t)&nzval[local_firstentry];
+			}
+		}
+	}
+
+	return nchunks;
+}

+ 22 - 0
src/datawizard/interfaces/csr_filters.h

@@ -0,0 +1,22 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __CSR_FILTERS_H__
+#define __CSR_FILTERS_H__
+
+#include <datawizard/hierarchy.h>
+
+#endif // __CSR_FILTERS_H__

+ 451 - 0
src/datawizard/interfaces/csr_interface.c

@@ -0,0 +1,451 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <datawizard/data_parameters.h>
+#include <datawizard/coherency.h>
+#include <datawizard/copy-driver.h>
+#include <datawizard/hierarchy.h>
+
+#include <common/hash.h>
+
+#include <starpu.h>
+
+#ifdef USE_CUDA
+#include <cuda.h>
+#endif
+
+size_t allocate_csr_buffer_on_node(struct starpu_data_state_t *state, uint32_t dst_node);
+void liberate_csr_buffer_on_node(starpu_data_interface_t *interface, uint32_t node);
+size_t dump_csr_interface(starpu_data_interface_t *interface, void *_buffer);
+int do_copy_csr_buffer_1_to_1(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
+size_t csr_interface_get_size(struct starpu_data_state_t *state);
+uint32_t footprint_csr_interface_crc32(data_state *state, uint32_t hstate);
+
+struct data_interface_ops_t interface_csr_ops = {
+	.allocate_data_on_node = allocate_csr_buffer_on_node,
+	.liberate_data_on_node = liberate_csr_buffer_on_node,
+	.copy_data_1_to_1 = do_copy_csr_buffer_1_to_1,
+	.dump_data_interface = dump_csr_interface,
+	.get_size = csr_interface_get_size,
+	.interfaceid = CSR_INTERFACE,
+	.footprint = footprint_csr_interface_crc32
+};
+
+/* declare a new data with the BLAS interface */
+/* Register a CSR sparse matrix (nnz non-zeros, nrow rows, entry numbering
+ * starting at firstentry) as a new StarPU data. Only the home_node replicate
+ * gets the user's nzval/colind/rowptr arrays; other nodes start empty.
+ * The new state handle is returned through *handle. */
+void starpu_monitor_csr_data(struct starpu_data_state_t **handle, uint32_t home_node,
+		uint32_t nnz, uint32_t nrow, uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, size_t elemsize)
+{
+	struct starpu_data_state_t *state = calloc(1, sizeof(struct starpu_data_state_t));
+	STARPU_ASSERT(state);
+
+	STARPU_ASSERT(handle);
+	*handle = state;
+
+	unsigned node;
+	for (node = 0; node < MAXNODES; node++)
+	{
+		starpu_csr_interface_t *local_interface = &state->interface[node].csr;
+
+		if (node == home_node) {
+			local_interface->nzval = nzval;
+			local_interface->colind = colind;
+			local_interface->rowptr = rowptr;
+		}
+		else {
+			local_interface->nzval = 0;
+			local_interface->colind = NULL;
+			local_interface->rowptr = NULL;
+		}
+
+		/* structure parameters are shared by every replicate */
+		local_interface->nnz = nnz;
+		local_interface->nrow = nrow;
+		local_interface->firstentry = firstentry;
+		local_interface->elemsize = elemsize;
+
+	}
+
+	state->ops = &interface_csr_ops;
+
+	monitor_new_data(state, home_node, 0);
+}
+
+/* Fold the non-zero count into hash state hstate using the supplied hash
+ * function; nrow and elemsize do not contribute. */
+static inline uint32_t footprint_csr_interface_generic(uint32_t (*hash_func)(uint32_t input, uint32_t hstate), data_state *state, uint32_t hstate)
+{
+	uint32_t hash;
+
+	hash = hstate;
+	hash = hash_func(starpu_get_csr_nnz(state), hash);
+
+	return hash;
+}
+
+/* CRC32-based footprint of a CSR data (used as perfmodel hash key). */
+uint32_t footprint_csr_interface_crc32(data_state *state, uint32_t hstate)
+{
+	return footprint_csr_interface_generic(crc32_be, state, hstate);
+}
+
+
+
+/* Packed on-the-wire layout written by dump_csr_interface().
+ * NOTE(review): elemsize is declared uint32_t here while the interface
+ * stores a size_t — silently truncated on 64-bit; confirm intentional. */
+struct dumped_csr_interface_s {
+	uint32_t nnz;
+	uint32_t nrow;
+	uintptr_t nzval;
+	uint32_t *colind;
+	uint32_t *rowptr;
+	uint32_t firstentry;
+	uint32_t elemsize;
+}  __attribute__ ((packed));
+
+/* Serialize the CSR interface into _buffer using the packed
+ * dumped_csr_interface_s layout; returns the number of bytes written. */
+size_t dump_csr_interface(starpu_data_interface_t *interface, void *_buffer)
+{
+	/* yes, that's DIRTY ... */
+	struct dumped_csr_interface_s *buffer = _buffer;
+
+	buffer->nnz = (*interface).csr.nnz;
+	buffer->nrow = (*interface).csr.nrow;
+	buffer->nzval = (*interface).csr.nzval;
+	buffer->colind = (*interface).csr.colind;
+	buffer->rowptr = (*interface).csr.rowptr;
+	buffer->firstentry = (*interface).csr.firstentry;
+	/* size_t narrowed to the struct's uint32_t field */
+	buffer->elemsize = (*interface).csr.elemsize;
+
+	return (sizeof(struct dumped_csr_interface_s));
+}
+
+/* offer an access to the data parameters */
+/* Number of non-zero entries (identical on every node; node 0 is read). */
+uint32_t starpu_get_csr_nnz(struct starpu_data_state_t *state)
+{
+	return (state->interface[0].csr.nnz);
+}
+
+/* Number of rows (identical on every node; node 0 is read). */
+uint32_t starpu_get_csr_nrow(struct starpu_data_state_t *state)
+{
+	return (state->interface[0].csr.nrow);
+}
+
+/* Index of the first entry (non-zero for partitioned children). */
+uint32_t starpu_get_csr_firstentry(struct starpu_data_state_t *state)
+{
+	return (state->interface[0].csr.firstentry);
+}
+
+/* Size in bytes of one non-zero element. */
+size_t starpu_get_csr_elemsize(struct starpu_data_state_t *state)
+{
+	return (state->interface[0].csr.elemsize);
+}
+
+/* nzval array of the replicate on the calling worker's memory node;
+ * that replicate must already be allocated. */
+uintptr_t starpu_get_csr_local_nzval(struct starpu_data_state_t *state)
+{
+	unsigned node;
+	node = get_local_memory_node();
+
+	STARPU_ASSERT(state->per_node[node].allocated);
+
+	return (state->interface[node].csr.nzval);
+}
+
+/* colind array of the replicate on the calling worker's memory node;
+ * that replicate must already be allocated. */
+uint32_t *starpu_get_csr_local_colind(struct starpu_data_state_t *state)
+{
+	unsigned node;
+	node = get_local_memory_node();
+
+	STARPU_ASSERT(state->per_node[node].allocated);
+
+	return (state->interface[node].csr.colind);
+}
+
+/* rowptr array of the replicate on the calling worker's memory node;
+ * that replicate must already be allocated. */
+uint32_t *starpu_get_csr_local_rowptr(struct starpu_data_state_t *state)
+{
+	unsigned node;
+	node = get_local_memory_node();
+
+	STARPU_ASSERT(state->per_node[node].allocated);
+
+	return (state->interface[node].csr.rowptr);
+}
+
+/* Total byte size of a CSR data: nzval (nnz*elemsize) + colind (nnz words)
+ * + rowptr (nrow+1 words). */
+size_t csr_interface_get_size(struct starpu_data_state_t *state)
+{
+	size_t size;
+
+	uint32_t nnz = starpu_get_csr_nnz(state);
+	uint32_t nrow = starpu_get_csr_nrow(state);
+	size_t elemsize = starpu_get_csr_elemsize(state);
+
+	size = nnz*elemsize + nnz*sizeof(uint32_t) + (nrow+1)*sizeof(uint32_t);
+
+	return size;
+}
+
+/* memory allocation/deallocation primitives for the BLAS interface */
+
+/* returns the size of the allocated area */
+/* Allocate the three CSR arrays (nzval, colind, rowptr) for the dst_node
+ * replicate, using malloc for RAM nodes and cublasAlloc for CUDA nodes.
+ * On success the replicate's pointers are updated and the total number of
+ * allocated bytes is returned; on failure everything already allocated is
+ * released and 0 is returned. */
+size_t allocate_csr_buffer_on_node(struct starpu_data_state_t *state, uint32_t dst_node)
+{
+	/* initialize so that a failing allocator which leaves its output
+	 * untouched (e.g. cublasAlloc on error) is still detected below */
+	uintptr_t addr_nzval = 0;
+	uint32_t *addr_colind = NULL, *addr_rowptr = NULL;
+	size_t allocated_memory;
+
+	/* we need the 3 arrays to be allocated */
+
+	uint32_t nnz = state->interface[dst_node].csr.nnz;
+	uint32_t nrow = state->interface[dst_node].csr.nrow;
+	size_t elemsize = state->interface[dst_node].csr.elemsize;
+
+	node_kind kind = get_node_kind(dst_node);
+
+	switch(kind) {
+		case RAM:
+			addr_nzval = (uintptr_t)malloc(nnz*elemsize);
+			if (!addr_nzval)
+				goto fail_nzval;
+
+			addr_colind = malloc(nnz*sizeof(uint32_t));
+			if (!addr_colind)
+				goto fail_colind;
+
+			addr_rowptr = malloc((nrow+1)*sizeof(uint32_t));
+			if (!addr_rowptr)
+				goto fail_rowptr;
+
+			break;
+#ifdef USE_CUDA
+		case CUDA_RAM:
+			cublasAlloc(nnz, elemsize, (void **)&addr_nzval);
+			if (!addr_nzval)
+				goto fail_nzval;
+
+			cublasAlloc(nnz, sizeof(uint32_t), (void **)&addr_colind);
+			if (!addr_colind)
+				goto fail_colind;
+
+			cublasAlloc((nrow+1), sizeof(uint32_t), (void **)&addr_rowptr);
+			if (!addr_rowptr)
+				goto fail_rowptr;
+
+			break;
+#endif
+		default:
+			assert(0);
+	}
+
+	/* allocation succeeded */
+	allocated_memory = 
+		nnz*elemsize + nnz*sizeof(uint32_t) + (nrow+1)*sizeof(uint32_t);
+
+	/* update the data properly in consequence */
+	state->interface[dst_node].csr.nzval = addr_nzval;
+	state->interface[dst_node].csr.colind = addr_colind;
+	state->interface[dst_node].csr.rowptr = addr_rowptr;
+	
+	return allocated_memory;
+
+fail_rowptr:
+	switch(kind) {
+		case RAM:
+			free((void *)addr_colind);
+			/* fixed: missing break used to fall through into the
+			 * CUDA_RAM arm (or into assert(0) on non-CUDA builds) */
+			break;
+#ifdef USE_CUDA
+		case CUDA_RAM:
+			cublasFree((void*)addr_colind);
+			break;
+#endif
+		default:
+			assert(0);
+	}
+
+fail_colind:
+	switch(kind) {
+		case RAM:
+			free((void *)addr_nzval);
+			/* fixed: missing break, same fallthrough bug as above */
+			break;
+#ifdef USE_CUDA
+		case CUDA_RAM:
+			cublasFree((void*)addr_nzval);
+			break;
+#endif
+		default:
+			assert(0);
+	}
+
+fail_nzval:
+
+	/* allocation failed */
+	allocated_memory = 0;
+
+	return allocated_memory;
+}
+
+/* Release the three CSR arrays of a replicate on the given node, using the
+ * deallocator matching the node kind (free / cublasFree). */
+void liberate_csr_buffer_on_node(starpu_data_interface_t *interface, uint32_t node)
+{
+	node_kind kind = get_node_kind(node);
+	switch(kind) {
+		case RAM:
+			free((void*)interface->csr.nzval);
+			free((void*)interface->csr.colind);
+			free((void*)interface->csr.rowptr);
+			break;
+#ifdef USE_CUDA
+		case CUDA_RAM:
+			cublasFree((void*)interface->csr.nzval);
+			cublasFree((void*)interface->csr.colind);
+			cublasFree((void*)interface->csr.rowptr);
+			break;
+#endif
+		default:
+			assert(0);
+	}
+}
+
+#ifdef USE_CUDA
+/* Download the three CSR arrays from GPU to host with cublasGetVector.
+ * NOTE(review): return statuses of cublasGetVector are not checked. */
+static void copy_cublas_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_csr_interface_t *src_csr;
+	starpu_csr_interface_t *dst_csr;
+
+	src_csr = &state->interface[src_node].csr;
+	dst_csr = &state->interface[dst_node].csr;
+
+	uint32_t nnz = src_csr->nnz;
+	uint32_t nrow = src_csr->nrow;
+	size_t elemsize = src_csr->elemsize;
+
+	cublasGetVector(nnz, elemsize, (uint8_t *)src_csr->nzval, 1, 
+					(uint8_t *)dst_csr->nzval, 1);
+
+	cublasGetVector(nnz, sizeof(uint32_t), (uint8_t *)src_csr->colind, 1, 
+						(uint8_t *)dst_csr->colind, 1);
+
+	cublasGetVector((nrow+1), sizeof(uint32_t), (uint8_t *)src_csr->rowptr, 1, 
+						(uint8_t *)dst_csr->rowptr, 1);
+	
+	TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+
+}
+
+/* Upload the three CSR arrays from host to GPU with cublasSetVector.
+ * NOTE(review): return statuses of cublasSetVector are not checked. */
+static void copy_ram_to_cublas(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_csr_interface_t *src_csr;
+	starpu_csr_interface_t *dst_csr;
+
+	src_csr = &state->interface[src_node].csr;
+	dst_csr = &state->interface[dst_node].csr;
+
+	uint32_t nnz = src_csr->nnz;
+	uint32_t nrow = src_csr->nrow;
+	size_t elemsize = src_csr->elemsize;
+
+	cublasSetVector(nnz, elemsize, (uint8_t *)src_csr->nzval, 1, 
+					(uint8_t *)dst_csr->nzval, 1);
+
+	cublasSetVector(nnz, sizeof(uint32_t), (uint8_t *)src_csr->colind, 1, 
+						(uint8_t *)dst_csr->colind, 1);
+
+	cublasSetVector((nrow+1), sizeof(uint32_t), (uint8_t *)src_csr->rowptr, 1, 
+						(uint8_t *)dst_csr->rowptr, 1);
+	
+	TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+}
+#endif // USE_CUDA
+
+/* as not all platform easily have a BLAS lib installed ... */
+/* memcpy fallback for RAM -> RAM CSR copies: copies nzval, colind and
+ * rowptr in three contiguous transfers. */
+static void dummy_copy_ram_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
+{
+
+	starpu_csr_interface_t *src_csr;
+	starpu_csr_interface_t *dst_csr;
+
+	src_csr = &state->interface[src_node].csr;
+	dst_csr = &state->interface[dst_node].csr;
+
+	uint32_t nnz = src_csr->nnz;
+	uint32_t nrow = src_csr->nrow;
+	size_t elemsize = src_csr->elemsize;
+
+	memcpy((void *)dst_csr->nzval, (void *)src_csr->nzval, nnz*elemsize);
+
+	memcpy((void *)dst_csr->colind, (void *)src_csr->colind, nnz*sizeof(uint32_t));
+
+	memcpy((void *)dst_csr->rowptr, (void *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t));
+
+	TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+}
+
+
+/* Copy one replicate of a CSR data from src_node to dst_node.
+ * The (src kind, dst kind) pair selects the transfer routine; a transfer
+ * involving a CUDA node must be initiated by the thread local to that node,
+ * otherwise a request is posted via post_data_request(). Always returns 0. */
+int do_copy_csr_buffer_1_to_1(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
+{
+	node_kind src_kind = get_node_kind(src_node);
+	node_kind dst_kind = get_node_kind(dst_node);
+
+	switch (dst_kind) {
+	case RAM:
+		switch (src_kind) {
+			case RAM:
+				/* RAM -> RAM */
+				 dummy_copy_ram_to_ram(state, src_node, dst_node);
+				 break;
+#ifdef USE_CUDA
+			case CUDA_RAM:
+				/* CUBLAS_RAM -> RAM */
+				/* only the proper CUBLAS thread can initiate this ! */
+				if (get_local_memory_node() == src_node)
+				{
+					copy_cublas_to_ram(state, src_node, dst_node);
+				}
+				else
+				{
+					post_data_request(state, src_node, dst_node);
+				}
+				break;
+#endif
+			case SPU_LS:
+				STARPU_ASSERT(0); // TODO
+				break;
+			case UNUSED:
+				printf("error node %d UNUSED\n", src_node);
+				/* fall through: report the bogus node, then abort */
+			default:
+				assert(0);
+				break;
+		}
+		break;
+#ifdef USE_CUDA
+	case CUDA_RAM:
+		switch (src_kind) {
+			case RAM:
+				/* RAM -> CUBLAS_RAM */
+				/* only the proper CUBLAS thread can initiate this ! */
+				STARPU_ASSERT(get_local_memory_node() == dst_node);
+				copy_ram_to_cublas(state, src_node, dst_node);
+				break;
+			case CUDA_RAM:
+			case SPU_LS:
+				STARPU_ASSERT(0); // TODO 
+				break;
+			case UNUSED:
+			default:
+				STARPU_ASSERT(0);
+				break;
+		}
+		break;
+#endif
+	case SPU_LS:
+		STARPU_ASSERT(0); // TODO
+		break;
+	case UNUSED:
+	default:
+		assert(0);
+		break;
+	}
+
+	return 0;
+}

+ 26 - 0
src/datawizard/interfaces/csr_interface.h

@@ -0,0 +1,26 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __CSR_INTERFACE_H__
+#define __CSR_INTERFACE_H__
+
+#include <stdint.h>
+
+/* this interface is used for Sparse matrices */
+
+#define CSR_INTERFACE	0x118502
+
+#endif // __CSR_INTERFACE_H__

+ 57 - 0
src/datawizard/interfaces/data_interface.h

@@ -0,0 +1,57 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DATA_INTERFACE_H__
+#define __DATA_INTERFACE_H__
+
+#include <stdio.h>
+
+#include <datawizard/data_parameters.h>
+#include "blas_interface.h"
+#include "vector_interface.h"
+#include "csr_interface.h"
+#include "csc_interface.h"
+#include "bcsr_interface.h"
+
+#ifdef USE_GORDON
+/* to get the gordon_strideSize_t data structure from gordon */
+#include <cell/gordon/gordon.h>
+#endif
+
+#include <starpu.h>
+
+struct starpu_data_state_t;
+
+/* Per-interface method table: each data interface (vector, blas, csr, ...)
+ * provides one of these so the DSM core can manage a piece of data without
+ * knowing its actual memory layout. */
+struct data_interface_ops_t {
+	/* allocate the buffer for the data on "node"; returns the number of
+	 * bytes actually allocated (0 on failure) */
+	size_t (*allocate_data_on_node)(struct starpu_data_state_t *state,
+					uint32_t node);
+	/* release a buffer previously allocated on "node" */
+	void (*liberate_data_on_node)(starpu_data_interface_t *interface,
+					uint32_t node);
+	/* copy the data content from node "src" to node "dst" */
+	int (*copy_data_1_to_1)(struct starpu_data_state_t *state, 
+					uint32_t src, uint32_t dst);
+	/* serialize the interface description into "buffer"; returns the
+	 * number of bytes written */
+	size_t (*dump_data_interface)(starpu_data_interface_t *interface, 
+					void *buffer);
+	/* size of the data content, in bytes */
+	size_t (*get_size)(struct starpu_data_state_t *state);
+	/* fold the data layout into the running hash "hstate" */
+	uint32_t (*footprint)(struct starpu_data_state_t *state, uint32_t hstate);
+	/* write a short human-readable summary of the data to f */
+	void (*display)(struct starpu_data_state_t *state, FILE *f);
+#ifdef USE_GORDON
+	/* translate the interface into a Gordon (Cell SPU) descriptor */
+	int (*convert_to_gordon)(starpu_data_interface_t *interface, uint64_t *ptr, gordon_strideSize_t *ss); 
+#endif
+	/* an identifier that is unique to each interface */
+	uint32_t interfaceid;
+};
+
+#endif // __DATA_INTERFACE_H__

+ 146 - 0
src/datawizard/interfaces/vector_filters.c

@@ -0,0 +1,146 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "vector_filters.h"
+#include "vector_interface.h"
+
+/* Partition a vector into f->filter_arg contiguous chunks of (roughly) equal
+ * size.  Returns the number of children actually created, which is at most
+ * min(nx, filter_arg).
+ *
+ * Fix: with chunk_size = ceil(nx/nchunks), the start offset chunk*chunk_size
+ * of the last chunk may lie past nx (e.g. nx=5, arg=4 => chunk_size=2 and
+ * chunk 3 starts at 6); the original "nx - chunk*chunk_size" then underflowed
+ * as uint32_t and produced an out-of-bounds child.  We now clamp such a
+ * child's length to 0. */
+unsigned starpu_block_filter_func_vector(starpu_filter *f, data_state *root_data)
+{
+	unsigned nchunks;
+	uint32_t arg = f->filter_arg;
+
+	starpu_vector_interface_t *vector_root = &root_data->interface[0].vector;
+	uint32_t nx = vector_root->nx;
+	size_t elemsize = vector_root->elemsize;
+
+	/* we will have arg chunks, but never more than there are elements */
+	nchunks = STARPU_MIN(nx, arg);
+
+	/* first allocate the children data_state */
+	root_data->children = calloc(nchunks, sizeof(data_state));
+	STARPU_ASSERT(root_data->children);
+
+	/* nominal chunk length, identical for every chunk (loop invariant) */
+	uint32_t chunk_size = (nx + nchunks - 1)/nchunks;
+
+	/* actually create all the chunks */
+	unsigned chunk;
+	for (chunk = 0; chunk < nchunks; chunk++)
+	{
+		size_t offset = chunk*chunk_size*elemsize;
+
+		/* clamp to the actual number of remaining elements, guarding
+		 * against unsigned underflow when the chunk would start beyond
+		 * the end of the vector */
+		uint32_t child_nx = (chunk*chunk_size < nx) ?
+			STARPU_MIN(chunk_size, nx - chunk*chunk_size) : 0;
+
+		unsigned node;
+		for (node = 0; node < MAXNODES; node++)
+		{
+			starpu_vector_interface_t *local = &root_data->children[chunk].interface[node].vector;
+
+			local->nx = child_nx;
+			local->elemsize = elemsize;
+
+			/* the child only gets a valid pointer on the nodes where
+			 * the parent is actually allocated */
+			if (root_data->per_node[node].allocated) {
+				local->ptr = root_data->interface[node].vector.ptr + offset;
+			}
+		}
+	}
+
+	return nchunks;
+}
+
+
+/* Split a vector into exactly two children: the first one holds the first
+ * f->filter_arg elements, the second one the remaining nx - filter_arg
+ * elements.  Requires filter_arg < nx (checked by assertion) so that the
+ * second child is non-empty.  Always returns 2. */
+unsigned starpu_divide_in_2_filter_func_vector(starpu_filter *f, data_state *root_data)
+{
+	uint32_t length_first = f->filter_arg;
+
+	starpu_vector_interface_t *vector_root = &root_data->interface[0].vector;
+	uint32_t nx = vector_root->nx;
+	size_t elemsize = vector_root->elemsize;
+
+	/* first allocate the children data_state */
+	root_data->children = calloc(2, sizeof(data_state));
+	STARPU_ASSERT(root_data->children);
+
+	STARPU_ASSERT(length_first < nx);
+
+	/* first child: elements [0, length_first) */
+	unsigned node;
+	for (node = 0; node < MAXNODES; node++)
+	{
+		starpu_vector_interface_t *local = &root_data->children[0].interface[node].vector;
+
+		local->nx = length_first;
+		local->elemsize = elemsize;
+
+		/* only set a pointer on nodes where the parent is allocated */
+		if (root_data->per_node[node].allocated) {
+			local->ptr = root_data->interface[node].vector.ptr;
+		}
+	}
+
+	/* second child: elements [length_first, nx) */
+	for (node = 0; node < MAXNODES; node++)
+	{
+		starpu_vector_interface_t *local = &root_data->children[1].interface[node].vector;
+
+		local->nx = nx - length_first;
+		local->elemsize = elemsize;
+
+		if (root_data->per_node[node].allocated) {
+			local->ptr = root_data->interface[node].vector.ptr + length_first*elemsize;
+		}
+	}
+
+	return 2;
+}
+
+/* Split a vector into f->filter_arg chunks whose individual lengths are
+ * taken from the array f->filter_arg_ptr; the lengths must sum up to the
+ * parent's nx (checked by assertion).  Returns the number of chunks. */
+unsigned starpu_list_filter_func_vector(starpu_filter *f, data_state *root_data)
+{
+	uint32_t nchunks = f->filter_arg;
+	uint32_t *length_tab = f->filter_arg_ptr;
+
+	starpu_vector_interface_t *vector_root = &root_data->interface[0].vector;
+	uint32_t nx = vector_root->nx;
+	size_t elemsize = vector_root->elemsize;
+
+	/* first allocate the children data_state */
+	root_data->children = calloc(nchunks, sizeof(data_state));
+	STARPU_ASSERT(root_data->children);
+
+	/* running start index (in elements) of the current chunk */
+	unsigned current_pos = 0;
+
+	/* actually create all the chunks */
+	unsigned chunk;
+	for (chunk = 0; chunk < nchunks; chunk++)
+	{
+		uint32_t chunk_size = length_tab[chunk];
+
+		unsigned node;
+		for (node = 0; node < MAXNODES; node++)
+		{
+			starpu_vector_interface_t *local = &root_data->children[chunk].interface[node].vector;
+
+			local->nx = chunk_size;
+			local->elemsize = elemsize;
+
+			/* only set a pointer on nodes where the parent is allocated */
+			if (root_data->per_node[node].allocated) {
+				local->ptr = root_data->interface[node].vector.ptr + current_pos*elemsize;
+			}
+		}
+
+		current_pos += chunk_size;
+	}
+
+	/* the chunk lengths must cover the whole vector, no more no less */
+	STARPU_ASSERT(current_pos == nx);
+
+	return nchunks;
+}

+ 22 - 0
src/datawizard/interfaces/vector_filters.h

@@ -0,0 +1,22 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __VECTOR_FILTERS_H__
+#define __VECTOR_FILTERS_H__
+
+#include <datawizard/hierarchy.h>
+
+#endif // __VECTOR_FILTERS_H__

+ 340 - 0
src/datawizard/interfaces/vector_interface.c

@@ -0,0 +1,340 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <datawizard/data_parameters.h>
+#include <datawizard/coherency.h>
+#include <datawizard/copy-driver.h>
+#include <datawizard/hierarchy.h>
+
+#include <common/hash.h>
+
+#include <starpu.h>
+
+#ifdef USE_CUDA
+#include <cuda.h>
+#endif
+
+size_t allocate_vector_buffer_on_node(data_state *state, uint32_t dst_node);
+void liberate_vector_buffer_on_node(starpu_data_interface_t *interface, uint32_t node);
+int do_copy_vector_buffer_1_to_1(data_state *state, uint32_t src_node, uint32_t dst_node);
+size_t dump_vector_interface(starpu_data_interface_t *interface, void *buffer);
+size_t vector_interface_get_size(struct starpu_data_state_t *state);
+uint32_t footprint_vector_interface_crc32(data_state *state, uint32_t hstate);
+void display_vector_interface(data_state *state, FILE *f);
+#ifdef USE_GORDON
+int convert_vector_to_gordon(starpu_data_interface_t *interface, uint64_t *ptr, gordon_strideSize_t *ss); 
+#endif
+
+/* method table registered for the vector interface (see data_interface.h) */
+struct data_interface_ops_t interface_vector_ops = {
+	.allocate_data_on_node = allocate_vector_buffer_on_node,
+	.liberate_data_on_node = liberate_vector_buffer_on_node,
+	.copy_data_1_to_1 = do_copy_vector_buffer_1_to_1,
+	.dump_data_interface = dump_vector_interface,
+	.get_size = vector_interface_get_size,
+	.footprint = footprint_vector_interface_crc32,
+#ifdef USE_GORDON
+	.convert_to_gordon = convert_vector_to_gordon,
+#endif
+	.interfaceid = VECTOR_INTERFACE,
+	.display = display_vector_interface
+};
+
+#ifdef USE_GORDON
+/* Fill in the Gordon (Cell SPU) descriptor for a vector: base pointer and
+ * total size in bytes (nx * elemsize).  Always returns 0. */
+int convert_vector_to_gordon(starpu_data_interface_t *interface, uint64_t *ptr, gordon_strideSize_t *ss) 
+{
+	STARPU_ASSERT(gordon_interface);
+
+	*ptr = (*interface).vector.ptr;
+	(*ss).size = (*interface).vector.nx * (*interface).vector.elemsize;
+
+	return 0;
+}
+#endif
+
+/* declare a new piece of data with the vector interface; the buffer at "ptr"
+ * (nx elements of elemsize bytes) is initially valid on home_node only */
+void starpu_monitor_vector_data(struct starpu_data_state_t **handle, uint32_t home_node,
+                        uintptr_t ptr, uint32_t nx, size_t elemsize)
+{
+	struct starpu_data_state_t *state = calloc(1, sizeof(struct starpu_data_state_t));
+	STARPU_ASSERT(state);
+
+	STARPU_ASSERT(handle);
+	*handle = state;
+
+	/* the geometry (nx, elemsize) is replicated on every node, but the
+	 * pointer is only valid on the home node */
+	unsigned node;
+	for (node = 0; node < MAXNODES; node++)
+	{
+		starpu_vector_interface_t *local_interface = &state->interface[node].vector;
+
+		if (node == home_node) {
+			local_interface->ptr = ptr;
+		}
+		else {
+			local_interface->ptr = 0;
+		}
+
+		local_interface->nx = nx;
+		local_interface->elemsize = elemsize;
+	}
+
+	state->ops = &interface_vector_ops;
+
+	/* hand the data over to the DSM */
+	monitor_new_data(state, home_node, 0);
+}
+
+
+/* Fold the layout of a vector (only its length nx; the element pointer is
+ * irrelevant to the layout) into the running hash "hstate", using the
+ * caller-supplied hash function. */
+static inline uint32_t footprint_vector_interface_generic(uint32_t (*hash_func)(uint32_t input, uint32_t hstate), data_state *state, uint32_t hstate)
+{
+	uint32_t hash;
+
+	hash = hstate;
+	hash = hash_func(starpu_get_vector_nx(state), hash);
+
+	return hash;
+}
+
+/* CRC32 instantiation of the generic vector footprint */
+uint32_t footprint_vector_interface_crc32(data_state *state, uint32_t hstate)
+{
+	return footprint_vector_interface_generic(crc32_be, state, hstate);
+}
+
+struct dumped_vector_interface_s {
+	uintptr_t ptr;
+	uint32_t nx;
+	uint32_t elemsize;
+} __attribute__ ((packed));
+
+/* Write a one-column summary of a vector data (its element count) to f.
+ * Fix: nx is uint32_t, so it must be printed with %u rather than %d
+ * (mismatched conversion specifiers are undefined behavior). */
+void display_vector_interface(data_state *state, FILE *f)
+{
+	starpu_vector_interface_t *interface;
+	interface =  &state->interface[0].vector;
+
+	fprintf(f, "%u\t", interface->nx);
+}
+
+
+/* Serialize a vector interface into the caller-provided buffer; returns the
+ * number of bytes written.  NOTE(review): elemsize is size_t in the live
+ * interface but uint32_t in struct dumped_vector_interface_s — this silently
+ * truncates on LP64; confirm elemsize always fits in 32 bits. */
+size_t dump_vector_interface(starpu_data_interface_t *interface, void *_buffer)
+{
+	/* yes, that's DIRTY ... */
+	struct dumped_vector_interface_s *buffer = _buffer;
+
+	buffer->ptr = (*interface).vector.ptr;
+	buffer->nx = (*interface).vector.nx;
+	buffer->elemsize = (*interface).vector.elemsize;
+
+	return (sizeof(struct dumped_vector_interface_s));
+}
+
+/* Size in bytes of the vector's content (nx * elemsize); the geometry is
+ * replicated on all nodes so node 0's copy is authoritative. */
+size_t vector_interface_get_size(struct starpu_data_state_t *state)
+{
+	size_t size;
+	starpu_vector_interface_t *interface;
+
+	interface =  &state->interface[0].vector;
+
+	size = interface->nx*interface->elemsize;
+
+	return size;
+}
+
+/* offer an access to the data parameters */
+
+/* number of elements in the vector (identical on every node) */
+uint32_t starpu_get_vector_nx(data_state *state)
+{
+	return (state->interface[0].vector.nx);
+}
+
+/* pointer to the vector's buffer on the calling worker's memory node;
+ * the data must already be allocated there (checked by assertion) */
+uintptr_t starpu_get_vector_local_ptr(data_state *state)
+{
+	unsigned node;
+	node = get_local_memory_node();
+
+	STARPU_ASSERT(state->per_node[node].allocated);
+
+	return (state->interface[node].vector.ptr);
+}
+
+/* memory allocation/deallocation primitives for the vector interface */
+
+/* returns the size of the allocated area */
+/* Allocate the vector's buffer on dst_node using the allocator matching the
+ * node kind (malloc for RAM, cublasAlloc for CUDA).  Returns the number of
+ * bytes allocated, or 0 on failure: both allocators leave addr at 0 when
+ * they fail, which is how failure is detected below. */
+size_t allocate_vector_buffer_on_node(data_state *state, uint32_t dst_node)
+{
+	uintptr_t addr = 0;
+	size_t allocated_memory;
+
+	uint32_t nx = state->interface[dst_node].vector.nx;
+	size_t elemsize = state->interface[dst_node].vector.elemsize;
+
+	node_kind kind = get_node_kind(dst_node);
+
+	switch(kind) {
+		case RAM:
+			addr = (uintptr_t)malloc(nx*elemsize);
+			break;
+#ifdef USE_CUDA
+		case CUDA_RAM:
+			/* on failure cublasAlloc leaves addr untouched (0) */
+			cublasAlloc(nx, elemsize, (void **)&addr);
+			break;
+#endif
+		default:
+			assert(0);
+	}
+
+	if (addr) {
+		/* allocation succeeded */
+		allocated_memory = nx*elemsize;
+
+		/* update the data properly in consequence */
+		state->interface[dst_node].vector.ptr = addr;
+	} else {
+		/* allocation failed */
+		allocated_memory = 0;
+	}
+	
+	return allocated_memory;
+}
+
+/* Release the vector buffer previously allocated on "node", using the
+ * deallocator matching the node kind. */
+void liberate_vector_buffer_on_node(starpu_data_interface_t *interface, uint32_t node)
+{
+	node_kind kind = get_node_kind(node);
+	switch(kind) {
+		case RAM:
+			free((void*)interface->vector.ptr);
+			break;
+#ifdef USE_CUDA
+		case CUDA_RAM:
+			cublasFree((void*)interface->vector.ptr);
+			break;
+#endif
+		default:
+			assert(0);
+	}
+}
+
+#ifdef USE_CUDA
+/* Download a vector from GPU memory (src_node) into host RAM (dst_node);
+ * must run on the thread bound to the GPU node. */
+static void copy_cublas_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_vector_interface_t *src_vector;
+	starpu_vector_interface_t *dst_vector;
+
+	src_vector = &state->interface[src_node].vector;
+	dst_vector = &state->interface[dst_node].vector;
+
+	/* unit stride on both sides: the vectors are contiguous */
+	cublasGetVector(src_vector->nx, src_vector->elemsize,
+		(uint8_t *)src_vector->ptr, 1,
+		(uint8_t *)dst_vector->ptr, 1);
+
+	TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
+}
+
+/* Upload a vector from host RAM (src_node) into GPU memory (dst_node);
+ * must run on the thread bound to the GPU node. */
+static void copy_ram_to_cublas(data_state *state, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_vector_interface_t *src_vector;
+	starpu_vector_interface_t *dst_vector;
+
+	src_vector = &state->interface[src_node].vector;
+	dst_vector = &state->interface[dst_node].vector;
+
+	cublasSetVector(src_vector->nx, src_vector->elemsize,
+		(uint8_t *)src_vector->ptr, 1,
+		(uint8_t *)dst_vector->ptr, 1);
+
+	TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
+}
+#endif // USE_CUDA
+
+/* RAM-to-RAM vector copy: a plain memcpy of nx*elemsize bytes between the
+ * two nodes' buffers (both live in host memory). */
+static void dummy_copy_ram_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
+{
+	uint32_t nx = state->interface[dst_node].vector.nx;
+	size_t elemsize = state->interface[dst_node].vector.elemsize;
+
+	uintptr_t ptr_src = state->interface[src_node].vector.ptr;
+	uintptr_t ptr_dst = state->interface[dst_node].vector.ptr;
+
+	memcpy((void *)ptr_dst, (void *)ptr_src, nx*elemsize);
+
+	TRACE_DATA_COPY(src_node, dst_node, nx*elemsize);
+}
+
+/* Copy the vector buffer of "state" from src_node to dst_node, dispatching
+ * on the kinds (RAM / CUDA_RAM / SPU_LS) of both memory nodes.  A GPU->RAM
+ * transfer may only be initiated by the thread bound to the GPU node; any
+ * other caller posts an asynchronous request instead.  Always returns 0
+ * (unsupported combinations are fatal assertions). */
+int do_copy_vector_buffer_1_to_1(data_state *state, uint32_t src_node, uint32_t dst_node)
+{
+	node_kind src_kind = get_node_kind(src_node);
+	node_kind dst_kind = get_node_kind(dst_node);
+
+	switch (dst_kind) {
+	case RAM:
+		switch (src_kind) {
+			case RAM:
+				/* RAM -> RAM */
+				 dummy_copy_ram_to_ram(state, src_node, dst_node);
+				 break;
+#ifdef USE_CUDA
+			case CUDA_RAM:
+				/* CUBLAS_RAM -> RAM */
+				/* only the proper CUBLAS thread can initiate this ! */
+				if (get_local_memory_node() == src_node)
+				{
+					/* only the proper CUBLAS thread can initiate this directly ! */
+					copy_cublas_to_ram(state, src_node, dst_node);
+				}
+				else
+				{
+					/* put a request to the corresponding GPU */
+					post_data_request(state, src_node, dst_node);
+				}
+				break;
+#endif
+			case SPU_LS:
+				STARPU_ASSERT(0); // TODO
+				break;
+			case UNUSED:
+				/* node numbers are unsigned: use %u, not %d */
+				printf("error node %u UNUSED\n", src_node);
+				/* fallthrough: an UNUSED source is fatal too */
+			default:
+				assert(0);
+				break;
+		}
+		break;
+#ifdef USE_CUDA
+	case CUDA_RAM:
+		switch (src_kind) {
+			case RAM:
+				/* RAM -> CUBLAS_RAM */
+				/* only the proper CUBLAS thread can initiate this ! */
+				STARPU_ASSERT(get_local_memory_node() == dst_node);
+				copy_ram_to_cublas(state, src_node, dst_node);
+				break;
+			case CUDA_RAM:
+			case SPU_LS:
+				STARPU_ASSERT(0); // TODO 
+				break;
+			case UNUSED:
+			default:
+				STARPU_ASSERT(0);
+				break;
+		}
+		break;
+#endif
+	case SPU_LS:
+		STARPU_ASSERT(0); // TODO
+		break;
+	case UNUSED:
+	default:
+		assert(0);
+		break;
+	}
+
+	return 0;
+}
+

+ 24 - 0
src/datawizard/interfaces/vector_interface.h

@@ -0,0 +1,24 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __VECTOR_INTERFACE_H__
+#define __VECTOR_INTERFACE_H__
+
+#include <stdint.h>
+
+#define VECTOR_INTERFACE   0x118503
+
+#endif // __VECTOR_INTERFACE_H__

+ 537 - 0
src/datawizard/memalloc.c

@@ -0,0 +1,537 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "memalloc.h"
+#include <datawizard/footprint.h>
+
+extern mem_node_descr descr;
+static starpu_mutex mc_mutex[MAXNODES]; 
+static mem_chunk_list_t mc_list[MAXNODES];
+static mem_chunk_list_t mc_list_to_free[MAXNODES];
+
+/* Initialize, for every memory node, the mutex and the two chunk lists:
+ * mc_list (chunks currently backing live data) and mc_list_to_free (chunks
+ * whose removal was requested).  Must run before any allocation. */
+void init_mem_chunk_lists(void)
+{
+	unsigned i;
+	for (i = 0; i < MAXNODES; i++)
+	{
+		init_mutex(&mc_mutex[i]);
+		mc_list[i] = mem_chunk_list_new();
+		mc_list_to_free[i] = mem_chunk_list_new();
+	}
+}
+
+/* Take the header lock of every leaf under "data".  While spinning on a
+ * leaf's lock we keep servicing datawizard requests for the local node, so
+ * the thread holding the lock can make progress and release it. */
+static void lock_all_subtree(data_state *data)
+{
+	if (data->nchildren == 0)
+	{
+		/* this is a leaf */	
+		while (take_mutex_try(&data->header_lock))
+			datawizard_progress(get_local_memory_node());
+	}
+	else {
+		/* lock all sub-subtrees children */
+		int child;
+		for (child = 0; child < data->nchildren; child++)
+		{
+			lock_all_subtree(&data->children[child]);
+		}
+	}
+}
+
+/* Release the header lock of every leaf under "data", walking the children
+ * in reverse order (the opposite of lock_all_subtree). */
+static void unlock_all_subtree(data_state *data)
+{
+	if (data->nchildren == 0)
+	{
+		/* this is a leaf */	
+		release_mutex(&data->header_lock);
+	}
+	else {
+		/* unlock the children in reverse of the locking order */
+		int child;
+		for (child = data->nchildren - 1; child >= 0; child--)
+		{
+			unlock_all_subtree(&data->children[child]);
+		}
+	}
+}
+
+/* Return 1 if every leaf of the subtree has a zero reference count on
+ * "node" (i.e. nobody is using the buffers there), 0 otherwise.  The caller
+ * must hold the leaves' header locks (see lock_all_subtree). */
+unsigned may_free_subtree(data_state *data, unsigned node)
+{
+	if (data->nchildren == 0)
+	{
+		/* we only free if no one refers to the leaf */
+		uint32_t refcnt = get_data_refcnt(data, node);
+		return (refcnt == 0);
+	}
+	else {
+		/* every child must itself be freeable */
+		int child;
+		for (child = 0; child < data->nchildren; child++)
+		{
+			unsigned res;
+			res = may_free_subtree(&data->children[child], node);
+			if (!res) return 0;
+		}
+
+		/* no problem was found */
+		return 1;
+	}
+}
+
+/* Release the buffer behind "mc" on "node", unlink the chunk from the
+ * node's active list and destroy the chunk descriptor.  Returns the number
+ * of bytes actually liberated.  Caller must hold mc_mutex[node]. */
+size_t do_free_mem_chunk(mem_chunk_t mc, unsigned node)
+{
+	size_t size;
+
+	/* free the actual buffer */
+	size = liberate_memory_on_node(mc, node);
+
+	/* remove the mem_chunk from the list */
+	mem_chunk_list_erase(mc_list[node], mc);
+	mem_chunk_delete(mc);
+
+	return size; 
+}
+
+/* Make sure no leaf of the subtree keeps its only valid copy on src_node:
+ * an OWNER copy is physically copied to dst_node (which becomes the new
+ * owner), a SHARED copy is simply invalidated (some other node still has
+ * one; if a single sharer remains it is promoted to OWNER).  The caller
+ * must hold the leaves' header locks. */
+void transfer_subtree_to_node(data_state *data, unsigned src_node, 
+						unsigned dst_node)
+{
+	unsigned i;
+	unsigned last = 0;
+	unsigned cnt;
+	int ret;
+
+	if (data->nchildren == 0)
+	{
+		/* this is a leaf */
+		switch(data->per_node[src_node].state) {
+		case OWNER:
+			/* the local node has the only copy */
+			/* the owner is now the destination_node */
+			data->per_node[src_node].state = INVALID;
+			data->per_node[dst_node].state = OWNER;
+
+			ret = driver_copy_data_1_to_1(data, src_node, dst_node, 0);
+			STARPU_ASSERT(ret == 0);
+
+			break;
+		case SHARED:
+			/* some other node may have the copy */
+			data->per_node[src_node].state = INVALID;
+
+			/* count the number of copies */
+			cnt = 0;
+			for (i = 0; i < MAXNODES; i++)
+			{
+				if (data->per_node[i].state == SHARED) {
+					cnt++; 
+					last = i;
+				}
+			}
+
+			/* a single remaining sharer becomes the owner */
+			if (cnt == 1)
+				data->per_node[last].state = OWNER;
+
+			break;
+		case INVALID:
+			/* nothing to be done */
+			break;
+		default:
+			STARPU_ASSERT(0);
+			break;
+		}
+	}
+	else {
+		/* recursively transfer every child's subtree */
+		int child;
+		for (child = 0; child < data->nchildren; child++)
+		{
+			transfer_subtree_to_node(&data->children[child],
+							src_node, dst_node);
+		}
+	}
+}
+
+
+/* Try to liberate the buffer behind "mc" on "node": lock the data's
+ * subtree, and if no leaf is referenced, push the content back to node 0
+ * (main memory) and free the buffer.  Returns the number of bytes
+ * liberated (0 if the data was busy). */
+static size_t try_to_free_mem_chunk(mem_chunk_t mc, unsigned node, unsigned attempts)
+{
+	size_t liberated = 0;
+
+	data_state *data;
+
+	data = mc->data;
+
+	STARPU_ASSERT(data);
+
+	if (attempts == 0)
+	{
+		/* this is the first attempt to free memory
+		   so we avoid to drop requested memory */
+		/* TODO */
+	}
+
+	/* try to lock all the leafs of the subtree */
+	lock_all_subtree(data);
+
+	/* check if they are all "free" */
+	if (may_free_subtree(data, node))
+	{
+		/* in case there was nobody using that buffer, throw it 
+		 * away after writing it back to main memory */
+		transfer_subtree_to_node(data, node, 0);
+
+		/* now the actual buffer may be liberated */
+		liberated = do_free_mem_chunk(mc, node);
+	}
+
+	/* unlock the leafs */
+	unlock_all_subtree(data);
+
+	return liberated;
+}
+
+#ifdef USE_ALLOCATION_CACHE
+/* Reassign the buffer behind "mc" from its old data to "new_data".
+ * "is_already_in_mc_list" tells whether mc currently sits in the active
+ * list mc_list[node] (1) or in mc_list_to_free[node] (0): in the latter
+ * case it is moved back into the active list.  The buffer geometry
+ * (mc->ops, mc->size, mc->footprint, mc->interface) is reused unchanged.
+ * we assume that mc_mutex[node] is taken */
+static void reuse_mem_chunk(unsigned node, data_state *new_data, mem_chunk_t mc, unsigned is_already_in_mc_list)
+{
+	data_state *old_data;
+	old_data = mc->data;
+
+	/* we found an appropriate mem chunk: so we get it out
+	 * of the "to free" list, and reassign it to the new
+	 * piece of data */
+
+	if (!is_already_in_mc_list)
+	{
+		mem_chunk_list_erase(mc_list_to_free[node], mc);
+	}
+
+	/* the old data loses its buffer on this node (unless it was already
+	 * deleted, in which case its state must not be touched) */
+	if (!mc->data_was_deleted)
+	{
+		old_data->per_node[node].allocated = 0;
+		old_data->per_node[node].automatically_allocated = 0;
+	}
+
+	new_data->per_node[node].allocated = 1;
+	new_data->per_node[node].automatically_allocated = 1;
+
+	memcpy(&new_data->interface[node], &mc->interface, sizeof(starpu_data_interface_t));
+
+	mc->data = new_data;
+	mc->data_was_deleted = 0;
+	/* mc->ops, mc->size, mc->footprint and mc->interface should be
+ 	 * unchanged ! */
+	
+	/* reinsert the mem chunk in the list of active memory chunks */
+	if (!is_already_in_mc_list)
+	{
+		mem_chunk_list_push_front(mc_list[node], mc);
+	}
+}
+
+
+
+/* Try to steal the buffer of a still-live data for "new_data": lock the old
+ * data's subtree, and if none of its leaves are referenced on "node", push
+ * its content back to main memory and hand the buffer over.  Returns 1 on
+ * success, 0 if the old data was busy. */
+static unsigned try_to_reuse_mem_chunk(mem_chunk_t mc, unsigned node, data_state *new_data, unsigned is_already_in_mc_list)
+{
+	unsigned success = 0;
+
+	data_state *old_data;
+
+	old_data = mc->data;
+
+	STARPU_ASSERT(old_data);
+
+	/* try to lock all the leafs of the subtree */
+	lock_all_subtree(old_data);
+
+	/* check if they are all "free" */
+	if (may_free_subtree(old_data, node))
+	{
+		success = 1;
+
+		/* in case there was nobody using that buffer, throw it 
+		 * away after writing it back to main memory */
+		transfer_subtree_to_node(old_data, node, 0);
+
+		/* now replace the previous data */
+		reuse_mem_chunk(node, new_data, mc, is_already_in_mc_list);
+	}
+
+	/* unlock the leafs */
+	unlock_all_subtree(old_data);
+
+	return success;
+}
+
+/* this function looks for a memory chunk that matches a given footprint in the
+ * list of mem chunk that need to be liberated; failing that, it tries the
+ * "not important" data in the active list.  Returns 1 when a chunk was
+ * reused for "data", 0 otherwise. */
+static unsigned try_to_find_reusable_mem_chunk(unsigned node, data_state *data, uint32_t footprint)
+{
+	take_mutex(&mc_mutex[node]);
+
+	/* go through all buffers for which there was a removal request */
+	mem_chunk_t mc, next_mc;
+	for (mc = mem_chunk_list_begin(mc_list_to_free[node]);
+	     mc != mem_chunk_list_end(mc_list_to_free[node]);
+	     mc = next_mc)
+	{
+		next_mc = mem_chunk_list_next(mc);
+
+		if (mc->footprint == footprint)
+		{
+
+			data_state *old_data;
+			old_data = mc->data;
+
+			if (old_data->per_node[node].allocated &&
+					old_data->per_node[node].automatically_allocated)
+			{
+				reuse_mem_chunk(node, data, mc, 0);
+
+				release_mutex(&mc_mutex[node]);
+				return 1;
+			}
+		}
+
+	}
+
+	/* now look for some non essential data in the active list */
+	for (mc = mem_chunk_list_begin(mc_list[node]);
+	     mc != mem_chunk_list_end(mc_list[node]);
+	     mc = next_mc)
+	{
+		/* there is a risk that the memory chunk is liberated 
+		   before next iteration starts: so we compute the next
+		   element of the list now */
+		next_mc = mem_chunk_list_next(mc);
+
+		if (mc->data->is_not_important && (mc->footprint == footprint))
+		{
+//			fprintf(stderr, "found a candidate ...\n");
+			if (try_to_reuse_mem_chunk(mc, node, data, 1))
+			{
+				release_mutex(&mc_mutex[node]);
+				return 1;
+			}
+		}
+	}
+
+	release_mutex(&mc_mutex[node]);
+
+	return 0;
+}
+#endif
+
+/* 
+ * Try to free some memory on the specified node: first drop every buffer
+ * whose removal was explicitly requested, then try to liberate allocated
+ * but currently unreferenced chunks.
+ * 	returns the number of bytes liberated (0 if nothing was released)
+ */
+static size_t reclaim_memory(uint32_t node, size_t toreclaim __attribute__ ((unused)), unsigned attempts)
+{
+//	fprintf(stderr, "reclaim memory...\n");
+
+	size_t liberated = 0;
+
+	take_mutex(&mc_mutex[node]);
+
+	/* remove all buffers for which there was a removal request */
+	mem_chunk_t mc, next_mc;
+	for (mc = mem_chunk_list_begin(mc_list_to_free[node]);
+	     mc != mem_chunk_list_end(mc_list_to_free[node]);
+	     mc = next_mc)
+	{
+		next_mc = mem_chunk_list_next(mc);
+
+		liberated += liberate_memory_on_node(mc, node);
+
+		mem_chunk_list_erase(mc_list_to_free[node], mc);
+
+		mem_chunk_delete(mc);
+	}
+
+	/* try to free all allocated data potentially in use .. XXX */
+	for (mc = mem_chunk_list_begin(mc_list[node]);
+	     mc != mem_chunk_list_end(mc_list[node]);
+	     mc = next_mc)
+	{
+		/* there is a risk that the memory chunk is liberated 
+		   before next iteration starts: so we compute the next
+		   element of the list now */
+		next_mc = mem_chunk_list_next(mc);
+
+		liberated += try_to_free_mem_chunk(mc, node, attempts);
+		#if 0
+		if (liberated > toreclaim)
+			break;
+		#endif
+	}
+
+//	fprintf(stderr, "got %d MB back\n", (int)liberated/(1024*1024));
+
+	release_mutex(&mc_mutex[node]);
+
+	return liberated;
+}
+
+/* Record a freshly allocated buffer of "state" on dst_node as a mem_chunk
+ * in the node's active list, snapshotting the layout (footprint, ops and a
+ * copy of the interface) so the buffer can later be reused even if the data
+ * itself is deleted. */
+static void register_mem_chunk(data_state *state, uint32_t dst_node, size_t size, unsigned automatically_allocated)
+{
+	mem_chunk_t mc = mem_chunk_new();
+
+	STARPU_ASSERT(state);
+	STARPU_ASSERT(state->ops);
+
+	mc->data = state;
+	mc->size = size; 
+	mc->footprint = compute_data_footprint(state);
+	mc->ops = state->ops;
+	mc->data_was_deleted = 0;
+	mc->automatically_allocated = automatically_allocated;
+
+	/* the interface was already filled by ops->allocate_data_on_node */
+	memcpy(&mc->interface, &state->interface[dst_node], sizeof(starpu_data_interface_t));
+
+	take_mutex(&mc_mutex[dst_node]);
+	mem_chunk_list_push_front(mc_list[dst_node], mc);
+	release_mutex(&mc_mutex[dst_node]);
+}
+
+/* Mark the mem chunk backing "state" on "node" for removal: it is moved
+ * from the active list into the "to free" list, where reclaim_memory or
+ * the allocation cache will eventually dispose of it.  Doing nothing when
+ * no matching chunk exists is deliberate (the data may not have been
+ * allocated on this node). */
+void request_mem_chunk_removal(data_state *state, unsigned node)
+{
+	take_mutex(&mc_mutex[node]);
+
+	/* iterate over the list of memory chunks and remove the entry */
+	mem_chunk_t mc, next_mc;
+	for (mc = mem_chunk_list_begin(mc_list[node]);
+	     mc != mem_chunk_list_end(mc_list[node]);
+	     mc = next_mc)
+	{
+		next_mc = mem_chunk_list_next(mc);
+
+		if (mc->data == state) {
+			/* we found the data */
+			mc->data_was_deleted = 1;
+
+			/* remove it from the main list */
+			mem_chunk_list_erase(mc_list[node], mc);
+
+			/* put it in the list of buffers to be removed */
+			mem_chunk_list_push_front(mc_list_to_free[node], mc);
+
+			release_mutex(&mc_mutex[node]);
+
+			return;
+		}
+	}
+
+	/* there was no corresponding buffer ... */
+
+	release_mutex(&mc_mutex[node]);
+}
+
+/* Release the buffer behind "mc" on "node" through the interface's
+ * liberate method, but only if it was automatically allocated by the DSM
+ * (user-provided buffers are never freed here).  Returns the number of
+ * bytes actually liberated. */
+size_t liberate_memory_on_node(mem_chunk_t mc, uint32_t node)
+{
+	size_t liberated = 0;
+
+	STARPU_ASSERT(mc->ops);
+	STARPU_ASSERT(mc->ops->liberate_data_on_node);
+
+	if (mc->automatically_allocated)
+	{
+		mc->ops->liberate_data_on_node(&mc->interface, node);
+
+		/* if the data was deleted, its state no longer exists and
+		 * must not be written to */
+		if (!mc->data_was_deleted)
+		{
+			data_state *state = mc->data;
+
+			state->per_node[node].allocated = 0;
+
+			/* XXX why do we need that ? */
+			state->per_node[node].automatically_allocated = 0;
+		}
+
+		liberated = mc->size;
+	}
+
+	return liberated;
+}
+
+/*
+ * In order to allocate a piece of data, we try to reuse existing buffers if
+ * its possible.
+ *	1 - we try to reuse a memchunk that is explicitely unused.
+ *	2 - we go through the list of memory chunks and find one that is not
+ *	referenced and that has the same footprint to reuse it.
+ *	3 - we call the usual driver's alloc method
+ *	4 - we go through the list of memory chunks and release those that are
+ *	not referenced (or part of those).
+ *
+ * Returns 0 on success, -ENOMEM when no memory could be obtained.
+ *
+ * Fix: TRACE_END_ALLOC_REUSE was previously only emitted on the cache-miss
+ * path, leaving an unmatched START event in the trace whenever the
+ * allocation cache hit; the END event is now emitted on both paths.
+ */
+int allocate_memory_on_node(data_state *state, uint32_t dst_node)
+{
+	unsigned attempts = 0;
+	size_t allocated_memory;
+
+	STARPU_ASSERT(state);
+
+	data_allocation_inc_stats(dst_node);
+
+#ifdef USE_ALLOCATION_CACHE
+	/* perhaps we can directly reuse a buffer in the free-list */
+	uint32_t footprint = compute_data_footprint(state);
+
+	TRACE_START_ALLOC_REUSE(dst_node);
+	if (try_to_find_reusable_mem_chunk(dst_node, state, footprint))
+	{
+		/* close the trace interval before the early return */
+		TRACE_END_ALLOC_REUSE(dst_node);
+		allocation_cache_hit(dst_node);
+		return 0;
+	}
+	TRACE_END_ALLOC_REUSE(dst_node);
+#endif
+
+	/* fall back to the driver's allocator, reclaiming memory and
+	 * retrying a couple of times on failure */
+	do {
+		STARPU_ASSERT(state->ops);
+		STARPU_ASSERT(state->ops->allocate_data_on_node);
+
+		TRACE_START_ALLOC(dst_node);
+		allocated_memory = state->ops->allocate_data_on_node(state, dst_node);
+		TRACE_END_ALLOC(dst_node);
+
+		if (!allocated_memory) {
+			/* XXX perhaps we should find the proper granularity 
+			 * not to waste our cache all the time */
+			STARPU_ASSERT(state->ops->get_size);
+			size_t data_size = state->ops->get_size(state);
+
+			TRACE_START_MEMRECLAIM(dst_node);
+			reclaim_memory(dst_node, 2*data_size, attempts);
+			TRACE_END_MEMRECLAIM(dst_node);
+		}
+		
+	} while(!allocated_memory && attempts++ < 2);
+
+	/* perhaps we could really not handle that capacity misses */
+	if (!allocated_memory)
+		goto nomem;
+
+	register_mem_chunk(state, dst_node, allocated_memory, 1);
+
+	state->per_node[dst_node].allocated = 1;
+	state->per_node[dst_node].automatically_allocated = 1;
+
+	return 0;
+nomem:
+	STARPU_ASSERT(!allocated_memory);
+	return -ENOMEM;
+}

+ 51 - 0
src/datawizard/memalloc.h

@@ -0,0 +1,51 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __MEMALLOC_H__
+#define __MEMALLOC_H__
+
+#include <common/list.h>
+#include <datawizard/interfaces/data_interface.h>
+#include <datawizard/coherency.h>
+#include <datawizard/copy-driver.h>
+#include <datawizard/progress.h>
+
+struct starpu_data_state_t;
+
+/* A memory chunk: one buffer allocated on a memory node, tracked so that
+ * it can later be reused (allocation cache) or reclaimed. */
+LIST_TYPE(mem_chunk,
+	struct starpu_data_state_t *data;
+	size_t size;
+
+	/* hash of the data layout, used for fast reuse lookups */
+	uint32_t footprint;
+	
+	/* The footprint of the data is not sufficient to determine whether two
+	 * pieces of data have the same layout (there could be collisions in the
+	 * hash function ...) so we still keep a copy of the actual layout (ie.
+	 * the starpu_data_interface_t) to stay on the safe side. We make a copy
+	 * because when a data is deleted, the memory chunk remains.
+	 */
+	struct data_interface_ops_t *ops;
+	starpu_data_interface_t interface;
+	unsigned automatically_allocated;
+	unsigned data_was_deleted;
+);
+
+/* Initialise the per-node memory chunk lists. */
+void init_mem_chunk_lists(void);
+/* Schedule the chunk backing `state' on `node' for removal. */
+void request_mem_chunk_removal(struct starpu_data_state_t *state, unsigned node);
+/* Allocate a buffer for `state' on `dst_node'; 0 on success, -ENOMEM otherwise. */
+int allocate_memory_on_node(struct starpu_data_state_t *state, uint32_t dst_node);
+/* Free chunk `mc' on `node'; presumably returns the amount of memory
+ * released -- confirm against the implementation. */
+size_t liberate_memory_on_node(mem_chunk_t mc, uint32_t node);
+
+#endif

+ 49 - 0
src/datawizard/progress.c

@@ -0,0 +1,49 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <pthread.h>
+#include <core/workers.h>
+#include <datawizard/progress.h>
+#include <datawizard/data_request.h>
+
+extern pthread_key_t local_workers_key;
+
+#ifdef USE_GORDON
+extern void handle_terminated_job_per_worker(struct worker_s *worker);
+extern struct starpu_mutex_t terminated_list_mutexes[32]; 
+#endif
+
+/* Make progress on pending data movement for `memory_node'.  Called by
+ * drivers while they wait (e.g. spinning on a lock) so that data requests
+ * issued by other workers are eventually serviced. */
+void datawizard_progress(uint32_t memory_node)
+{
+	/* in case some other driver requested data */
+	handle_node_data_requests(memory_node);
+
+#ifdef USE_GORDON
+	/* XXX quick and dirty !! */
+	struct worker_set_s *set;
+	set = pthread_getspecific(local_workers_key);
+	if (set) {
+		/* make the corresponding workers progress */
+		unsigned worker;
+		for (worker = 0; worker < set->nworkers; worker++)
+		{
+			/* NOTE(review): always takes mutex [0] regardless of
+			 * `worker' -- looks like coarse locking on purpose,
+			 * but confirm it should not be [worker]. */
+			take_mutex(&terminated_list_mutexes[0]);
+			handle_terminated_job_per_worker(&set->workers[worker]);
+			release_mutex(&terminated_list_mutexes[0]);
+		}
+	}
+#endif
+}

+ 24 - 0
src/datawizard/progress.h

@@ -0,0 +1,24 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DW_PROGRESS_H__
+#define __DW_PROGRESS_H__
+
+#include <stdint.h>
+
+void datawizard_progress(uint32_t memory_node);
+
+#endif

+ 76 - 0
src/datawizard/write_back.c

@@ -0,0 +1,76 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <datawizard/write_back.h>
+#include <datawizard/coherency.h>
+
+/* Commit the copy of `state' held by `requesting_node' onto every node
+ * selected in `write_through_mask'.  Afterwards all nodes in the mask,
+ * requesting node included, hold the data in the SHARED state.  No-op
+ * when the mask selects nothing but the requesting node itself. */
+void write_through_data(data_state *state, uint32_t requesting_node, 
+					   uint32_t write_through_mask)
+{
+	if ((write_through_mask & ~(1<<requesting_node)) == 0) {
+		/* nothing will be done ... */
+		return;
+	}
+
+	/* spin for the header lock, servicing data requests meanwhile so
+	 * that we do not stall progress on this node */
+	while (take_mutex_try(&state->header_lock))
+		datawizard_progress(requesting_node);
+
+	/* first commit all changes onto the nodes specified by the mask */
+	uint32_t node;
+	for (node = 0; node < MAXNODES; node++)
+	{
+		if (write_through_mask & (1<<node)) {
+			/* we need to commit the buffer on that node */
+			if (node != requesting_node) 
+			{
+				/* the requesting node already has the data by
+				 * definition */
+				int ret;
+				ret = driver_copy_data_1_to_1(state, 
+						requesting_node, node, 0);
+
+				/* there must remain memory on the write-through mask to honor the request */
+				if (ret)
+					STARPU_ASSERT(0);
+			}
+				
+			/* now the data is shared among the nodes on the
+			 * write_through_mask */
+			state->per_node[node].state = SHARED;
+		}
+	}
+
+	/* the requesting node is now one sharer */
+	if (write_through_mask & ~(1<<requesting_node))
+	{
+		state->per_node[requesting_node].state = SHARED;
+	}
+
+	release_mutex(&state->header_lock);
+}
+
+/* Set the write-back mask of `data': each bit selects a memory node onto
+ * which modifications must be written through.  The mask is propagated
+ * recursively to all children of the data. */
+void data_set_wb_mask(data_state *data, uint32_t wb_mask)
+{
+	data->wb_mask = wb_mask;
+
+	/* in case the data has some children, set their wb_mask as well */
+	if (data->nchildren > 0) 
+	{
+		int child;
+		for (child = 0; child < data->nchildren; child++)
+			data_set_wb_mask(&data->children[child], wb_mask);
+	}
+}

+ 27 - 0
src/datawizard/write_back.h

@@ -0,0 +1,27 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DW_WRITE_BACK_H__
+#define __DW_WRITE_BACK_H__
+
+#include <datawizard/coherency.h>
+
+void write_through_data(data_state *state, uint32_t requesting_node, 
+					   uint32_t write_through_mask);
+void data_set_wb_mask(data_state *state, uint32_t wb_mask);
+
+
+#endif // __DW_WRITE_BACK_H__

+ 31 - 0
src/drivers/core/Makefile

@@ -0,0 +1,31 @@
+#
+# StarPU
+# Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+# pick up project-wide headers from the src/ root
+CFLAGS += -I../../
+
+OBJS := driver_core.o
+
+all: $(OBJS)
+
+# generate and include auto-dependency files, except when only cleaning
+ifeq ($(filter ${MAKECMDGOALS},clean distclean),)
+%.d: %.c
+	$(CC) $(CFLAGS) $< -MM -o $*.d
+
+-include $(OBJS:.o=.d)
+endif
+
+# also remove gcov coverage artifacts
+clean:
+	@rm -f *.o *.d *.gcno *.gcda 

+ 157 - 0
src/drivers/core/driver_core.c

@@ -0,0 +1,157 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "driver_core.h"
+#include <core/policies/sched_policy.h>
+
+/* Execute job `j' on the calling CPU core.  Fetches the codelet's input
+ * buffers, runs its core function, then pushes the outputs back.  When
+ * the performance model is calibrating (or BENCHMARK_COMM is set), the
+ * computation and communication delays are measured and accumulated on
+ * the worker's queue, and fed to the history-based performance model.
+ * Returns STARPU_SUCCESS on completion, or STARPU_TRYAGAIN when the
+ * inputs could not be fetched (not enough memory) so the caller should
+ * push the job back onto the queue. */
+int execute_job_on_core(job_t j, struct worker_s *core_args)
+{
+	int ret;
+	tick_t codelet_start, codelet_end;
+	tick_t codelet_start_comm, codelet_end_comm;
+
+	unsigned calibrate_model = 0;
+	struct starpu_task *task = j->task;
+
+	STARPU_ASSERT(task->cl);
+	STARPU_ASSERT(task->cl->core_func);
+
+	if (task->cl->model && task->cl->model->benchmarking)
+		calibrate_model = 1;
+
+	if (calibrate_model || BENCHMARK_COMM)
+		GET_TICK(codelet_start_comm);
+
+	ret = fetch_codelet_input(task->buffers, task->interface,
+			task->cl->nbuffers, 0);
+
+	if (calibrate_model || BENCHMARK_COMM)
+		GET_TICK(codelet_end_comm);
+
+	if (ret != 0) {
+		/* there was not enough memory so the codelet cannot be executed right now ... */
+		/* push the codelet back and try another one ... */
+		return STARPU_TRYAGAIN;
+	}
+
+	TRACE_START_CODELET_BODY(j);
+
+	if (calibrate_model || BENCHMARK_COMM)
+		GET_TICK(codelet_start);
+
+	/* run the actual computation */
+	cl_func func = task->cl->core_func;
+	func(task->interface, task->cl_arg);
+	
+	if (calibrate_model || BENCHMARK_COMM)
+		GET_TICK(codelet_end);
+
+	TRACE_END_CODELET_BODY(j);
+
+	push_codelet_output(task->buffers, task->cl->nbuffers, 0);
+
+//#ifdef MODEL_DEBUG
+	if (calibrate_model || BENCHMARK_COMM)
+	{
+		double measured = timing_delay(&codelet_start, &codelet_end);
+		double measured_comm = timing_delay(&codelet_start_comm, &codelet_end_comm);
+
+//		fprintf(stderr, "%d\t%d\n", (int)j->penality, (int)measured_comm);
+		/* accumulate per-queue timing statistics */
+		core_args->jobq->total_computation_time += measured;
+		core_args->jobq->total_communication_time += measured_comm;
+
+		if (calibrate_model)
+			update_perfmodel_history(j, core_args->arch, measured);
+	}
+//#endif
+
+	return STARPU_SUCCESS;
+}
+
+/* Thread entry point for a CPU worker.  Binds the thread to its core,
+ * registers it with the tracing and memory-node subsystems, signals the
+ * main thread it is ready, then loops popping and executing tasks until
+ * the machine shuts down. */
+void *core_worker(void *arg)
+{
+	struct worker_s *core_arg = arg;
+
+#ifdef USE_FXT
+	fxt_register_thread(core_arg->bindid);
+#endif
+	TRACE_NEW_WORKER(FUT_CORE_KEY, core_arg->memory_node);
+
+#ifndef DONTBIND
+	/* fix the thread on the correct cpu */
+	cpu_set_t aff_mask; 
+	CPU_ZERO(&aff_mask);
+	CPU_SET(core_arg->bindid, &aff_mask);
+	sched_setaffinity(0, sizeof(aff_mask), &aff_mask);
+#endif
+
+#ifdef VERBOSE
+        fprintf(stderr, "core worker %d is ready on logical core %d\n", core_arg->id, core_arg->bindid);
+#endif
+
+	set_local_memory_node_key(&core_arg->memory_node);
+
+	set_local_queue(core_arg->jobq);
+
+	/* this is only useful (and meaningful) if there is a single
+	   memory node "related" to that queue */
+	core_arg->jobq->memory_node = core_arg->memory_node;
+
+	core_arg->jobq->total_computation_time = 0.0;
+	core_arg->jobq->total_communication_time = 0.0;
+	
+        /* tell the main thread that we are ready */
+        sem_post(&core_arg->ready_sem);
+
+        job_t j;
+	int res;
+
+	while (machine_is_running())
+	{
+                j = pop_task();
+                if (j == NULL) continue;
+
+		/* can a core perform that task ? */
+		if (!CORE_MAY_PERFORM(j)) 
+		{
+			/* put it at the end of the queue ... XXX */
+			push_task(j);
+			continue;
+		}
+
+                res = execute_job_on_core(j, core_arg);
+		if (res != STARPU_SUCCESS) {
+			switch (res) {
+				/* NOTE(review): with NDEBUG the assert is a
+				 * no-op and STARPU_FATAL falls through to the
+				 * TRYAGAIN case -- confirm this is intended. */
+				case STARPU_FATAL:
+					assert(0);
+				case STARPU_TRYAGAIN:
+					push_task(j);
+					continue;
+				default: 
+					assert(0);
+			}
+		}
+
+		handle_job_termination(j);
+        }
+
+#ifdef DATA_STATS
+	/* NOTE(review): "\%" is not a standard C escape sequence; "%%" alone
+	 * is the portable way to print a literal '%' -- worth fixing. */
+	fprintf(stderr, "CORE #%d computation %le comm %le (%lf \%%)\n", core_arg->id, core_arg->jobq->total_computation_time, core_arg->jobq->total_communication_time,  core_arg->jobq->total_communication_time*100.0/core_arg->jobq->total_computation_time);
+#endif
+
+	TRACE_WORKER_TERMINATED(FUT_CORE_KEY);
+
+	pthread_exit(NULL);
+}

+ 39 - 0
src/drivers/core/driver_core.h

@@ -0,0 +1,39 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DRIVER_CORE_H__
+#define __DRIVER_CORE_H__
+
+/* to bind threads onto a given cpu */
+#define _GNU_SOURCE
+#include <sched.h>
+
+#include <common/config.h>
+#include <core/jobs.h>
+
+#include <core/perfmodel/perfmodel.h>
+#include <common/fxt.h>
+#include <datawizard/datawizard.h>
+
+#include <starpu.h>
+
+/* CPU worker thread entry point (takes a struct worker_s *). */
+void *core_worker(void *);
+
+/* default upper bound on the number of CPU cores handled */
+#ifndef NMAXCORES
+#define NMAXCORES       4
+#endif
+
+#endif //  __DRIVER_CORE_H__

+ 0 - 0
src/drivers/cuda/Makefile


Some files were not shown because too many files changed in this diff