
Merge from trunk @11364:11518

Marc Sergent 11 years ago
parent
commit
596effeffa
100 changed files with 1187 additions and 693 deletions
  1. 6 0
      ChangeLog
  2. 2 2
      configure.ac
  3. 5 0
      doc/Makefile.am
  4. 1 1
      doc/doxygen/chapters/advanced_examples.doxy
  5. 3 1
      doc/doxygen/chapters/api/scheduling_contexts.doxy
  6. 4 0
      doc/doxygen/chapters/api/scheduling_policy.doxy
  7. 21 1
      doc/doxygen/chapters/environment_variables.doxy
  8. 17 0
      doc/doxygen/chapters/optimize_performance.doxy
  9. 13 11
      doc/doxygen/chapters/tips_and_tricks.doxy
  10. 0 3
      include/starpu.h
  11. 2 0
      include/starpu_bitmap.h
  12. 2 1
      include/starpu_sched_ctx_hypervisor.h
  13. 128 18
      include/starpu_task_list.h
  14. 4 0
      include/starpu_thread.h
  15. 47 1
      include/starpu_thread_util.h
  16. 10 0
      include/starpu_util.h
  17. 19 18
      mpi/src/starpu_mpi.c
  18. 1 0
      mpi/src/starpu_mpi_collective.c
  19. 7 0
      mpi/src/starpu_mpi_task_insert.c
  20. 8 0
      sc_hypervisor/include/sc_hypervisor_monitoring.h
  21. 2 2
      sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c
  22. 9 2
      sc_hypervisor/src/policies_utils/lp_tools.c
  23. 1 1
      sc_hypervisor/src/policies_utils/speed.c
  24. 112 100
      sc_hypervisor/src/sc_hypervisor.c
  25. 2 0
      sc_hypervisor/src/sc_hypervisor_intern.h
  26. 0 2
      src/Makefile.am
  27. 1 0
      src/common/barrier.c
  28. 1 0
      src/common/barrier.h
  29. 17 3
      src/common/barrier_counter.c
  30. 5 3
      src/common/barrier_counter.h
  31. 11 0
      src/common/fxt.h
  32. 45 0
      src/common/thread.c
  33. 14 0
      src/common/utils.c
  34. 2 0
      src/common/utils.h
  35. 4 1
      src/core/debug.h
  36. 3 2
      src/core/dependencies/tags.c
  37. 10 7
      src/core/detect_combined_workers.c
  38. 2 2
      src/core/jobs.c
  39. 2 0
      src/core/perfmodel/perfmodel.h
  40. 2 2
      src/core/perfmodel/perfmodel_bus.c
  41. 0 6
      src/core/perfmodel/perfmodel_history.c
  42. 1 0
      src/core/perfmodel/perfmodel_nan.c
  43. 3 7
      src/core/progress_hook.c
  44. 100 7
      src/core/sched_ctx.c
  45. 23 0
      src/core/sched_ctx.h
  46. 4 27
      src/core/sched_policy.c
  47. 163 91
      src/core/task.c
  48. 3 8
      src/core/task.h
  49. 22 13
      src/core/topology.c
  50. 5 0
      src/core/workers.c
  51. 4 3
      src/core/workers.h
  52. 20 1
      src/datawizard/coherency.c
  53. 52 23
      src/datawizard/data_request.c
  54. 2 1
      src/datawizard/data_request.h
  55. 4 2
      src/datawizard/datawizard.c
  56. 3 1
      src/datawizard/filters.c
  57. 1 0
      src/datawizard/footprint.c
  58. 5 5
      src/datawizard/interfaces/block_interface.c
  59. 5 4
      src/datawizard/interfaces/data_interface.c
  60. 15 9
      src/datawizard/interfaces/matrix_interface.c
  61. 37 17
      src/datawizard/malloc.c
  62. 2 0
      src/datawizard/malloc.h
  63. 72 65
      src/datawizard/memalloc.c
  64. 2 1
      src/datawizard/memalloc.h
  65. 1 1
      src/datawizard/user_interactions.c
  66. 5 0
      src/debug/traces/starpu_fxt.c
  67. 7 3
      src/debug/traces/starpu_paje.c
  68. 2 3
      src/drivers/cpu/driver_cpu.c
  69. 7 6
      src/drivers/cuda/driver_cuda.c
  70. 1 0
      src/drivers/disk/driver_disk.c
  71. 6 5
      src/drivers/opencl/driver_opencl.c
  72. 2 1
      src/profiling/profiling.h
  73. 1 0
      src/sched_policies/eager_central_priority_policy.c
  74. 1 2
      src/sched_policies/parallel_eager.c
  75. 12 0
      src/sched_policies/parallel_heft.c
  76. 1 0
      src/sched_policies/work_stealing_policy.c
  77. 1 0
      src/util/starpu_task_insert.c
  78. 0 158
      src/util/starpu_task_list_inline.h
  79. 2 1
      tests/Makefile.am
  80. 1 0
      tests/datawizard/acquire_cb.c
  81. 1 0
      tests/datawizard/acquire_cb_insert.c
  82. 2 0
      tests/datawizard/acquire_release.c
  83. 2 0
      tests/datawizard/acquire_release2.c
  84. 3 3
      tests/datawizard/allocate.c
  85. 1 7
      tests/datawizard/commute.c
  86. 1 1
      tests/datawizard/critical_section_with_void_interface.c
  87. 2 1
      tests/datawizard/cuda_codelet_unsigned_inc.cu
  88. 1 1
      tests/datawizard/data_implicit_deps.c
  89. 1 1
      tests/datawizard/data_invalidation.c
  90. 2 1
      tests/datawizard/dining_philosophers.c
  91. 1 3
      tests/datawizard/increment_redux.c
  92. 1 3
      tests/datawizard/increment_redux_lazy.c
  93. 0 2
      tests/datawizard/increment_redux_v2.c
  94. 0 1
      tests/datawizard/interfaces/block/block_interface.c
  95. 2 0
      tests/datawizard/interfaces/multiformat/advanced/generic.c
  96. 2 2
      tests/datawizard/lazy_unregister.c
  97. 2 2
      tests/datawizard/manual_reduction.c
  98. 4 8
      tests/datawizard/mpi_like.c
  99. 16 2
      src/util/starpu_inlines.c
  100. 0 0
      tests/datawizard/readers_and_writers.c

+ 6 - 0
ChangeLog

@@ -54,6 +54,8 @@ New features:
     cudaMalloc overhead.
   * Prefetching is now done for all schedulers when it can be done whatever
     the scheduling decision.
+  * Add a watchdog which makes it easy to trigger a crash when StarPU gets
+    stuck.
 
 Small features:
   * New functions starpu_data_acquire_cb_sequential_consistency() and
@@ -89,6 +91,10 @@ Changes:
   * Function starpu_sched_ctx_create() now takes a variable argument
     list to define the scheduler to be used, and the minimum and
     maximum priority values
+  * The functions starpu_sched_set/get_min/max_priority set/get the
+    priorities of the current scheduling context, i.e. the one which
+    was set by a call to starpu_sched_ctx_set_context() or the initial
+    context if the function was not called yet.
 
 StarPU 1.1.0 (svn revision xxxx)
 ==============================================

+ 2 - 2
configure.ac

@@ -1536,8 +1536,8 @@ AC_DEFINE_UNQUOTED(STARPU_MAXNODES, [$maxnodes],
 
 
 AC_MSG_CHECKING(whether allocation cache should be used)
-AC_ARG_ENABLE(allocation-cache, [AS_HELP_STRING([--enable-allocation-cache],
-			[enable data allocation cache])],
+AC_ARG_ENABLE(allocation-cache, [AS_HELP_STRING([--disable-allocation-cache],
+			[disable data allocation cache])],
 			enable_allocation_cache=$enableval, enable_allocation_cache=yes)
 AC_MSG_RESULT($enable_allocation_cache)
 if test x$enable_allocation_cache = xyes; then

+ 5 - 0
doc/Makefile.am

@@ -28,3 +28,8 @@ EXTRA_DIST =    tutorial/hello_world.c \
 
 txtdir = ${docdir}/tutorial
 txt_DATA = $(EXTRA_DIST)
+
+showcheck:
+	for i in $(SUBDIRS) ; do \
+		make -C $$i showcheck ; \
+	done

+ 1 - 1
doc/doxygen/chapters/advanced_examples.doxy

@@ -347,7 +347,7 @@ tasks with varying size so that the regression can be computed. StarPU will not
 trust the regression unless there is at least 10% difference between the minimum
 and maximum observed input size. It can be useful to set the
 environment variable \ref STARPU_CALIBRATE to <c>1</c> and run the application
-on varying input sizes with \ref STARPU_SCHED set to <c>eager</c> scheduler,
+on varying input sizes with \ref STARPU_SCHED set to <c>dmda</c> scheduler,
 so as to feed the performance model for a variety of
 inputs. The application can also provide the measurements explicitly by
 using the function starpu_perfmodel_update_history(). The tools

+ 3 - 1
doc/doxygen/chapters/api/scheduling_contexts.doxy

@@ -124,7 +124,9 @@ Set the scheduling context the subsequent tasks will be submitted to
 
 \fn unsigned starpu_sched_ctx_get_context(void)
 \ingroup API_Scheduling_Contexts
-Return the scheduling context the tasks are currently submitted to
+Return the scheduling context the tasks are currently submitted to,
+or ::STARPU_NMAX_SCHED_CTXS if no default context has been defined
+by calling the function starpu_sched_ctx_set_context().
 
 \fn void starpu_sched_ctx_stop_task_submission(void)
 \ingroup API_Scheduling_Contexts
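
To illustrate the documented return value (this sketch is not part of the commit; the helper is hypothetical), an application can detect whether a default scheduling context was set before tagging a task with it:

#include <starpu.h>
#include <starpu_sched_ctx.h>

/* Use the current context if one was set with starpu_sched_ctx_set_context(),
 * otherwise leave the task in the initial context. */
static void submit_in_current_context(struct starpu_task *task)
{
	unsigned ctx = starpu_sched_ctx_get_context();

	if (ctx != STARPU_NMAX_SCHED_CTXS)
		task->sched_ctx = ctx; /* a default context was defined */

	starpu_task_submit(task);
}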

+ 4 - 0
doc/doxygen/chapters/api/scheduling_policy.doxy

@@ -76,6 +76,7 @@ block and wake up all workers.
 
 \fn int starpu_sched_set_min_priority(int min_prio)
 \ingroup API_Scheduling_Policy
+TODO: check if this is correct
 Defines the minimum task priority level supported by the scheduling
 policy. The default minimum priority level is the same as the default
 priority level which is 0 by convention. The application may access
@@ -86,6 +87,7 @@ application.
 
 \fn int starpu_sched_set_max_priority(int max_prio)
 \ingroup API_Scheduling_Policy
+TODO: check if this is correct
 Defines the maximum priority level supported by the scheduling policy.
 The default maximum priority level is 1. The application may access
 that value by calling the function starpu_sched_get_max_priority().
@@ -95,11 +97,13 @@ application.
 
 \fn int starpu_sched_get_min_priority(void)
 \ingroup API_Scheduling_Policy
+TODO: check if this is correct
 Returns the current minimum priority level supported by the scheduling
 policy
 
 \fn int starpu_sched_get_max_priority(void)
 \ingroup API_Scheduling_Policy
+TODO: check if this is correct
 Returns the current maximum priority level supported by the scheduling
 policy
 

+ 21 - 1
doc/doxygen/chapters/environment_variables.doxy

@@ -257,7 +257,8 @@ Disable asynchronous copies between CPU and MIC devices.
 <dd>
 \anchor STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
 \addindex __env__STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
-Enable direct CUDA transfers from GPU to GPU, without copying through RAM.
+Enable (1) or disable (0) direct CUDA transfers from GPU to GPU, without copying
+through RAM. It is enabled by default.
 This permits to test the performance effect of GPU-Direct.
 </dd>
 
@@ -548,6 +549,25 @@ When set to 0, data statistics will not be displayed at the
 end of the execution of an application (\ref DataStatistics).
 </dd>
 
+<dt>STARPU_WATCHDOG_TIMEOUT</dt>
+<dd>
+\anchor STARPU_WATCHDOG_TIMEOUT
+\addindex __env__STARPU_WATCHDOG_TIMEOUT
+When set to a value other than 0, makes StarPU print an error message
+whenever it does not terminate any task for the given amount of time
+(expressed in µs, e.g. 10000 for 10ms). Should be used in combination
+with \ref STARPU_WATCHDOG_CRASH (see \ref DetectionStuckConditions).
+</dd>
+
+<dt>STARPU_WATCHDOG_CRASH</dt>
+<dd>
+\anchor STARPU_WATCHDOG_CRASH
+\addindex __env__STARPU_WATCHDOG_CRASH
+When set to a value other than 0, triggers a crash when the watchdog
+timeout is reached, thus making it possible to catch the situation in
+gdb, etc. (see \ref DetectionStuckConditions).
+</dd>
+
 </dl>
 
 \section ConfiguringTheHypervisor Configuring The Hypervisor

+ 17 - 0
doc/doxygen/chapters/optimize_performance.doxy

@@ -382,6 +382,23 @@ Statistics on the execution can then be obtained by using <c>export
 STARPU_BUS_STATS=1</c> and <c>export STARPU_WORKER_STATS=1</c> .
  More details on performance feedback are provided by the next chapter.
 
+\section DetectionStuckConditions Detecting Stuck Conditions
+
+It may happen that for some reason StarPU does not make progress for a long
+period of time.  This is sometimes due to contention inside StarPU, but it can
+also have external causes, such as a stuck MPI or CUDA driver.
+
+<c>export STARPU_WATCHDOG_TIMEOUT=10000</c>
+
+makes StarPU print an error message whenever it does not terminate any task
+for 10ms (the timeout is expressed in µs). In addition to that,
+
+<c>export STARPU_WATCHDOG_CRASH=1</c>
+
+triggers a crash in that condition, thus making it possible to catch the
+situation in gdb, etc.
+
 \section CUDA-specificOptimizations CUDA-specific Optimizations
 
 Due to CUDA limitations, StarPU will have a hard time overlapping its own

+ 13 - 11
doc/doxygen/chapters/tips_and_tricks.doxy

@@ -15,7 +15,7 @@ may run on the machine. For instance, a C++ computation class which is not
 thread-safe by itself, but for which several instantiated objects of that class
 can be used concurrently. This can be used in StarPU by initializing one such
 object per worker. For instance, the libstarpufft example does the following to
-be able to use FFTW.
+be able to use FFTW on CPUs.
 
 Some global array stores the instantiated objects:
 
@@ -49,21 +49,23 @@ static void fft(void *descr[], void *_args)
 }
 \endcode
 
-Another way to go which may be needed is to execute some code from the workers
-themselves thanks to starpu_execute_on_each_worker(). This may be required
-by CUDA to behave properly due to threading issues. For instance, StarPU's
-starpu_cublas_init() looks like the following to call
-<c>cublasInit</c> from the workers themselves:
+This however is not sufficient for FFT on CUDA: initialization has
+to be done from the workers themselves.  This can be done thanks to
+starpu_execute_on_each_worker().  For instance libstarpufft does the following.
 
 \code{.c}
-static void init_cublas_func(void *args STARPU_ATTRIBUTE_UNUSED)
+static void fft_plan_gpu(void *args)
 {
-    cublasStatus cublasst = cublasInit();
-    cublasSetKernelStream(starpu_cuda_get_local_stream());
+    plan plan = args;
+    int n2 = plan->n2[0];
+    int workerid = starpu_worker_get_id();
+
+    cufftPlan1d(&plan->plans[workerid].plan_cuda, n2, _CUFFT_C2C, 1);
+    cufftSetStream(plan->plans[workerid].plan_cuda, starpu_cuda_get_local_stream());
 }
-void starpu_cublas_init(void)
+void starpufft_plan(void)
 {
-    starpu_execute_on_each_worker(init_cublas_func, NULL, STARPU_CUDA);
+    starpu_execute_on_each_worker(fft_plan_gpu, plan, STARPU_CUDA);
 }
 \endcode
 

+ 0 - 3
include/starpu.h

@@ -53,9 +53,6 @@ typedef UINT_PTR uintptr_t;
 #include <starpu_worker.h>
 #include <starpu_task.h>
 #include <starpu_task_list.h>
-#ifdef BUILDING_STARPU
-#include <util/starpu_task_list_inline.h>
-#endif
 #include <starpu_task_util.h>
 #include <starpu_sched_ctx.h>
 #include <starpu_expert.h>

+ 2 - 0
include/starpu_bitmap.h

@@ -44,4 +44,6 @@ int starpu_bitmap_first(struct starpu_bitmap *);
 int starpu_bitmap_last(struct starpu_bitmap *);
 //return the index of bit right after e, -1 if none
 int starpu_bitmap_next(struct starpu_bitmap *, int e);
+int starpu_bitmap_has_next(struct starpu_bitmap * b, int e);
+
 #endif
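
For reference, a minimal iteration sketch over such a bitmap (not part of the commit; it assumes starpu_bitmap_first() also returns -1 on an empty bitmap, consistent with starpu_bitmap_next()):

#include <starpu_bitmap.h>

static void visit_set_bits(struct starpu_bitmap *b)
{
	int e;
	/* starpu_bitmap_next() returns the bit right after e, or -1 if none. */
	for (e = starpu_bitmap_first(b); e != -1; e = starpu_bitmap_next(b, e))
	{
		/* ... work on bit e ... */
	}
}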

+ 2 - 1
include/starpu_sched_ctx_hypervisor.h

@@ -29,7 +29,8 @@ struct starpu_sched_ctx_performance_counters
 	void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time);
 	void (*notify_poped_task)(unsigned sched_ctx_id, int worker);
 	void (*notify_pushed_task)(unsigned sched_ctx_id, int worker);
-	void (*notify_post_exec_task)(struct starpu_task *task, size_t data_size, uint32_t footprint, int hypervisor_tag);
+	void (*notify_post_exec_task)(struct starpu_task *task, size_t data_size, uint32_t footprint, int hypervisor_tag,
+				      int nready_tasks, double nready_flops);
 	void (*notify_submitted_job)(struct starpu_task *task, uint32_t footprint, size_t data_size);
 	void (*notify_ready_task)(unsigned sched_ctx_id, struct starpu_task *task);
 	void (*notify_empty_ctx)(unsigned sched_ctx_id, struct starpu_task *task);

+ 128 - 18
include/starpu_task_list.h

@@ -18,6 +18,7 @@
 #define __STARPU_TASK_LIST_H__
 
 #include <starpu_task.h>
+#include <starpu_util.h>
 
 #ifdef __cplusplus
 extern "C"
@@ -30,25 +31,134 @@ struct starpu_task_list
 	struct starpu_task *tail;
 };
 
-/* If we are building starpu and not using gnu inline, we have to avoid
- * declaring the functions extern, as in that case the compiler will compile
- * the inline into all .o files! */
-#if !defined(BUILDING_STARPU) || defined(__GNUC_GNU_INLINE__)
-
-void starpu_task_list_init(struct starpu_task_list *list);
-void starpu_task_list_push_front(struct starpu_task_list *list, struct starpu_task *task);
-void starpu_task_list_push_back(struct starpu_task_list *list, struct starpu_task *task);
-struct starpu_task *starpu_task_list_front(struct starpu_task_list *list);
-struct starpu_task *starpu_task_list_back(struct starpu_task_list *list);
-int starpu_task_list_empty(struct starpu_task_list *list);
-void starpu_task_list_erase(struct starpu_task_list *list, struct starpu_task *task);
-struct starpu_task *starpu_task_list_pop_front(struct starpu_task_list *list);
-struct starpu_task *starpu_task_list_pop_back(struct starpu_task_list *list);
-struct starpu_task *starpu_task_list_begin(struct starpu_task_list *list);
-struct starpu_task *starpu_task_list_end(struct starpu_task_list *list);
-struct starpu_task *starpu_task_list_next(struct starpu_task *task);
+static STARPU_INLINE
+void starpu_task_list_init(struct starpu_task_list *list)
+{
+	list->head = NULL;
+	list->tail = NULL;
+}
 
-#endif
+static STARPU_INLINE
+void starpu_task_list_push_front(struct starpu_task_list *list,
+				struct starpu_task *task)
+{
+	if (list->tail == NULL)
+	{
+		list->tail = task;
+	}
+	else
+	{
+		list->head->prev = task;
+	}
+
+	task->prev = NULL;
+	task->next = list->head;
+	list->head = task;
+}
+
+static STARPU_INLINE
+void starpu_task_list_push_back(struct starpu_task_list *list,
+				struct starpu_task *task)
+{
+	if (list->head == NULL)
+	{
+		list->head = task;
+	}
+	else
+	{
+		list->tail->next = task;
+	}
+
+	task->next = NULL;
+	task->prev = list->tail;
+	list->tail = task;
+}
+
+static STARPU_INLINE
+struct starpu_task *starpu_task_list_front(struct starpu_task_list *list)
+{
+	return list->head;
+}
+
+static STARPU_INLINE
+struct starpu_task *starpu_task_list_back(struct starpu_task_list *list)
+{
+	return list->tail;
+}
+
+static STARPU_INLINE
+int starpu_task_list_empty(struct starpu_task_list *list)
+{
+	return (list->head == NULL);
+}
+
+static STARPU_INLINE
+void starpu_task_list_erase(struct starpu_task_list *list,
+				struct starpu_task *task)
+{
+	struct starpu_task *p = task->prev;
+
+	if (p)
+	{
+		p->next = task->next;
+	}
+	else
+	{
+		list->head = task->next;
+	}
+
+	if (task->next)
+	{
+		task->next->prev = p;
+	}
+	else
+	{
+		list->tail = p;
+	}
+
+	task->prev = NULL;
+	task->next = NULL;
+}
+
+static STARPU_INLINE
+struct starpu_task *starpu_task_list_pop_front(struct starpu_task_list *list)
+{
+	struct starpu_task *task = list->head;
+
+	if (task)
+		starpu_task_list_erase(list, task);
+
+	return task;
+}
+
+static STARPU_INLINE
+struct starpu_task *starpu_task_list_pop_back(struct starpu_task_list *list)
+{
+	struct starpu_task *task = list->tail;
+
+	if (task)
+		starpu_task_list_erase(list, task);
+
+	return task;
+}
+
+static STARPU_INLINE
+struct starpu_task *starpu_task_list_begin(struct starpu_task_list *list)
+{
+	return list->head;
+}
+
+static STARPU_INLINE
+struct starpu_task *starpu_task_list_end(struct starpu_task_list *list STARPU_ATTRIBUTE_UNUSED)
+{
+	return NULL;
+}
+
+static STARPU_INLINE
+struct starpu_task *starpu_task_list_next(struct starpu_task *task)
+{
+	return task->next;
+}
 
 #ifdef __cplusplus
 }
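
For reference, a minimal usage sketch of the inlined list API above (not part of the commit; the codelet passed in is assumed to be defined by the application):

#include <starpu.h>
#include <starpu_task_list.h>

static void run_batch(struct starpu_codelet *cl, int n)
{
	struct starpu_task_list list;
	struct starpu_task *task;
	int i;

	starpu_task_list_init(&list);

	/* Build a private FIFO of tasks before handing them to StarPU. */
	for (i = 0; i < n; i++)
	{
		task = starpu_task_create();
		task->cl = cl;
		starpu_task_list_push_back(&list, task);
	}

	/* Drain the list in FIFO order and submit each task. */
	while (!starpu_task_list_empty(&list))
	{
		task = starpu_task_list_pop_front(&list);
		starpu_task_submit(task);
	}
}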

+ 4 - 0
include/starpu_thread.h

@@ -160,7 +160,9 @@ typedef int starpu_pthread_rwlockattr_t;
 int starpu_pthread_rwlock_init(starpu_pthread_rwlock_t *rwlock, const starpu_pthread_rwlockattr_t *attr);
 int starpu_pthread_rwlock_destroy(starpu_pthread_rwlock_t *rwlock);
 int starpu_pthread_rwlock_rdlock(starpu_pthread_rwlock_t *rwlock);
+int starpu_pthread_rwlock_tryrdlock(starpu_pthread_rwlock_t *rwlock);
 int starpu_pthread_rwlock_wrlock(starpu_pthread_rwlock_t *rwlock);
+int starpu_pthread_rwlock_trywrlock(starpu_pthread_rwlock_t *rwlock);
 int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock);
 
 #elif !defined(_MSC_VER) /* STARPU_SIMGRID */
@@ -172,7 +174,9 @@ typedef pthread_rwlockattr_t starpu_pthread_rwlockattr_t;
 #define starpu_pthread_rwlock_destroy pthread_rwlock_destroy
 
 int starpu_pthread_rwlock_rdlock(starpu_pthread_rwlock_t *rwlock);
+int starpu_pthread_rwlock_tryrdlock(starpu_pthread_rwlock_t *rwlock);
 int starpu_pthread_rwlock_wrlock(starpu_pthread_rwlock_t *rwlock);
+int starpu_pthread_rwlock_trywrlock(starpu_pthread_rwlock_t *rwlock);
 int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock);
 
 #endif /* STARPU_SIMGRID, _MSC_VER */

+ 47 - 1
include/starpu_thread_util.h

@@ -18,7 +18,8 @@
 #ifndef __STARPU_THREAD_UTIL_H__
 #define __STARPU_THREAD_UTIL_H__
 
-#include <starpu.h>
+#include <starpu_util.h>
+#include <errno.h>
 
 /*
  * Encapsulation of the starpu_pthread_create_* functions.
@@ -78,6 +79,21 @@
 	}                                                                      \
 } while (0)
 
+#define STARPU_PTHREAD_MUTEX_TRYLOCK(mutex) \
+	_STARPU_PTHREAD_MUTEX_TRYLOCK(mutex, __FILE__, __LINE__)
+static STARPU_INLINE
+int _STARPU_PTHREAD_MUTEX_TRYLOCK(starpu_pthread_mutex_t *mutex, char *file, int line)
+{
+	int p_ret = starpu_pthread_mutex_trylock(mutex);
+	if (STARPU_UNLIKELY(p_ret != 0 && p_ret != EBUSY)) {
+		fprintf(stderr,
+			"%s:%d starpu_pthread_mutex_trylock: %s\n",
+			file, line, strerror(p_ret));
+		STARPU_ABORT();
+	}
+	return p_ret;
+}
+
 #define STARPU_PTHREAD_MUTEX_UNLOCK(mutex) do {                               \
 	int p_ret = starpu_pthread_mutex_unlock(mutex);                        \
 	if (STARPU_UNLIKELY(p_ret)) {                                          \
@@ -143,6 +159,21 @@
 	}                                                                      \
 } while (0)
 
+#define STARPU_PTHREAD_RWLOCK_TRYRDLOCK(rwlock) \
+	_starpu_pthread_rwlock_tryrdlock(rwlock, __FILE__, __LINE__)
+static STARPU_INLINE
+int _starpu_pthread_rwlock_tryrdlock(starpu_pthread_rwlock_t *rwlock, char *file, int line)
+{
+	int p_ret = starpu_pthread_rwlock_tryrdlock(rwlock);
+	if (STARPU_UNLIKELY(p_ret != 0 && p_ret != EBUSY)) {
+		fprintf(stderr,
+			"%s:%d starpu_pthread_rwlock_tryrdlock: %s\n",
+			file, line, strerror(p_ret));
+		STARPU_ABORT();
+	}
+	return p_ret;
+}
+
 #define STARPU_PTHREAD_RWLOCK_WRLOCK(rwlock) do {                              \
 	int p_ret = starpu_pthread_rwlock_wrlock(rwlock);                      \
 	if (STARPU_UNLIKELY(p_ret)) {                                          \
@@ -153,6 +184,21 @@
 	}                                                                      \
 } while (0)
 
+#define STARPU_PTHREAD_RWLOCK_TRYWRLOCK(rwlock) \
+	_starpu_pthread_rwlock_trywrlock(rwlock, __FILE__, __LINE__)
+static STARPU_INLINE
+int _starpu_pthread_rwlock_trywrlock(starpu_pthread_rwlock_t *rwlock, char *file, int line)
+{
+	int p_ret = starpu_pthread_rwlock_trywrlock(rwlock);
+	if (STARPU_UNLIKELY(p_ret != 0 && p_ret != EBUSY)) {
+		fprintf(stderr,
+			"%s:%d starpu_pthread_rwlock_trywrlock: %s\n",
+			file, line, strerror(p_ret));
+		STARPU_ABORT();
+	}
+	return p_ret;
+}
+
 #define STARPU_PTHREAD_RWLOCK_UNLOCK(rwlock) do {                              \
 	int p_ret = starpu_pthread_rwlock_unlock(rwlock);                      \
 	if (STARPU_UNLIKELY(p_ret)) {                                          \
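
For illustration (not part of the commit), the new try-lock wrappers return 0 on success or EBUSY when the lock is already held, and abort on any other error; a minimal sketch, assuming a mutex initialized elsewhere with STARPU_PTHREAD_MUTEX_INITIALIZER:

#include <errno.h>
#include <starpu_thread_util.h>

static starpu_pthread_mutex_t stats_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;

static void try_update_stats(void)
{
	/* Returns 0 when the lock was taken, EBUSY when somebody else holds it. */
	if (STARPU_PTHREAD_MUTEX_TRYLOCK(&stats_mutex) == EBUSY)
		return; /* skip the update rather than blocking */

	/* ... update shared counters here ... */

	STARPU_PTHREAD_MUTEX_UNLOCK(&stats_mutex);
}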

+ 10 - 0
include/starpu_util.h

@@ -57,6 +57,16 @@ extern "C"
 #  define STARPU_ATTRIBUTE_ALIGNED(size)
 #endif
 
+/* Note that if we're compiling C++, then just use the "inline"
+   keyword, since it's part of C++ */
+#if defined(c_plusplus) || defined(__cplusplus)
+#  define STARPU_INLINE inline
+#elif defined(_MSC_VER) || defined(__HP_cc)
+#  define STARPU_INLINE __inline
+#else
+#  define STARPU_INLINE __inline__
+#endif
+
 #if STARPU_GNUC_PREREQ(3, 1) && !defined(BUILDING_STARPU) && !defined(STARPU_USE_DEPRECATED_API) && !defined(STARPU_USE_DEPRECATED_ONE_ZERO_API)
 #define STARPU_DEPRECATED  __attribute__((__deprecated__))
 #else

+ 19 - 18
mpi/src/starpu_mpi.c

@@ -25,6 +25,7 @@
 #include <common/config.h>
 #include <common/config.h>
 #include <common/thread.h>
 
 static void _starpu_mpi_submit_new_mpi_request(void *arg);
 static void _starpu_mpi_submit_new_mpi_request(void *arg);
 static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req);
 #ifdef STARPU_VERBOSE
 #endif //STARPU_USE_FXT
 #endif //STARPU_USE_FXT
 	}
 
+	_starpu_mpi_comm_amounts_init(MPI_COMM_WORLD);
+	_starpu_mpi_cache_init(MPI_COMM_WORLD);
+
+	{
+		int nb_nodes, k;
+		MPI_Comm_size(MPI_COMM_WORLD, &nb_nodes);
+		_starpu_mpi_app_req_hashmap = malloc(nb_nodes * sizeof(struct _starpu_mpi_req *));
+		for(k=0 ; k<nb_nodes ; k++) _starpu_mpi_app_req_hashmap[k] = NULL;
+		_starpu_mpi_copy_handle_hashmap = malloc(nb_nodes * sizeof(struct _starpu_mpi_copy_handle_hash_list *));
+		for(k=0 ; k<nb_nodes ; k++) _starpu_mpi_copy_handle_hashmap[k] = NULL;
+	}
+
 	/* notify the main thread that the progression thread is ready */
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	running = 1;
@@ -1534,30 +1548,17 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi)
 	argc_argv->argc = argc;
 	argc_argv->argv = argv;
 
-	STARPU_PTHREAD_CREATE(&progress_thread, NULL, _starpu_mpi_progress_thread_func, argc_argv);
-
-	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
-	while (!running)
-		STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-
 #ifdef STARPU_MPI_ACTIVITY
 	hookid = starpu_progression_hook_register(progression_hook_func, NULL);
 	STARPU_ASSERT_MSG(hookid >= 0, "starpu_progression_hook_register failed");
 #endif /* STARPU_MPI_ACTIVITY */
 
-	_starpu_mpi_add_sync_point_in_fxt();
-	_starpu_mpi_comm_amounts_init(MPI_COMM_WORLD);
-	_starpu_mpi_cache_init(MPI_COMM_WORLD);
+	STARPU_PTHREAD_CREATE(&progress_thread, NULL, _starpu_mpi_progress_thread_func, argc_argv);
 
-	{
-		int nb_nodes, k;
-		MPI_Comm_size(MPI_COMM_WORLD, &nb_nodes);
-		_starpu_mpi_app_req_hashmap = malloc(nb_nodes * sizeof(struct _starpu_mpi_req *));
-		for(k=0 ; k<nb_nodes ; k++) _starpu_mpi_app_req_hashmap[k] = NULL;
-		_starpu_mpi_copy_handle_hashmap = malloc(nb_nodes * sizeof(struct _starpu_mpi_copy_handle_hash_list *));
-		for(k=0 ; k<nb_nodes ; k++) _starpu_mpi_copy_handle_hashmap[k] = NULL;
-	}
+	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	while (!running)
+		STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
 	return 0;
 }

+ 1 - 0
mpi/src/starpu_mpi_collective.c

@@ -27,6 +27,7 @@ struct _callback_arg
 	int count;
 };
 
+static
 void _callback_collective(void *arg)
 {
 	struct _callback_arg *callback_arg = arg;

+ 7 - 0
mpi/src/starpu_mpi_task_insert.c

@@ -27,6 +27,7 @@
 #include <core/task.h>
 
 #include <starpu_mpi_private.h>
+#include <starpu_mpi_task_insert.h>
 
 /* Whether we are allowed to keep copies of remote data. */
 struct _starpu_data_entry
@@ -64,6 +65,7 @@ void _starpu_mpi_cache_init(MPI_Comm comm)
 	for(i=0 ; i<nb_nodes ; i++) _cache_received_data[i] = NULL;
 }
 
+static
 void _starpu_mpi_cache_empty_tables(int world_size)
 {
 	int i;
@@ -308,6 +310,7 @@ void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum s
 	}
 }
 
+static
 void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int do_execute, MPI_Comm comm)
 {
 	if (_cache_enabled)
@@ -363,6 +366,7 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
 	}
 }
 
+static
 int _starpu_mpi_task_insert_v(MPI_Comm comm, struct starpu_codelet *codelet, va_list varg_list)
 {
 	int arg_type;
@@ -857,6 +861,7 @@ void _starpu_mpi_redux_data_dummy_func(STARPU_ATTRIBUTE_UNUSED void *buffers[],
 {
 }
 
+static
 struct starpu_codelet _starpu_mpi_redux_data_read_cl =
 {
 	.cpu_funcs = {_starpu_mpi_redux_data_dummy_func, NULL},
@@ -877,6 +882,7 @@ struct starpu_codelet _starpu_mpi_redux_data_readwrite_cl =
 	.name = "_starpu_mpi_redux_data_write_cl"
 };
 
+static
 void _starpu_mpi_redux_data_detached_callback(void *arg)
 {
 	struct _starpu_mpi_redux_data_args *args = (struct _starpu_mpi_redux_data_args *) arg;
@@ -889,6 +895,7 @@ void _starpu_mpi_redux_data_detached_callback(void *arg)
 	free(args);
 }
 
+static
 void _starpu_mpi_redux_data_recv_callback(void *callback_arg)
 {
 	struct _starpu_mpi_redux_data_args *args = (struct _starpu_mpi_redux_data_args *) callback_arg;

+ 8 - 0
sc_hypervisor/include/sc_hypervisor_monitoring.h

@@ -47,6 +47,10 @@ struct sc_hypervisor_wrapper
 	/* user configuration meant to limit resizing */
 	struct sc_hypervisor_policy_config *config;
 
+
+	/* the start time of the resizing sample of the workers of this context*/
+	double start_time_w[STARPU_NMAXWORKERS];
+
 	/* idle time of workers in this context */
 	double current_idle_time[STARPU_NMAXWORKERS];
 
@@ -121,6 +125,10 @@ struct sc_hypervisor_wrapper
 
 	/* boolean indicating that a context is being sized */
 	unsigned to_be_sized;
+
+	/* boolean indicating if we add the idle of this worker to 
+	   the idle of the context */
+	unsigned compute_idle[STARPU_NMAXWORKERS];
 };
 
 /* return the wrapper of context that saves its monitoring information */

+ 2 - 2
sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c

@@ -24,8 +24,8 @@ int resize_no = 0;
 static void _try_resizing(unsigned *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
 {
 	/* for vite */
-	printf("resize_no = %d\n", resize_no);
-	starpu_trace_user_event(resize_no++);
+/* 	printf("resize_no = %d\n", resize_no); */
+/* 	starpu_trace_user_event(resize_no++); */
 	int ns = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : nsched_ctxs;
 	unsigned *curr_sched_ctxs = sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : sched_ctxs;
 	unsigned curr_nworkers = nworkers == -1 ? starpu_worker_get_count() : (unsigned)nworkers;

+ 9 - 2
sc_hypervisor/src/policies_utils/lp_tools.c

@@ -729,13 +729,20 @@ void sc_hypervisor_lp_share_remaining_resources(int ns, unsigned *sched_ctxs,  i
 			}
 		}
 		if(!found)
+		{
 			remaining_workers[nw++] = worker;
+		}
 	}
 
 	if(nw > 0)
+	{
 		for(s = 0; s < ns; s++)
-			sc_hypervisor_add_workers_to_sched_ctx(remaining_workers, nw, sched_ctxs[s]);		
-
+		{
+			for(w = 0; w < nw; w++)
+				_sc_hypervisor_allow_compute_idle(sched_ctxs[s], remaining_workers[w], 0);
+			sc_hypervisor_add_workers_to_sched_ctx(remaining_workers, nw, sched_ctxs[s]);
+		}		
+	}
 }
 
 double sc_hypervisor_lp_find_tmax(double t1, double t2)

+ 1 - 1
sc_hypervisor/src/policies_utils/speed.c

@@ -138,7 +138,7 @@ double sc_hypervisor_get_speed_per_worker_type(struct sc_hypervisor_wrapper* sc_
 		{
 			worker = workers->get_next(workers, &it);
 			enum starpu_worker_archtype req_arch = starpu_worker_get_type(worker);
-			if(arch == req_arch)
+			if(arch == req_arch && sc_w->compute_idle[worker])
 			{
 				all_workers_flops += sc_w->elapsed_flops[worker] / 1000000000.0; /*in gflops */
 				if(max_workers_idle_time < sc_w->idle_time[worker])

+ 112 - 100
sc_hypervisor/src/sc_hypervisor.c

@@ -24,7 +24,8 @@ struct starpu_sched_ctx_performance_counters* perf_counters = NULL;
 
 static void notify_idle_cycle(unsigned sched_ctx, int worker, double idle_time);
 static void notify_pushed_task(unsigned sched_ctx, int worker);
-static void notify_post_exec_task(struct starpu_task *task, size_t data_size, uint32_t footprint, int hypervisor_tag);
+static void notify_post_exec_task(struct starpu_task *task, size_t data_size, uint32_t footprint, 
+				  int hypervisor_tag, int nready_tasks, double ready_flops);
 static void notify_poped_task(unsigned sched_ctx, int  worker);
 static void notify_submitted_job(struct starpu_task *task, unsigned footprint, size_t data_size);
 static void notify_ready_task(unsigned sched_ctx, struct starpu_task *task);
@@ -196,6 +197,7 @@ void* sc_hypervisor_init(struct sc_hypervisor_policy *hypervisor_policy)
 		int j;
 		for(j = 0; j < STARPU_NMAXWORKERS; j++)
 		{
+			hypervisor.sched_ctx_w[i].start_time_w[j] = 0.0;
 			hypervisor.sched_ctx_w[i].current_idle_time[j] = 0.0;
 			hypervisor.sched_ctx_w[i].idle_time[j] = 0.0;
 			hypervisor.sched_ctx_w[i].idle_start_time[j] = 0.0;
@@ -208,6 +210,7 @@ void* sc_hypervisor_init(struct sc_hypervisor_policy *hypervisor_policy)
 			hypervisor.sched_ctx_w[i].elapsed_tasks[j] = 0;
 			hypervisor.sched_ctx_w[i].total_elapsed_flops[j] = 0.0;
 			hypervisor.sched_ctx_w[i].worker_to_be_removed[j] = 0;
+			hypervisor.sched_ctx_w[i].compute_idle[j] = 1;
 		}
 	}
 
@@ -472,17 +475,6 @@ double sc_hypervisor_get_total_elapsed_flops_per_sched_ctx(struct sc_hypervisor_
 	return ret_val;
 }
 
-static void _reset_idle_time(unsigned sched_ctx)
-{
-	int i;
-	for(i = 0; i < STARPU_NMAXWORKERS; i++)
-	{
-		hypervisor.sched_ctx_w[sched_ctx].idle_time[i] = 0.0;
-		hypervisor.sched_ctx_w[sched_ctx].idle_start_time[i] = hypervisor.sched_ctx_w[sched_ctx].idle_start_time[i] != 0.0 ? starpu_timing_now() : 0.0;
-	}
-	return;
-}
-
 void _reset_resize_sample_info(unsigned sender_sched_ctx, unsigned receiver_sched_ctx)
 {
 	double start_time =  starpu_timing_now();
@@ -493,32 +485,32 @@ void _reset_resize_sample_info(unsigned sender_sched_ctx, unsigned receiver_sche
 		
 		sender_sc_w->start_time = start_time;
 		_set_elapsed_flops_per_sched_ctx(sender_sched_ctx, 0.0);
-		_reset_idle_time(sender_sched_ctx);
 		int i;
 		for(i = 0; i < STARPU_NMAXWORKERS; i++)
 		{
+			sender_sc_w->start_time_w[i] = start_time;
+			sender_sc_w->idle_time[i] = 0.0;
 			sender_sc_w->idle_start_time[i] = 0.0;
-			hypervisor.sched_ctx_w[sender_sched_ctx].exec_start_time[i] = 0.0;
 			hypervisor.sched_ctx_w[sender_sched_ctx].exec_time[i] = 0.0;
+			hypervisor.sched_ctx_w[sender_sched_ctx].exec_start_time[i] = (hypervisor.sched_ctx_w[sender_sched_ctx].exec_start_time[i] != 0.0) ? starpu_timing_now() : 0.0;
 		}
 		
 	}
 
 	if(receiver_sched_ctx != STARPU_NMAX_SCHED_CTXS)
 	{
-
 		struct sc_hypervisor_wrapper *receiver_sc_w = &hypervisor.sched_ctx_w[receiver_sched_ctx];
 		
 		receiver_sc_w->start_time = start_time;
 		_set_elapsed_flops_per_sched_ctx(receiver_sched_ctx, 0.0);
-		_reset_idle_time(receiver_sched_ctx);
 		int i;
 		for(i = 0; i < STARPU_NMAXWORKERS; i++)
 		{
-			receiver_sc_w->idle_start_time[i] = (hypervisor.sched_ctx_w[receiver_sched_ctx].idle_start_time[i] != 0.0) ? starpu_timing_now() : 0.0;
-			hypervisor.sched_ctx_w[receiver_sched_ctx].exec_start_time[i] = 0.0;
+			receiver_sc_w->start_time_w[i] = (receiver_sc_w->start_time_w[i] != 0.0) ? starpu_timing_now() : 0.0;
+			receiver_sc_w->idle_time[i] = 0.0;
+			receiver_sc_w->idle_start_time[i] = (receiver_sc_w->exec_start_time[i] != 0.0) ? 0.0 : starpu_timing_now();
+			hypervisor.sched_ctx_w[receiver_sched_ctx].exec_start_time[i] = (receiver_sc_w->exec_start_time[i] != 0.0) ? starpu_timing_now() : 0.0;
 			hypervisor.sched_ctx_w[receiver_sched_ctx].exec_time[i] = 0.0;
-
 		}
 
 	}
 		hypervisor.policy.resize_ctxs(sched_ctxs, nsched_ctxs, workers, nworkers);
 		hypervisor.policy.resize_ctxs(sched_ctxs, nsched_ctxs, workers, nworkers);
 }
 
+{
+	hypervisor.sched_ctx_w[sched_ctx].compute_idle[worker] = allow;
+}
+
 void sc_hypervisor_update_resize_interval(unsigned *sched_ctxs, int nsched_ctxs)
 void sc_hypervisor_update_resize_interval(unsigned *sched_ctxs, int nsched_ctxs)
 {
 	unsigned sched_ctx;
 		if(workers->init_iterator)
 		if(workers->init_iterator)
 			workers->init_iterator(workers, &it);
 		
-		int nshared_workers = 0;
-		double cpu_used_in_shared = 0.0;
-		double exec_time = 0.0;
+		double elapsed_time_worker[STARPU_NMAXWORKERS];
+		double norm_idle_time = 0.0;
+		double end_time  = starpu_timing_now();
 		while(workers->has_next(workers, &it))
 		while(workers->has_next(workers, &it))
 		{
+			double idle_time = 0.0;
 			worker = workers->get_next(workers, &it);
+			if(hypervisor.sched_ctx_w[sched_ctx].compute_idle[worker])
+			{
+				if(hypervisor.sched_ctx_w[sched_ctx].start_time_w[worker] == 0.0)
+					elapsed_time_worker[worker] = 0.0;
+				else
+					elapsed_time_worker[worker] = (end_time - hypervisor.sched_ctx_w[sched_ctx].start_time_w[worker]) / 1000000.0;
+				
+				if(hypervisor.sched_ctx_w[sched_ctx].idle_start_time[worker] == 0.0)
+				{
+					idle_time = hypervisor.sched_ctx_w[sched_ctx].idle_time[worker]; /* in seconds */
+				}
+				else
+				{
+					double idle = (end_time - hypervisor.sched_ctx_w[sched_ctx].idle_start_time[worker]) / 1000000.0; /* in seconds */ 
+					idle_time = hypervisor.sched_ctx_w[sched_ctx].idle_time[worker] + idle;
+				}		
+				norm_idle_time += (elapsed_time_worker[worker] == 0.0 ? 0.0 : (idle_time / elapsed_time_worker[worker]));
+/* 				printf("%d/%d: start time %lf elapsed time %lf idle time %lf norm_idle_time %lf \n",  */
+/* 				       worker, sched_ctx, hypervisor.sched_ctx_w[sched_ctx].start_time_w[worker], elapsed_time_worker[worker], idle_time, norm_idle_time); */
+			}
+		}
+
+		double norm_exec_time = 0.0;
+		for(worker = 0; worker < STARPU_NMAXWORKERS; worker++)
+		{
+			double exec_time = 0.0;
+			if(hypervisor.sched_ctx_w[sched_ctx].start_time_w[worker] == 0.0)
+				elapsed_time_worker[worker] = 0.0;
+			else
+				elapsed_time_worker[worker] = (end_time - hypervisor.sched_ctx_w[sched_ctx].start_time_w[worker]) / 1000000.0;
+
+			if(hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker] == 0.0)
 			{
 			{
+				exec_time = hypervisor.sched_ctx_w[sched_ctx].exec_time[worker];
+//				printf("%d/%d: exec_time %lf\n", worker, sched_ctx, hypervisor.sched_ctx_w[sched_ctx].exec_time[worker]);
 			}
 			}
 			else
 			{
-				double idle = (end_time - hypervisor.sched_ctx_w[sched_ctx].idle_start_time[worker]) / 1000000.0; /* in seconds */ 
-				max_workers_idle_time[i] += hypervisor.sched_ctx_w[sched_ctx].idle_time[worker] + idle;
+				double current_exec_time = (end_time - hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker]) / 1000000.0; /* in seconds */ 
+				exec_time = hypervisor.sched_ctx_w[sched_ctx].exec_time[worker] + current_exec_time;
+//				printf("%d/%d: exec_time %lf current_exec_time %lf\n", worker, sched_ctx, hypervisor.sched_ctx_w[sched_ctx].exec_time[worker], current_exec_time);
 			}		
 			}		
+			norm_exec_time += elapsed_time_worker[worker] == 0.0 ? 0.0 : exec_time / elapsed_time_worker[worker];
 		}			
 		}			
 
-		
 		double curr_time = starpu_timing_now();
 		double elapsed_time = (curr_time - hypervisor.sched_ctx_w[sched_ctx].start_time) / 1000000.0; /* in seconds */
-		double norm_idle_time = max_workers_idle_time[i] / elapsed_time;
-		double norm_exec_time = exec_time / elapsed_time;
-
+//		double norm_idle_time = max_workers_idle_time[i] / elapsed_time;
+//		double norm_exec_time = exec_time / elapsed_time;
 		if(norm_idle_time >= 0.9)
 		{
//			config->max_nworkers = 	workers->nworkers - lrint(norm_idle_time);
@@ -860,8 +887,8 @@ void sc_hypervisor_update_resize_interval(unsigned *sched_ctxs, int nsched_ctxs)
 		{
 			if(norm_idle_time < 0.1)//(max_workers_idle_time[i] < 0.000001)
 				config->max_nworkers = lrint(norm_exec_time)  + hypervisor.sched_ctx_w[sched_ctx].nready_tasks - 1; //workers->nworkers + hypervisor.sched_ctx_w[sched_ctx].nready_tasks - 1;
-/* 			else */
-/* 				config->max_nworkers = workers->nworkers; */
+			else
+				config->max_nworkers = lrint(norm_exec_time);
 		}
 		
 		if(config->max_nworkers < 0)
@@ -897,26 +924,31 @@ void sc_hypervisor_update_resize_interval(unsigned *sched_ctxs, int nsched_ctxs)
 		printf("%d: redib max_nworkers incr %d \n",  max_nready_sched_ctx, config->max_nworkers);
 	}
 }
-/* notifies the hypervisor that the worker is no longer idle and a new task was pushed on its queue */
-static void notify_poped_task(unsigned sched_ctx, int worker)
+
+/* notifies the hypervisor that a new task was pushed on the queue of the worker */
+static void notify_pushed_task(unsigned sched_ctx, int worker)
 {
-	hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker] = starpu_timing_now();
+	hypervisor.sched_ctx_w[sched_ctx].pushed_tasks[worker]++;
+	if(hypervisor.sched_ctx_w[sched_ctx].total_flops != 0.0 && hypervisor.sched_ctx_w[sched_ctx].start_time == 0.0)
+		hypervisor.sched_ctx_w[sched_ctx].start_time = starpu_timing_now();
 
-	if(hypervisor.resize[sched_ctx])
-		hypervisor.sched_ctx_w[sched_ctx].current_idle_time[worker] = 0.0;
+	if(hypervisor.sched_ctx_w[sched_ctx].total_flops != 0.0 && hypervisor.sched_ctx_w[sched_ctx].start_time_w[worker] == 0.0)
+		hypervisor.sched_ctx_w[sched_ctx].start_time_w[worker] = starpu_timing_now();
 
-	struct sc_hypervisor_wrapper *sc_w = &hypervisor.sched_ctx_w[sched_ctx];
+	if(hypervisor.sched_ctx_w[sched_ctx].total_flops != 0.0 && hypervisor.sched_ctx_w[sched_ctx].real_start_time == 0.0)
+		hypervisor.sched_ctx_w[sched_ctx].real_start_time = starpu_timing_now();
+
+	int ntasks = get_ntasks(hypervisor.sched_ctx_w[sched_ctx].pushed_tasks);
 
-	if(sc_w->idle_start_time[worker] != 0.0)
+	if((hypervisor.min_tasks == 0 || (!(hypervisor.resize[sched_ctx] == 0 && imposed_resize) && ntasks == hypervisor.min_tasks)) && hypervisor.check_min_tasks[sched_ctx])
 	{
-		double end_time  = starpu_timing_now();
-		sc_w->idle_time[worker] += (end_time - sc_w->idle_start_time[worker]) / 1000000.0; /* in seconds */ 
-		sc_w->idle_start_time[worker] = 0.0;
+		hypervisor.resize[sched_ctx] = 1;
+		if(imposed_resize) imposed_resize = 0;
+		hypervisor.check_min_tasks[sched_ctx] = 0;
 	}
-			
-	if(hypervisor.policy.handle_idle_end)
-		hypervisor.policy.handle_idle_end(sched_ctx, worker);
 
+	if(hypervisor.policy.handle_pushed_task)
+		hypervisor.policy.handle_pushed_task(sched_ctx, worker);
 }
 
 /* notifies the hypervisor that the worker spent another cycle in idle time */
@@ -938,36 +970,46 @@ static void notify_idle_cycle(unsigned sched_ctx, int worker, double idle_time)
 	return;
 }
 
-/* notifies the hypervisor that a new task was pushed on the queue of the worker */
-static void notify_pushed_task(unsigned sched_ctx, int worker)
+/* notifies the hypervisor that the worker is no longer idle and a new task was pushed on its queue */
+static void notify_poped_task(unsigned sched_ctx, int worker)
 {
-	hypervisor.sched_ctx_w[sched_ctx].pushed_tasks[worker]++;
-	if(hypervisor.sched_ctx_w[sched_ctx].total_flops != 0.0 && hypervisor.sched_ctx_w[sched_ctx].start_time == 0.0)
-		hypervisor.sched_ctx_w[sched_ctx].start_time = starpu_timing_now();
+	if(hypervisor.sched_ctx_w[sched_ctx].start_time_w[worker] == 0.0)
+		hypervisor.sched_ctx_w[sched_ctx].start_time_w[worker] = starpu_timing_now();
 
-	if(hypervisor.sched_ctx_w[sched_ctx].total_flops != 0.0 && hypervisor.sched_ctx_w[sched_ctx].real_start_time == 0.0)
-		hypervisor.sched_ctx_w[sched_ctx].real_start_time = starpu_timing_now();
+	hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker] = starpu_timing_now();
 
-	int ntasks = get_ntasks(hypervisor.sched_ctx_w[sched_ctx].pushed_tasks);
+	if(hypervisor.resize[sched_ctx])
+		hypervisor.sched_ctx_w[sched_ctx].current_idle_time[worker] = 0.0;
 
-	if((hypervisor.min_tasks == 0 || (!(hypervisor.resize[sched_ctx] == 0 && imposed_resize) && ntasks == hypervisor.min_tasks)) && hypervisor.check_min_tasks[sched_ctx])
+	struct sc_hypervisor_wrapper *sc_w = &hypervisor.sched_ctx_w[sched_ctx];
+
+	if(sc_w->idle_start_time[worker] > 0.0)
 	{
-		hypervisor.resize[sched_ctx] = 1;
-		if(imposed_resize) imposed_resize = 0;
-		hypervisor.check_min_tasks[sched_ctx] = 0;
+		double end_time  = starpu_timing_now();
+		sc_w->idle_time[worker] += (end_time - sc_w->idle_start_time[worker]) / 1000000.0; /* in seconds */ 
+		sc_w->idle_start_time[worker] = 0.0;
 	}
+			
+	if(hypervisor.policy.handle_idle_end)
+		hypervisor.policy.handle_idle_end(sched_ctx, worker);
 
-	if(hypervisor.policy.handle_pushed_task)
-		hypervisor.policy.handle_pushed_task(sched_ctx, worker);
 }
 
-
+ 
 /* notifies the hypervisor that a tagged task has just been executed */
-static void notify_post_exec_task(struct starpu_task *task, size_t data_size, uint32_t footprint, int task_tag)
+static void notify_post_exec_task(struct starpu_task *task, size_t data_size, uint32_t footprint, int task_tag, int ready_tasks, double ready_flops)
 {
 	unsigned sched_ctx = task->sched_ctx;
 	int worker = starpu_worker_get_id();
 
+	if(hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker] != 0.0)
+	{
+		double current_time = starpu_timing_now();
+		hypervisor.sched_ctx_w[sched_ctx].exec_time[worker] += (current_time - 
+									hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker]) / 1000000.0; /* in seconds */ 
+		hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker] = 0.0;
+	}
+
 	hypervisor.sched_ctx_w[sched_ctx].poped_tasks[worker]++;
 	hypervisor.sched_ctx_w[sched_ctx].poped_tasks[worker]++;
 	hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[worker] += task->flops;
 	hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[worker] += task->flops;
 	hypervisor.sched_ctx_w[sched_ctx].elapsed_data[worker] += data_size ;
 	hypervisor.sched_ctx_w[sched_ctx].elapsed_data[worker] += data_size ;
@@ -976,57 +1018,27 @@ static void notify_post_exec_task(struct starpu_task *task, size_t data_size, ui
 
 
 	starpu_pthread_mutex_lock(&act_hypervisor_mutex);
 	starpu_pthread_mutex_lock(&act_hypervisor_mutex);
 	hypervisor.sched_ctx_w[sched_ctx].remaining_flops -= task->flops;
 	hypervisor.sched_ctx_w[sched_ctx].remaining_flops -= task->flops;
-	hypervisor.sched_ctx_w[sched_ctx].nready_tasks--;
-	hypervisor.sched_ctx_w[sched_ctx].ready_flops -= task->flops;
+	hypervisor.sched_ctx_w[sched_ctx].nready_tasks = ready_tasks;
+	hypervisor.sched_ctx_w[sched_ctx].ready_flops = ready_flops;
 	if(hypervisor.sched_ctx_w[sched_ctx].ready_flops < 0.0)
 	if(hypervisor.sched_ctx_w[sched_ctx].ready_flops < 0.0)
 		hypervisor.sched_ctx_w[sched_ctx].ready_flops = 0.0;
 		hypervisor.sched_ctx_w[sched_ctx].ready_flops = 0.0;
+	_ack_resize_completed(sched_ctx, worker);
 	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 
 
-/* 	struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctx); */
-	
-/* 	unsigned finished_sample = 0; */
-/* 	char *speed_sample_criteria = getenv("SC_HYPERVISOR_SAMPLE_CRITERIA"); */
-/* 	if(speed_sample_criteria && (strcmp(speed_sample_criteria, "time") == 0)) */
-/* 	{ */
-
-/* 		double curr_time = starpu_timing_now(); */
-/* 		double elapsed_time = (curr_time - hypervisor.sched_ctx_w[sched_ctx].start_time) / 1000000.0; /\* in seconds *\/ */
-
-/* 		finished_sample = elapsed_time > config->time_sample; */
-/* 	} */
-/* 	else */
-/* 	{ */
-/* 		double ctx_elapsed_flops = sc_hypervisor_get_elapsed_flops_per_sched_ctx(&hypervisor.sched_ctx_w[sched_ctx]); */
-/* 		double ctx_sample = config->ispeed_ctx_sample; */
-
-/* 		finished_sample = ctx_elapsed_flops > ctx_sample; */
-/* 	} */
-
-/* 	if(finished_sample) */
-/* 	{ */
-/* 		sc_hypervisor_update_resize_interval(sched_ctx); */
-/* 	} */
 	
 	
 	if(hypervisor.resize[sched_ctx])
 	if(hypervisor.resize[sched_ctx])
 	{	
 	{	
 		if(hypervisor.policy.handle_poped_task)
 		if(hypervisor.policy.handle_poped_task)
 			hypervisor.policy.handle_poped_task(sched_ctx, worker, task, footprint);
 			hypervisor.policy.handle_poped_task(sched_ctx, worker, task, footprint);
 	}
 	}
-	starpu_pthread_mutex_lock(&act_hypervisor_mutex);
-	_ack_resize_completed(sched_ctx, worker);
-	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
+/* 	starpu_pthread_mutex_lock(&act_hypervisor_mutex); */
+/* 	_ack_resize_completed(sched_ctx, worker); */
+/* 	starpu_pthread_mutex_unlock(&act_hypervisor_mutex); */
 	if(hypervisor.sched_ctx_w[sched_ctx].poped_tasks[worker] % 200 == 0)
 	if(hypervisor.sched_ctx_w[sched_ctx].poped_tasks[worker] % 200 == 0)
 		_print_current_time();
 		_print_current_time();
 
 
 	if(task_tag <= 0)
 	if(task_tag <= 0)
-	{
-		int workerid = starpu_worker_get_id();
-		double current_time = starpu_timing_now();
-		hypervisor.sched_ctx_w[sched_ctx].exec_time[worker] += current_time - 
-			hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker] / 1000000.0; /* in seconds */ 
-		hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker] = 0.0;
 		return; 
 		return; 
-	}
 	
 	
 	unsigned conf_sched_ctx;
 	unsigned conf_sched_ctx;
 	unsigned i;
 	unsigned i;
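The two hunks above move the per-worker time bookkeeping: notify_poped_task() now stamps exec_start_time and folds any pending idle interval into idle_time, while notify_post_exec_task() closes the execution interval before updating the flop counters. A minimal sketch of the same start/stop pattern, with a hypothetical per-worker accumulator (starpu_timing_now() returns microseconds, hence the division by 1000000.0 as in the patch):

    /* Illustrative only: not part of the patch. */
    struct worker_times { double exec_start_time; double exec_time; };

    static void mark_exec_start(struct worker_times *wt)
    {
    	wt->exec_start_time = starpu_timing_now();	/* microseconds */
    }

    static void mark_exec_end(struct worker_times *wt)
    {
    	if (wt->exec_start_time != 0.0)
    	{
    		double now = starpu_timing_now();
    		wt->exec_time += (now - wt->exec_start_time) / 1000000.0;	/* seconds */
    		wt->exec_start_time = 0.0;
    	}
    }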

+ 2 - 0
sc_hypervisor/src/sc_hypervisor_intern.h

@@ -124,3 +124,5 @@ double _get_optimal_v(unsigned sched_ctx);
 void _set_optimal_v(unsigned sched_ctx, double optimal_v);
 
 int _sc_hypervisor_use_lazy_resize(void);
+
+void _sc_hypervisor_allow_compute_idle(unsigned sched_ctx, int worker, unsigned allow);

+ 0 - 2
src/Makefile.am

@@ -129,7 +129,6 @@ noinst_HEADERS = 						\
 	profiling/profiling.h					\
 	util/starpu_task_insert_utils.h				\
 	util/starpu_data_cpy.h					\
-	util/starpu_task_list_inline.h				\
 	starpu_parameters.h					\
 	top/starpu_top_message_queue.h				\
 	top/starpu_top_connection.h				\
@@ -229,7 +228,6 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 		\
 	util/starpu_data_cpy.c					\
 	util/starpu_task_insert.c				\
 	util/starpu_task_insert_utils.c				\
-	util/starpu_inlines.c					\
 	debug/traces/starpu_fxt.c				\
 	debug/traces/starpu_fxt_mpi.c				\
 	debug/traces/starpu_fxt_dag.c				\

+ 1 - 0
src/common/barrier.c

@@ -23,6 +23,7 @@ int _starpu_barrier_init(struct _starpu_barrier *barrier, int count)
 	barrier->count = count;
 	barrier->reached_start = 0;
 	barrier->reached_exit = 0;
+	barrier->reached_flops = 0.0;
 	STARPU_PTHREAD_MUTEX_INIT(&barrier->mutex, NULL);
 	STARPU_PTHREAD_MUTEX_INIT(&barrier->mutex_exit, NULL);
 	STARPU_PTHREAD_COND_INIT(&barrier->cond, NULL);

+ 1 - 0
src/common/barrier.h

@@ -29,6 +29,7 @@ struct _starpu_barrier
 	int count;
 	int reached_start;
 	int reached_exit;
+	double reached_flops;
 	starpu_pthread_mutex_t mutex;
 	starpu_pthread_mutex_t mutex_exit;
 	starpu_pthread_cond_t cond;

+ 17 - 3
src/common/barrier_counter.c

@@ -56,7 +56,7 @@ int _starpu_barrier_counter_wait_for_full_counter(struct _starpu_barrier_counter
 	return 0;
 }
 
-int _starpu_barrier_counter_decrement_until_empty_counter(struct _starpu_barrier_counter *barrier_c)
+int _starpu_barrier_counter_decrement_until_empty_counter(struct _starpu_barrier_counter *barrier_c, double flops)
 {
 	struct _starpu_barrier *barrier = &barrier_c->barrier;
 	int ret = 0;
@@ -64,6 +64,7 @@ int _starpu_barrier_counter_decrement_until_empty_counter(struct _starpu_barrier
 
 	if (--barrier->reached_start == 0)
 	{
+		barrier->reached_flops -= flops;
 		ret = 1;
 		STARPU_PTHREAD_COND_BROADCAST(&barrier->cond);
 	}
@@ -72,7 +73,7 @@ int _starpu_barrier_counter_decrement_until_empty_counter(struct _starpu_barrier
 	return ret;
 }
 
-int _starpu_barrier_counter_increment_until_full_counter(struct _starpu_barrier_counter *barrier_c)
+int _starpu_barrier_counter_increment_until_full_counter(struct _starpu_barrier_counter *barrier_c, double flops)
 {
 	struct _starpu_barrier *barrier = &barrier_c->barrier;
 	int ret = 0;
@@ -80,6 +81,7 @@ int _starpu_barrier_counter_increment_until_full_counter(struct _starpu_barrier_
 
 	if(++barrier->reached_start == barrier->count)
 	{
+		barrier->reached_flops += flops;
 		ret = 1;
 		STARPU_PTHREAD_COND_BROADCAST(&barrier_c->cond2);
 	}
@@ -88,14 +90,26 @@ int _starpu_barrier_counter_increment_until_full_counter(struct _starpu_barrier_
 	return ret;
 }
 
-int _starpu_barrier_counter_increment(struct _starpu_barrier_counter *barrier_c)
+int _starpu_barrier_counter_increment(struct _starpu_barrier_counter *barrier_c, double flops)
 {
 	struct _starpu_barrier *barrier = &barrier_c->barrier;
 	STARPU_PTHREAD_MUTEX_LOCK(&barrier->mutex);
 
 	barrier->reached_start++;
+	barrier->reached_flops += flops;
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&barrier->mutex);
 	return 0;
 }
 
+int _starpu_barrier_counter_check(struct _starpu_barrier_counter *barrier_c)
+{
+	struct _starpu_barrier *barrier = &barrier_c->barrier;
+	STARPU_PTHREAD_MUTEX_LOCK(&barrier->mutex);
+
+	if(barrier->reached_start == 0)
+		STARPU_PTHREAD_COND_BROADCAST(&barrier->cond);
+
+	STARPU_PTHREAD_MUTEX_UNLOCK(&barrier->mutex);
+	return 0;
+}
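With these changes the barrier counter carries a flop count next to the task count: callers pass the task's flops when incrementing or decrementing, and reached_flops tracks the amount still in flight. A hedged usage sketch (the counter variable is illustrative and task stands for a struct starpu_task *; the init call mirrors the one used in _starpu_create_sched_ctx()):

    struct _starpu_barrier_counter counter;
    _starpu_barrier_counter_init(&counter, 0);

    /* when a task becomes ready, account for its work */
    _starpu_barrier_counter_increment(&counter, task->flops);

    /* when the task terminates, remove the same amount */
    _starpu_barrier_counter_decrement_until_empty_counter(&counter, task->flops);

    /* wake any waiter if the counter is already empty */
    _starpu_barrier_counter_check(&counter);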

+ 5 - 3
src/common/barrier_counter.h

@@ -34,10 +34,12 @@ int _starpu_barrier_counter_wait_for_empty_counter(struct _starpu_barrier_counte
 
 int _starpu_barrier_counter_wait_for_full_counter(struct _starpu_barrier_counter *barrier_c);
 
-int _starpu_barrier_counter_decrement_until_empty_counter(struct _starpu_barrier_counter *barrier_c);
+int _starpu_barrier_counter_decrement_until_empty_counter(struct _starpu_barrier_counter *barrier_c, double flops);
 
-int _starpu_barrier_counter_increment_until_full_counter(struct _starpu_barrier_counter *barrier_c);
+int _starpu_barrier_counter_increment_until_full_counter(struct _starpu_barrier_counter *barrier_c, double flops);
 
-int _starpu_barrier_counter_increment(struct _starpu_barrier_counter *barrier_c);
+int _starpu_barrier_counter_increment(struct _starpu_barrier_counter *barrier_c, double flops);
+
+int _starpu_barrier_counter_check(struct _starpu_barrier_counter *barrier_c);
 
 #endif

+ 11 - 0
src/common/fxt.h

@@ -137,6 +137,9 @@
 
 #define _STARPU_FUT_DATA_LOAD 0x5153
 
+#define _STARPU_FUT_START_UNPARTITION 0x5154
+#define _STARPU_FUT_END_UNPARTITION 0x5155
+
 #ifdef STARPU_USE_FXT
 #include <fxt/fxt.h>
 #include <fxt/fut.h>
@@ -564,6 +567,12 @@ do {										\
 #define _STARPU_TRACE_MEMORY_FULL(size)	\
 	FUT_DO_PROBE2(_STARPU_FUT_MEMORY_FULL,size,_starpu_gettid());
 
+#define _STARPU_TRACE_START_UNPARTITION(handle, memnode)		\
+	FUT_DO_PROBE3(_STARPU_FUT_START_UNPARTITION, memnode, _starpu_gettid(), handle);
+
+#define _STARPU_TRACE_END_UNPARTITION(handle, memnode)		\
+	FUT_DO_PROBE3(_STARPU_FUT_END_UNPARTITION, memnode, _starpu_gettid(), handle);
+
 #else // !STARPU_USE_FXT
 
 /* Dummy macros in case FxT is disabled */
@@ -629,6 +638,8 @@ do {										\
 #define _STARPU_TRACE_COND_WAIT_BEGIN()		do {} while(0)
 #define _STARPU_TRACE_COND_WAIT_END()			do {} while(0)
 #define _STARPU_TRACE_MEMORY_FULL(size)				do {} while(0)
+#define _STARPU_TRACE_START_UNPARTITION(handle, memnode)	do {} while(0)
+#define _STARPU_TRACE_END_UNPARTITION(handle, memnode)		do {} while(0)
 
 #endif // STARPU_USE_FXT
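The new probes pair a start and an end event around a data unpartition, and compile away to no-ops when FxT support is disabled. A sketch of how such a pair might be placed inside the runtime; the surrounding function is hypothetical, only the macros and their (handle, memory node) arguments come from this patch:

    static void unpartition_on_node(starpu_data_handle_t handle, unsigned node)
    {
    	_STARPU_TRACE_START_UNPARTITION(handle, node);
    	/* ... gather the children back into the root handle on this node ... */
    	_STARPU_TRACE_END_UNPARTITION(handle, node);
    }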
 
 

+ 45 - 0
src/common/thread.c

@@ -217,6 +217,16 @@ int starpu_pthread_rwlock_rdlock(starpu_pthread_rwlock_t *rwlock)
 	return p_ret;
 }
 
+int starpu_pthread_rwlock_tryrdlock(starpu_pthread_rwlock_t *rwlock)
+{
+	int p_ret = starpu_pthread_mutex_trylock(rwlock);
+
+	if (!p_ret)
+		_STARPU_TRACE_RWLOCK_RDLOCKED();
+
+	return p_ret;
+}
+
 int starpu_pthread_rwlock_wrlock(starpu_pthread_rwlock_t *rwlock)
 {
 	_STARPU_TRACE_WRLOCKING_RWLOCK();
@@ -228,6 +238,17 @@ int starpu_pthread_rwlock_wrlock(starpu_pthread_rwlock_t *rwlock)
 	return p_ret;
 }
 
+int starpu_pthread_rwlock_trywrlock(starpu_pthread_rwlock_t *rwlock)
+{
+	int p_ret =  starpu_pthread_mutex_trylock(rwlock);
+
+	if (!p_ret)
+		_STARPU_TRACE_RWLOCK_RDLOCKED();
+
+	return p_ret;
+}
+
+
 int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock)
 {
 	_STARPU_TRACE_UNLOCKING_RWLOCK();
@@ -298,6 +319,18 @@ int starpu_pthread_rwlock_rdlock(starpu_pthread_rwlock_t *rwlock)
 	return p_ret;
 }
 
+int starpu_pthread_rwlock_tryrdlock(starpu_pthread_rwlock_t *rwlock)
+{
+	_STARPU_TRACE_RDLOCKING_RWLOCK();
+
+ 	int p_ret = pthread_rwlock_tryrdlock(rwlock);
+
+	if (!p_ret)
+		_STARPU_TRACE_RWLOCK_RDLOCKED();
+
+	return p_ret;
+}
+
 int starpu_pthread_rwlock_wrlock(starpu_pthread_rwlock_t *rwlock)
 {
 	_STARPU_TRACE_WRLOCKING_RWLOCK();
@@ -309,6 +342,18 @@ int starpu_pthread_rwlock_wrlock(starpu_pthread_rwlock_t *rwlock)
 	return p_ret;
 }
 
+int starpu_pthread_rwlock_trywrlock(starpu_pthread_rwlock_t *rwlock)
+{
+	_STARPU_TRACE_WRLOCKING_RWLOCK();
+
+ 	int p_ret = pthread_rwlock_trywrlock(rwlock);
+
+	if (!p_ret)
+		_STARPU_TRACE_RWLOCK_WRLOCKED();
+
+	return p_ret;
+}
+
 int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock)
 {
 	_STARPU_TRACE_UNLOCKING_RWLOCK();
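The new try-lock wrappers follow the usual pthread convention: they return 0 when the lock was acquired and a non-zero error code otherwise, and only emit the trace event on success. A hedged usage sketch (the lock variable is illustrative):

    starpu_pthread_rwlock_t lock;
    starpu_pthread_rwlock_init(&lock, NULL);

    if (starpu_pthread_rwlock_tryrdlock(&lock) == 0)
    {
    	/* got the read lock: inspect the protected state */
    	starpu_pthread_rwlock_unlock(&lock);
    }
    else
    {
    	/* lock is busy: do other work instead of blocking */
    }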

+ 14 - 0
src/common/utils.c

@@ -159,3 +159,17 @@ void _starpu_gethostname(char *hostname, size_t size)
 			*c = 0;
 	}
 }
+
+void _starpu_sleep(struct timespec ts)
+{
+#ifdef STARPU_HAVE_WINDOWS
+	Sleep((ts.tv_sec * 1000) + (ts.tv_nsec / 1000000));
+#else
+	struct timespec req, rem;
+
+	req = ts;
+	while (nanosleep(&req, &rem))
+		req = rem;
+#endif
+}
+
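_starpu_sleep() takes the requested interval as a struct timespec, maps it to Sleep() on Windows and otherwise retries nanosleep() until the whole interval has elapsed. A small usage sketch (the 500 ms value is arbitrary):

    struct timespec ts;
    ts.tv_sec = 0;
    ts.tv_nsec = 500 * 1000000;	/* 500 ms */
    _starpu_sleep(ts);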

+ 2 - 0
src/common/utils.h

@@ -118,4 +118,6 @@ const char *_starpu_codelet_get_model_name(struct starpu_codelet *cl);
 
 int _starpu_check_mutex_deadlock(starpu_pthread_mutex_t *mutex);
 
+void _starpu_sleep(struct timespec ts);
+
 #endif // __COMMON_UTILS_H__

+ 4 - 1
src/core/debug.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -44,4 +44,7 @@ extern int _starpu_use_fxt;
 /* Get an Ayudame id for CL */
 int64_t _starpu_ayudame_get_func_id(struct starpu_codelet *cl);
 
+void _starpu_watchdog_init(void);
+void _starpu_watchdog_shutdown(void);
+
 #endif // __DEBUG_H__

+ 3 - 2
src/core/dependencies/tags.c

@@ -288,7 +288,7 @@ void starpu_tag_restart(starpu_tag_t id)
 	struct _starpu_tag *tag = gettag_struct(id);
 
 	_starpu_spin_lock(&tag->lock);
-	STARPU_ASSERT_MSG(tag->state == STARPU_DONE, "Only completed tags can be restarted (%llu was %d)", (unsigned long long) id, tag->state);
+	STARPU_ASSERT_MSG(tag->state == STARPU_DONE || tag->state == STARPU_INVALID_STATE || tag->state == STARPU_ASSOCIATED || tag->state == STARPU_BLOCKED, "Only completed tags can be restarted (%llu was %d)", (unsigned long long) id, tag->state);
 	tag->state = STARPU_BLOCKED;
 	_starpu_spin_unlock(&tag->lock);
 }
@@ -313,8 +313,9 @@ void _starpu_tag_declare(starpu_tag_t id, struct _starpu_job *job)
 	 * detect when either of them are finished. We however don't allow
 	 * several tasks to share a tag when it is used to wake them by
 	 * dependency */
+	if (tag->job != job)
+		tag->is_assigned++;
 	tag->job = job;
-	tag->is_assigned++;
 
 	job->tag = tag;
 	/* the tag is now associated to a job */
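starpu_tag_restart() puts a tag back into the blocked state so it can be reused; the relaxed assertion above additionally tolerates tags that never reached the done state. A hedged reuse sketch (the tag value is arbitrary):

    starpu_tag_t tag = 42;

    starpu_tag_notify_from_apps(tag);	/* first use: release whoever waits on it */
    starpu_tag_wait(tag);

    starpu_tag_restart(tag);		/* back to the blocked state */
    starpu_tag_notify_from_apps(tag);	/* the same tag can be signalled again */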

+ 10 - 7
src/core/detect_combined_workers.c

@@ -39,13 +39,17 @@ static void find_workers(hwloc_obj_t obj, int cpu_workers[STARPU_NMAXWORKERS], u
 	}
 
 	/* Got to a PU leaf */
-	struct _starpu_worker *worker = obj->userdata;
-	/* is it a CPU worker? */
-	if (worker->perf_arch.type == STARPU_CPU_WORKER && worker->perf_arch.ncore == 0)
+	struct _starpu_worker_list *workers = obj->userdata;
+	struct _starpu_worker *worker;
+	for(worker = _starpu_worker_list_begin(workers); worker != _starpu_worker_list_end(workers); worker = _starpu_worker_list_next(worker))
 	{
-		_STARPU_DEBUG("worker %d is part of it\n", worker->workerid);
-		/* Add it to the combined worker */
-		cpu_workers[(*n)++] = worker->workerid;
+		/* is it a CPU worker? */
+		if (worker->perf_arch.type == STARPU_CPU_WORKER && worker->perf_arch.ncore == 0)
+		{
+			_STARPU_DEBUG("worker %d is part of it\n", worker->workerid);
+			/* Add it to the combined worker */
+			cpu_workers[(*n)++] = worker->workerid;
+		}
 	}
 }
 
@@ -177,7 +181,6 @@ static void find_and_assign_combinations_with_hwloc(int *workerids, int nworkers
 		if (worker->perf_arch.type == STARPU_CPU_WORKER && worker->perf_arch.ncore == 0)
 		{
 			hwloc_obj_t obj = hwloc_get_obj_by_depth(topology->hwtopology, config->cpu_depth, worker->bindid);
-			STARPU_ASSERT(obj->userdata == worker);
 			obj = obj->parent;
 			while (obj)
 			{

+ 2 - 2
src/core/jobs.c

@@ -143,6 +143,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 {
 	struct starpu_task *task = j->task;
 	unsigned sched_ctx = task->sched_ctx;
+	double flops = task->flops;
 	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 
 	task->status = STARPU_TASK_FINISHED;
@@ -294,10 +295,9 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 		int ret = _starpu_submit_job(j);
 		STARPU_ASSERT(!ret);
 	}
-	_starpu_decrement_nsubmitted_tasks();
-	_starpu_decrement_nready_tasks();
 
 	_starpu_decrement_nsubmitted_tasks_of_sched_ctx(sched_ctx);
+	_starpu_decrement_nready_tasks_of_sched_ctx(sched_ctx, flops);
 
 	struct _starpu_worker *worker;
 	worker = _starpu_get_local_worker_key();

+ 2 - 0
src/core/perfmodel/perfmodel.h

@@ -63,6 +63,7 @@ void _starpu_load_history_based_model(struct starpu_perfmodel *model, unsigned s
 void _starpu_load_perfmodel(struct starpu_perfmodel *model);
 void _starpu_initialize_registered_performance_models(void);
 void _starpu_deinitialize_registered_performance_models(void);
+void _starpu_deinitialize_performance_model(struct starpu_perfmodel *model);
 
 double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model,
 					struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl);
@@ -90,6 +91,7 @@ void _starpu_save_bandwidth_and_latency_disk(double bandwidth_write, double band
 					    double latency_write, double latency_read, unsigned node);
 
 int _starpu_read_double(FILE *f, char *format, double *val);
+void _starpu_simgrid_get_platform_path(char *path, size_t maxlen);
 
 #ifdef __cplusplus
 }

+ 2 - 2
src/core/perfmodel/perfmodel_bus.c

@@ -238,7 +238,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	 * since we cleanly shutdown CUDA before returning. */
 	cudaSetDevice(src);
 
-	if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") > 0)
+	if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") != 0)
 	{
 		cures = cudaDeviceCanAccessPeer(&can, src, dst);
 		if (!cures && can)
@@ -260,7 +260,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	 * since we cleanly shutdown CUDA before returning. */
 	cudaSetDevice(dst);
 
-	if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") > 0)
+	if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") != 0)
 	{
 		cures = cudaDeviceCanAccessPeer(&can, dst, src);
 		if (!cures && can)

+ 0 - 6
src/core/perfmodel/perfmodel_history.c

@@ -1014,12 +1014,6 @@ void _starpu_load_history_based_model(struct starpu_perfmodel *model, unsigned s
 		else
 		{
 			_STARPU_DEBUG("File does not exists\n");
-			if (!calibrate_flag)
-			{
-				_STARPU_DISP("Warning: model %s is not calibrated, forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol);
-				_starpu_set_calibrate_flag(1);
-				model->benchmarking = 1;
-			}
 		}
 
 		_STARPU_DEBUG("Performance model file %s for model %s is loaded\n", path, model->symbol);

+ 1 - 0
src/core/perfmodel/perfmodel_nan.c

@@ -22,6 +22,7 @@
 #include <math.h>
 #include <string.h>
 #include <config.h>
+#include <core/perfmodel/perfmodel.h>
 
 int _starpu_read_double(FILE *f, char *format, double *val)
 {

+ 3 - 7
src/core/progress_hook.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2013  Université de Bordeaux 1
  * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -41,6 +41,7 @@ static int active_hook_cnt = 0;
 void _starpu_init_progression_hooks(void)
 {
 	STARPU_PTHREAD_RWLOCK_INIT(&progression_hook_rwlock, NULL);
+	STARPU_HG_DISABLE_CHECKING(active_hook_cnt);
 }
 
 int starpu_progression_hook_register(unsigned (*func)(void *arg), void *arg)
@@ -85,12 +86,7 @@ void starpu_progression_hook_deregister(int hook_id)
 
 unsigned _starpu_execute_registered_progression_hooks(void)
 {
-	/* If there is no hook registered, we short-cut loop. */
-	STARPU_PTHREAD_RWLOCK_RDLOCK(&progression_hook_rwlock);
-	int no_hook = (active_hook_cnt == 0);
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&progression_hook_rwlock);
-
-	if (no_hook)
+	if (active_hook_cnt == 0)
 		return 1;
 
 	/* By default, it is possible to block, but if some progression hooks

+ 100 - 7
src/core/sched_ctx.c

@@ -292,7 +292,9 @@ struct _starpu_sched_ctx* _starpu_create_sched_ctx(struct starpu_sched_policy *p
 	sem_init(&sched_ctx->parallel_code_sem, 0, 0);
 	sem_init(&sched_ctx->parallel_code_sem, 0, 0);
 
 
 	_starpu_barrier_counter_init(&sched_ctx->tasks_barrier, 0);
 	_starpu_barrier_counter_init(&sched_ctx->tasks_barrier, 0);
+	_starpu_barrier_counter_init(&sched_ctx->ready_tasks_barrier, 0);
 
 
+	sched_ctx->ready_flops = 0.0;
 	/*init the strategy structs and the worker_collection of the ressources of the context */
 	/*init the strategy structs and the worker_collection of the ressources of the context */
 	_starpu_init_sched_policy(config, sched_ctx, policy);
 	_starpu_init_sched_policy(config, sched_ctx, policy);
 
 
@@ -630,6 +632,7 @@ void _starpu_delete_all_sched_ctxs()
 		{
 		{
 			_starpu_sched_ctx_free_scheduling_data(sched_ctx);
 			_starpu_sched_ctx_free_scheduling_data(sched_ctx);
 			_starpu_barrier_counter_destroy(&sched_ctx->tasks_barrier);
 			_starpu_barrier_counter_destroy(&sched_ctx->tasks_barrier);
+			_starpu_barrier_counter_destroy(&sched_ctx->ready_tasks_barrier);
 			_starpu_delete_sched_ctx(sched_ctx);
 			_starpu_delete_sched_ctx(sched_ctx);
 		}
 		}
 		STARPU_PTHREAD_RWLOCK_UNLOCK(&changing_ctx_mutex[i]);
 		STARPU_PTHREAD_RWLOCK_UNLOCK(&changing_ctx_mutex[i]);
@@ -748,6 +751,28 @@ void starpu_sched_ctx_remove_workers(int *workers_to_remove, int nworkers_to_rem
 	return;
 	return;
 }
 }
 
 
+int _starpu_nworkers_able_to_execute_task(struct starpu_task *task, struct _starpu_sched_ctx *sched_ctx)
+{
+	unsigned worker = 0, nworkers = 0;
+	STARPU_PTHREAD_RWLOCK_WRLOCK(&changing_ctx_mutex[sched_ctx->id]);
+	struct starpu_worker_collection *workers = sched_ctx->workers;
+
+	struct starpu_sched_ctx_iterator it;
+	if(workers->init_iterator)
+		workers->init_iterator(workers, &it);
+
+	while(workers->has_next(workers, &it))
+	{
+		worker = workers->get_next(workers, &it);
+		STARPU_ASSERT_MSG(worker < STARPU_NMAXWORKERS, "worker id %d", worker);
+		if (starpu_worker_can_execute_task(worker, task, 0))
+			nworkers++;
+	}
+	STARPU_PTHREAD_RWLOCK_UNLOCK(&changing_ctx_mutex[sched_ctx->id]);
+
+	return nworkers;
+}
+
 /* unused sched_ctx have the id STARPU_NMAX_SCHED_CTXS */
 /* unused sched_ctx have the id STARPU_NMAX_SCHED_CTXS */
 void _starpu_init_all_sched_ctxs(struct _starpu_machine_config *config)
 void _starpu_init_all_sched_ctxs(struct _starpu_machine_config *config)
 {
 {
@@ -786,8 +811,12 @@ int _starpu_wait_for_all_tasks_of_sched_ctx(unsigned sched_ctx_id)
 
 
 void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 {
 {
+	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	if (!config->watchdog_ok)
+		config->watchdog_ok = 1;
+
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
-	int finished = _starpu_barrier_counter_decrement_until_empty_counter(&sched_ctx->tasks_barrier);
+	int finished = _starpu_barrier_counter_decrement_until_empty_counter(&sched_ctx->tasks_barrier, 0.0);
         /* when finished decrementing the tasks if the user signaled he will not submit tasks anymore
         /* when finished decrementing the tasks if the user signaled he will not submit tasks anymore
            we can move all its workers to the inheritor context */
            we can move all its workers to the inheritor context */
 	if(finished && sched_ctx->inheritor != STARPU_NMAX_SCHED_CTXS)
 	if(finished && sched_ctx->inheritor != STARPU_NMAX_SCHED_CTXS)
@@ -818,14 +847,67 @@ void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 			return;
 			return;
 		}
 		}
 		STARPU_PTHREAD_MUTEX_UNLOCK(&finished_submit_mutex);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&finished_submit_mutex);
+		/* FIXME: */
+		/* We also need to check for config->submitting = 0 (i.e. the
+		 * user calle starpu_drivers_request_termination()), in which
+		 * case we need to set config->running to 0 and wake workers,
+		 * so they can terminate, just like
+		 * starpu_drivers_request_termination() does.
+		 *
+		 * Set FIXME to 1 in tests/main/driver_api/run_driver.c to
+		 * check it is actually fixed.
+		 */
 	}
 	}
+
 	return;
 	return;
 }
 }
 
 
 void _starpu_increment_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 void _starpu_increment_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 {
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
-	_starpu_barrier_counter_increment(&sched_ctx->tasks_barrier);
+	_starpu_barrier_counter_increment(&sched_ctx->tasks_barrier, 0.0);
+}
+
+int _starpu_get_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	return sched_ctx->tasks_barrier.barrier.reached_start;
+}
+
+int _starpu_check_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	return _starpu_barrier_counter_check(&sched_ctx->tasks_barrier);
+}
+
+void _starpu_increment_nready_tasks_of_sched_ctx(unsigned sched_ctx_id, double ready_flops)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	_starpu_barrier_counter_increment(&sched_ctx->ready_tasks_barrier, ready_flops);
+}
+
+void _starpu_decrement_nready_tasks_of_sched_ctx(unsigned sched_ctx_id, double ready_flops)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	_starpu_barrier_counter_decrement_until_empty_counter(&sched_ctx->ready_tasks_barrier, ready_flops);
+}
+
+int _starpu_get_nready_tasks_of_sched_ctx(unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	return sched_ctx->ready_tasks_barrier.barrier.reached_start;
+}
+
+double _starpu_get_nready_flops_of_sched_ctx(unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	return sched_ctx->ready_tasks_barrier.barrier.reached_flops;
+}
+
+int _starpu_wait_for_no_ready_of_sched_ctx(unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	return _starpu_barrier_counter_wait_for_empty_counter(&sched_ctx->ready_tasks_barrier);
 }
 }
 
 
 void starpu_sched_ctx_set_context(unsigned *sched_ctx)
 void starpu_sched_ctx_set_context(unsigned *sched_ctx)
@@ -842,6 +924,15 @@ unsigned starpu_sched_ctx_get_context()
 	return *sched_ctx;
 	return *sched_ctx;
 }
 }
 
 
+unsigned _starpu_sched_ctx_get_current_context()
+{
+	unsigned sched_ctx = starpu_sched_ctx_get_context();
+	if (sched_ctx == STARPU_NMAX_SCHED_CTXS)
+		return _starpu_get_initial_sched_ctx()->id;
+	else
+		return sched_ctx;
+}
+
 void starpu_sched_ctx_notify_hypervisor_exists()
 void starpu_sched_ctx_notify_hypervisor_exists()
 {
 {
 	with_hypervisor = 1;
 	with_hypervisor = 1;
@@ -1090,7 +1181,9 @@ void _starpu_sched_ctx_post_exec_task_cb(int workerid, struct starpu_task *task,
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
 	if(sched_ctx != NULL && task->sched_ctx != _starpu_get_initial_sched_ctx()->id && 
 	if(sched_ctx != NULL && task->sched_ctx != _starpu_get_initial_sched_ctx()->id && 
 	   task->sched_ctx != STARPU_NMAX_SCHED_CTXS  && sched_ctx->perf_counters != NULL)
 	   task->sched_ctx != STARPU_NMAX_SCHED_CTXS  && sched_ctx->perf_counters != NULL)
-		sched_ctx->perf_counters->notify_post_exec_task(task, data_size, footprint, task->hypervisor_tag);
+		sched_ctx->perf_counters->notify_post_exec_task(task, data_size, footprint, task->hypervisor_tag, 
+								_starpu_get_nready_tasks_of_sched_ctx(sched_ctx->id), 
+								_starpu_get_nready_flops_of_sched_ctx(sched_ctx->id));
 }
 }
 
 
 void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id)
 void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id)
@@ -1105,22 +1198,22 @@ void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id)
 
 
 int starpu_sched_get_min_priority(void)
 int starpu_sched_get_min_priority(void)
 {
 {
-	return starpu_sched_ctx_get_min_priority(_starpu_get_initial_sched_ctx()->id);
+	return starpu_sched_ctx_get_min_priority(_starpu_sched_ctx_get_current_context());
 }
 }
 
 
 int starpu_sched_get_max_priority(void)
 int starpu_sched_get_max_priority(void)
 {
 {
-	return starpu_sched_ctx_get_max_priority(_starpu_get_initial_sched_ctx()->id);
+	return starpu_sched_ctx_get_max_priority(_starpu_sched_ctx_get_current_context());
 }
 }
 
 
 int starpu_sched_set_min_priority(int min_prio)
 int starpu_sched_set_min_priority(int min_prio)
 {
 {
-	return starpu_sched_ctx_set_min_priority(_starpu_get_initial_sched_ctx()->id, min_prio);
+	return starpu_sched_ctx_set_min_priority(_starpu_sched_ctx_get_current_context(), min_prio);
 }
 }
 
 
 int starpu_sched_set_max_priority(int max_prio)
 int starpu_sched_set_max_priority(int max_prio)
 {
 {
-	return starpu_sched_ctx_set_max_priority(_starpu_get_initial_sched_ctx()->id, max_prio);
+	return starpu_sched_ctx_set_max_priority(_starpu_sched_ctx_get_current_context(), max_prio);
 }
 }
 
 
 int starpu_sched_ctx_get_min_priority(unsigned sched_ctx_id)
 int starpu_sched_ctx_get_min_priority(unsigned sched_ctx_id)
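The priority wrappers above now resolve the scheduling context at call time: if the application selected a context with starpu_sched_ctx_set_context(), the global getters and setters act on it, otherwise they fall back to the initial context. A hedged sketch; my_ctx_id is a hypothetical variable holding an id obtained earlier from context creation:

    unsigned ctx = my_ctx_id;
    starpu_sched_ctx_set_context(&ctx);

    /* both calls now apply to ctx rather than to the initial context */
    starpu_sched_set_min_priority(-5);
    int max_prio = starpu_sched_get_max_priority();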

+ 23 - 0
src/core/sched_ctx.h

@@ -59,6 +59,12 @@ struct _starpu_sched_ctx
 	/* wait for the tasks submitted to the context to be executed */
 	struct _starpu_barrier_counter tasks_barrier;
 
+	/* wait for the tasks ready of the context to be executed */
+	struct _starpu_barrier_counter ready_tasks_barrier;
+
+	/* amount of ready flops in a context */
+	double ready_flops;
+
 	/* cond to block push when there are no workers in the ctx */
 	starpu_pthread_cond_t no_workers_cond;
 
@@ -141,6 +147,14 @@ int _starpu_wait_for_all_tasks_of_sched_ctx(unsigned sched_ctx_id);
  * task currently submitted to the context */
 void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id);
 void _starpu_increment_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id);
+int _starpu_get_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id);
+int _starpu_check_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id);
+
+void _starpu_decrement_nready_tasks_of_sched_ctx(unsigned sched_ctx_id, double ready_flops);
+void _starpu_increment_nready_tasks_of_sched_ctx(unsigned sched_ctx_id, double ready_flops);
+int _starpu_get_nready_tasks_of_sched_ctx(unsigned sched_ctx_id);
+double _starpu_get_nready_flops_of_sched_ctx(unsigned sched_ctx_id);
+int _starpu_wait_for_no_ready_of_sched_ctx(unsigned sched_ctx_id);
 
 
 /* Return the corresponding index of the workerid in the ctx table */
 int _starpu_get_index_in_ctx_of_workerid(unsigned sched_ctx, unsigned workerid);
@@ -175,6 +189,15 @@ void _starpu_sched_ctx_rebind_thread_to_its_cpu(unsigned cpuid);
 /* let the appl know that the worker blocked to execute parallel code */
 void _starpu_sched_ctx_signal_worker_blocked(int workerid);
 
+/* If starpu_sched_ctx_set_context() has been called, returns the context
+ * id set by its last call, or the id of the initial context */
+unsigned _starpu_sched_ctx_get_current_context();
+
+/* verify how many workers can execute a certain task */
+int _starpu_nworkers_able_to_execute_task(struct starpu_task *task, struct _starpu_sched_ctx *sched_ctx);
+
+void _starpu_fetch_tasks_from_empty_ctx_list(struct _starpu_sched_ctx *sched_ctx);
+
 #ifdef STARPU_USE_SC_HYPERVISOR
 /* Notifies the hypervisor that a tasks was poped from the workers' list */
 void _starpu_sched_ctx_post_exec_task_cb(int workerid, struct starpu_task *task, size_t data_size, uint32_t footprint);
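Together these declarations replace the former global nready counter with a per-context barrier counter: _starpu_push_task() increments it with the task's flops when the task becomes ready, and _starpu_handle_job_termination() decrements it again, so waiting for "no ready task" can be done per context. A schematic pairing (task stands for a struct starpu_task * and is illustrative):

    /* when a task enters the ready state (see _starpu_push_task) */
    _starpu_increment_nready_tasks_of_sched_ctx(task->sched_ctx, task->flops);

    /* when the same task terminates (see _starpu_handle_job_termination) */
    _starpu_decrement_nready_tasks_of_sched_ctx(task->sched_ctx, task->flops);

    /* block until the context has no ready task left */
    _starpu_wait_for_no_ready_of_sched_ctx(task->sched_ctx);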

+ 4 - 27
src/core/sched_policy.c

@@ -299,25 +299,6 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 	}
 	}
 }
 }
 
 
-static int _starpu_nworkers_able_to_execute_task(struct starpu_task *task, struct _starpu_sched_ctx *sched_ctx)
-{
-	unsigned worker = 0, nworkers = 0;
-	struct starpu_worker_collection *workers = sched_ctx->workers;
-
-	struct starpu_sched_ctx_iterator it;
-	if(workers->init_iterator)
-		workers->init_iterator(workers, &it);
-
-	while(workers->has_next(workers, &it))
-	{
-		worker = workers->get_next(workers, &it);
-		if (starpu_worker_can_execute_task(worker, task, 0))
-			nworkers++;
-	}
-
-	return nworkers;
-}
-
 /* the generic interface that call the proper underlying implementation */
 /* the generic interface that call the proper underlying implementation */
 
 
 int _starpu_push_task(struct _starpu_job *j)
 int _starpu_push_task(struct _starpu_job *j)
@@ -334,13 +315,8 @@ int _starpu_push_task(struct _starpu_job *j)
 	_STARPU_LOG_IN();
 	_STARPU_LOG_IN();
 
 
 	_STARPU_TRACE_JOB_PUSH(task, task->priority > 0);
 	_STARPU_TRACE_JOB_PUSH(task, task->priority > 0);
-	_starpu_increment_nready_tasks();
+	_starpu_increment_nready_tasks_of_sched_ctx(task->sched_ctx, task->flops);
 	task->status = STARPU_TASK_READY;
 	task->status = STARPU_TASK_READY;
-#ifdef STARPU_USE_SC_HYPERVISOR
-	if(sched_ctx != NULL && sched_ctx->id != 0 && sched_ctx->perf_counters != NULL 
-	   && sched_ctx->perf_counters->notify_ready_task)
-		sched_ctx->perf_counters->notify_ready_task(sched_ctx->id, task);
-#endif //STARPU_USE_SC_HYPERVISOR
 
 
 #ifdef HAVE_AYUDAME_H
 #ifdef HAVE_AYUDAME_H
 	if (AYU_event)
 	if (AYU_event)
@@ -587,6 +563,7 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 	return conversion_task;
 	return conversion_task;
 }
 }
 
 
+static
 struct _starpu_sched_ctx* _get_next_sched_ctx_to_pop_into(struct _starpu_worker *worker)
 struct _starpu_sched_ctx* _get_next_sched_ctx_to_pop_into(struct _starpu_worker *worker)
 {	
 {	
 	struct _starpu_sched_ctx *sched_ctx, *good_sched_ctx = NULL;
 	struct _starpu_sched_ctx *sched_ctx, *good_sched_ctx = NULL;
@@ -595,8 +572,8 @@ struct _starpu_sched_ctx* _get_next_sched_ctx_to_pop_into(struct _starpu_worker
 	for (l = worker->sched_ctx_list; l; l = l->next)
 	for (l = worker->sched_ctx_list; l; l = l->next)
 	{
 	{
 		sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
 		sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
-		if(worker->removed_from_ctx[sched_ctx->id] == 1 && worker->shares_tasks_lists[sched_ctx->id] == 1)
-			return sched_ctx;
+/* 		if(worker->removed_from_ctx[sched_ctx->id] == 1 && worker->shares_tasks_lists[sched_ctx->id] == 1) */
+/* 			return sched_ctx; */
 		if(sched_ctx->pop_counter[worker->workerid] < worker->nsched_ctxs &&
 		if(sched_ctx->pop_counter[worker->workerid] < worker->nsched_ctxs &&
 		   smallest_counter > sched_ctx->pop_counter[worker->workerid])
 		   smallest_counter > sched_ctx->pop_counter[worker->workerid])
 		{
 		{

+ 163 - 91
src/core/task.c

@@ -31,15 +31,17 @@
 #include <math.h>
 #include <math.h>
 #include <string.h>
 #include <string.h>
 #include <core/debug.h>
 #include <core/debug.h>
+#include <core/sched_ctx.h>
+#include <time.h>
+#ifdef STARPU_HAVE_WINDOWS
+#include <windows.h>
+#endif
 
 
 /* XXX this should be reinitialized when StarPU is shutdown (or we should make
 /* XXX this should be reinitialized when StarPU is shutdown (or we should make
  * sure that no task remains !) */
  * sure that no task remains !) */
 /* TODO we could make this hierarchical to avoid contention ? */
 /* TODO we could make this hierarchical to avoid contention ? */
-static starpu_pthread_cond_t submitted_cond = STARPU_PTHREAD_COND_INITIALIZER;
+//static starpu_pthread_cond_t submitted_cond = STARPU_PTHREAD_COND_INITIALIZER;
 static starpu_pthread_mutex_t submitted_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 static starpu_pthread_mutex_t submitted_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
-static long int nsubmitted = 0, nready = 0;
-
-static void _starpu_increment_nsubmitted_tasks(void);
 
 
 /* This key stores the task currently handled by the thread, note that we
 /* This key stores the task currently handled by the thread, note that we
  * cannot use the worker structure to store that information because it is
  * cannot use the worker structure to store that information because it is
@@ -74,7 +76,7 @@ void starpu_task_init(struct starpu_task *task)
 	task->predicted_transfer = NAN;
 	task->predicted_transfer = NAN;
 
 
 	task->magic = 42;
 	task->magic = 42;
-	task->sched_ctx = _starpu_get_initial_sched_ctx()->id;
+	task->sched_ctx = STARPU_NMAX_SCHED_CTXS;
 
 
 	task->flops = 0.0;
 	task->flops = 0.0;
 
 
@@ -236,7 +238,6 @@ int _starpu_submit_job(struct _starpu_job *j)
 	/* notify bound computation of a new task */
 	/* notify bound computation of a new task */
 	_starpu_bound_record(j);
 	_starpu_bound_record(j);
 
 
-	_starpu_increment_nsubmitted_tasks();
 	_starpu_increment_nsubmitted_tasks_of_sched_ctx(j->task->sched_ctx);
 	_starpu_increment_nsubmitted_tasks_of_sched_ctx(j->task->sched_ctx);
 
 
 #ifdef STARPU_USE_SC_HYPERVISOR
 #ifdef STARPU_USE_SC_HYPERVISOR
@@ -411,19 +412,21 @@ int starpu_task_submit(struct starpu_task *task)
 	int ret;
 	int ret;
 	unsigned is_sync = task->synchronous;
 	unsigned is_sync = task->synchronous;
 	starpu_task_bundle_t bundle = task->bundle;
 	starpu_task_bundle_t bundle = task->bundle;
-	unsigned nsched_ctxs = _starpu_get_nsched_ctxs();
-	unsigned set_sched_ctx = STARPU_NMAX_SCHED_CTXS;
 
 
 	/* internally, StarPU manipulates a struct _starpu_job * which is a wrapper around a
 	/* internally, StarPU manipulates a struct _starpu_job * which is a wrapper around a
 	* task structure, it is possible that this job structure was already
 	* task structure, it is possible that this job structure was already
 	* allocated. */
 	* allocated. */
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 
 
-	if (task->sched_ctx == _starpu_get_initial_sched_ctx()->id  && nsched_ctxs != 1 && !j->internal)
+	if (j->internal)
 	{
 	{
-		set_sched_ctx = starpu_sched_ctx_get_context();
-		if (set_sched_ctx != STARPU_NMAX_SCHED_CTXS)
-			task->sched_ctx = set_sched_ctx;
+		// Internal tasks are submitted to initial context
+		task->sched_ctx = _starpu_get_initial_sched_ctx()->id;
+	}
+	else if (task->sched_ctx == STARPU_NMAX_SCHED_CTXS)
+	{
+		// If the task has not specified a context, we set the current context
+		task->sched_ctx = _starpu_sched_ctx_get_current_context();
 	}
 	}
 
 
 	if (is_sync)
 	if (is_sync)
@@ -575,7 +578,18 @@ int _starpu_task_submit_nodeps(struct starpu_task *task)
 	}
 	}
 
 
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
-	_starpu_increment_nsubmitted_tasks();
+
+	if (j->internal)
+	{
+		// Internal tasks are submitted to initial context
+		j->task->sched_ctx = _starpu_get_initial_sched_ctx()->id;
+	}
+	else if (task->sched_ctx == STARPU_NMAX_SCHED_CTXS)
+	{
+		// If the task has not specified a context, we set the current context
+		j->task->sched_ctx = _starpu_sched_ctx_get_current_context();
+	}
+
 	_starpu_increment_nsubmitted_tasks_of_sched_ctx(j->task->sched_ctx);
 	_starpu_increment_nsubmitted_tasks_of_sched_ctx(j->task->sched_ctx);
 	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 
 
@@ -629,11 +643,22 @@ int _starpu_task_submit_conversion_task(struct starpu_task *task,
 	}
 	}
 
 
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
-	_starpu_increment_nsubmitted_tasks();
+
+	if (j->internal)
+	{
+		// Internal tasks are submitted to initial context
+		j->task->sched_ctx = _starpu_get_initial_sched_ctx()->id;
+	}
+	else if (task->sched_ctx == STARPU_NMAX_SCHED_CTXS)
+	{
+		// If the task has not specified a context, we set the current context
+		j->task->sched_ctx = _starpu_sched_ctx_get_current_context();
+	}
+
 	_starpu_increment_nsubmitted_tasks_of_sched_ctx(j->task->sched_ctx);
 	_starpu_increment_nsubmitted_tasks_of_sched_ctx(j->task->sched_ctx);
 	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 	j->submitted = 1;
 	j->submitted = 1;
-	_starpu_increment_nready_tasks();
+	_starpu_increment_nready_tasks_of_sched_ctx(j->task->sched_ctx, j->task->flops);
 
 
 	for (i=0 ; i<task->cl->nbuffers ; i++)
 	for (i=0 ; i<task->cl->nbuffers ; i++)
 	{
 	{
@@ -711,38 +736,40 @@ int starpu_task_wait_for_all(void)
 		if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
 		if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
 			return -EDEADLK;
 			return -EDEADLK;
 
 
-		STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
-
-		_STARPU_TRACE_TASK_WAIT_FOR_ALL;
-
-		while (nsubmitted > 0)
-			STARPU_PTHREAD_COND_WAIT(&submitted_cond, &submitted_mutex);
-
-		STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
-
 #ifdef HAVE_AYUDAME_H
 #ifdef HAVE_AYUDAME_H
 		if (AYU_event) AYU_event(AYU_BARRIER, 0, NULL);
 		if (AYU_event) AYU_event(AYU_BARRIER, 0, NULL);
 #endif
 #endif
+		struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+		if(config->topology.nsched_ctxs == 1)
+			starpu_task_wait_for_all_in_ctx(0);
+		else
+		{
+			int s;
+			for(s = 0; s < STARPU_NMAX_SCHED_CTXS; s++)
+			{
+				if(config->sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
+				{
+					starpu_task_wait_for_all_in_ctx(config->sched_ctxs[s].id);
+				}
+			}
+		}
+
+		return 0;
 	}
 	}
 	else
 	else
 	{
 	{
 		_STARPU_DEBUG("Waiting for tasks submitted to context %u\n", sched_ctx_id);
 		_STARPU_DEBUG("Waiting for tasks submitted to context %u\n", sched_ctx_id);
-		_starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx_id);
-#ifdef HAVE_AYUDAME_H
-		/* TODO: improve Temanejo into knowing about contexts ... */
-		if (AYU_event) AYU_event(AYU_BARRIER, 0, NULL);
-#endif
+		return starpu_task_wait_for_all_in_ctx(sched_ctx_id);
 	}
 	}
-	return 0;
 }
 }
 
 
 int starpu_task_wait_for_all_in_ctx(unsigned sched_ctx)
 int starpu_task_wait_for_all_in_ctx(unsigned sched_ctx)
 {
 {
 	_starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx);
 	_starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx);
 #ifdef HAVE_AYUDAME_H
 #ifdef HAVE_AYUDAME_H
+	/* TODO: improve Temanejo into knowing about contexts ... */
 	if (AYU_event) AYU_event(AYU_BARRIER, 0, NULL);
 	if (AYU_event) AYU_event(AYU_BARRIER, 0, NULL);
 #endif
 #endif
-
 	return 0;
 	return 0;
 }
 }
 /*
 /*
@@ -754,39 +781,22 @@ int starpu_task_wait_for_no_ready(void)
 	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
 	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
 		return -EDEADLK;
 		return -EDEADLK;
 
 
-	STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
-
-	_STARPU_TRACE_TASK_WAIT_FOR_ALL;
-
-	while (nready > 0)
-		STARPU_PTHREAD_COND_WAIT(&submitted_cond, &submitted_mutex);
-
-	STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
-
-	return 0;
-}
-
-void _starpu_decrement_nsubmitted_tasks(void)
-{
-	struct _starpu_machine_config *config = _starpu_get_machine_config();
-
-	STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
-
-	if (--nsubmitted == 0)
+	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	if(config->topology.nsched_ctxs == 1)
+		_starpu_wait_for_no_ready_of_sched_ctx(0);
+	else
 	{
 	{
-		if (!config->submitting)
+		int s;
+		for(s = 0; s < STARPU_NMAX_SCHED_CTXS; s++)
 		{
 		{
-			ANNOTATE_HAPPENS_AFTER(&config->running);
-			config->running = 0;
-			ANNOTATE_HAPPENS_BEFORE(&config->running);
+			if(config->sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
+			{
+				_starpu_wait_for_no_ready_of_sched_ctx(config->sched_ctxs[s].id);
+			}
 		}
 		}
-		STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
 	}
 	}
 
 
-	_STARPU_TRACE_UPDATE_TASK_CNT(nsubmitted);
-
-	STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
-
+	return 0;
 }
 }
 
 
 void
 void
@@ -795,57 +805,65 @@ starpu_drivers_request_termination(void)
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
 
 
 	STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
-
+	int nsubmitted = starpu_task_nsubmitted();
 	config->submitting = 0;
 	config->submitting = 0;
 	if (nsubmitted == 0)
 	if (nsubmitted == 0)
 	{
 	{
 		ANNOTATE_HAPPENS_AFTER(&config->running);
 		ANNOTATE_HAPPENS_AFTER(&config->running);
 		config->running = 0;
 		config->running = 0;
 		ANNOTATE_HAPPENS_BEFORE(&config->running);
 		ANNOTATE_HAPPENS_BEFORE(&config->running);
-		STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
+		int s;
+		for(s = 0; s < STARPU_NMAX_SCHED_CTXS; s++)
+		{
+			if(config->sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
+			{
+				_starpu_check_nsubmitted_tasks_of_sched_ctx(config->sched_ctxs[s].id);
+			}
+		}
 	}
 	}
 
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
 }
 }
 
 
-static void _starpu_increment_nsubmitted_tasks(void)
-{
-	STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
-
-	nsubmitted++;
-
-	_STARPU_TRACE_UPDATE_TASK_CNT(nsubmitted);
-
-	STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
-}
-
 int starpu_task_nsubmitted(void)
 int starpu_task_nsubmitted(void)
 {
 {
+	int nsubmitted = 0;
+	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	if(config->topology.nsched_ctxs == 1)
+		nsubmitted = _starpu_get_nsubmitted_tasks_of_sched_ctx(0);
+	else
+	{
+		int s;
+		for(s = 0; s < STARPU_NMAX_SCHED_CTXS; s++)
+		{
+			if(config->sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
+			{
+				nsubmitted += _starpu_get_nsubmitted_tasks_of_sched_ctx(config->sched_ctxs[s].id);
+			}
+		}
+	}
 	return nsubmitted;
 	return nsubmitted;
 }
 }
 
 
-void _starpu_increment_nready_tasks(void)
-{
-	STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
-
-	nready++;
-
-	STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
-}
-
-void _starpu_decrement_nready_tasks(void)
-{
-	STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
-
-	if (--nready == 0)
-		STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
-
-	STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
-
-}
 
 
 int starpu_task_nready(void)
 int starpu_task_nready(void)
 {
 {
+	int nready = 0;
+	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	if(config->topology.nsched_ctxs == 1)
+		nready = _starpu_get_nready_tasks_of_sched_ctx(0);
+	else
+	{
+		int s;
+		for(s = 0; s < STARPU_NMAX_SCHED_CTXS; s++)
+		{
+			if(config->sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
+			{
+				nready += _starpu_get_nready_tasks_of_sched_ctx(config->sched_ctxs[s].id);
+			}
+		}
+	}
+
 	return nready;
 	return nready;
 }
 }
 
 
@@ -982,3 +1000,57 @@ char *_starpu_task_get_cpu_name_nth_implementation(struct starpu_codelet *cl, un
 {
 {
 	return cl->cpu_funcs_name[nimpl];
 	return cl->cpu_funcs_name[nimpl];
 }
 }
+
+static starpu_pthread_t watchdog_thread;
+
+/* Check from times to times that StarPU does finish some tasks */
+static void *watchdog_func(void *foo STARPU_ATTRIBUTE_UNUSED)
+{
+	struct timespec ts;
+	char *timeout_env;
+	unsigned long long timeout;
+
+	if (! (timeout_env = getenv("STARPU_WATCHDOG_TIMEOUT")))
+		return NULL;
+
+	timeout = atoll(timeout_env);
+	ts.tv_sec = timeout / 1000000;
+	ts.tv_nsec = (timeout % 1000000) * 1000;
+	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	
+	STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
+	while (_starpu_machine_is_running())
+	{
+		int last_nsubmitted = starpu_task_nsubmitted();
+		config->watchdog_ok = 0;
+		STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
+
+		_starpu_sleep(ts);
+
+		STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
+		if (!config->watchdog_ok && last_nsubmitted
+				&& last_nsubmitted == starpu_task_nsubmitted())
+		{
+			fprintf(stderr,"The StarPU watchdog detected that no task finished for %u.%06us (can be configure through STARPU_WATCHDOG_TIMEOUT)\n", (unsigned)ts.tv_sec, (unsigned)ts.tv_nsec/1000);
+			if (getenv("STARPU_WATCHDOG_CRASH"))
+			{
+				fprintf(stderr,"Crashing the process\n");
+				assert(0);
+			}
+			else
+				fprintf(stderr,"Set the STARPU_WATCHDOG_CRASH environment variable if you want to abort the process in such a case\n");
+		}
+	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
+	return NULL;
+}
+
+void _starpu_watchdog_init(void)
+{
+	STARPU_PTHREAD_CREATE(&watchdog_thread, NULL, watchdog_func, NULL);
+}
+
+void _starpu_watchdog_shutdown(void)
+{
+	starpu_pthread_join(watchdog_thread, NULL);
+}
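The watchdog thread only runs when STARPU_WATCHDOG_TIMEOUT is set; it wakes up every timeout microseconds and warns if no task completed in the meantime, aborting only when STARPU_WATCHDOG_CRASH is also set. A hedged way to enable it from the application before starpu_init() (setenv is POSIX; exporting the variables in the shell works equally well):

    #include <stdlib.h>
    #include <starpu.h>

    int main(void)
    {
    	/* warn if no task finishes within one second (value in microseconds) */
    	setenv("STARPU_WATCHDOG_TIMEOUT", "1000000", 1);
    	/* optional: abort the process instead of only printing a warning */
    	/* setenv("STARPU_WATCHDOG_CRASH", "1", 1); */

    	if (starpu_init(NULL) != 0)
    		return 1;
    	/* ... submit tasks ... */
    	starpu_shutdown();
    	return 0;
    }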

+ 3 - 8
src/core/task.h

@@ -26,14 +26,6 @@
 /* Internal version of starpu_task_destroy: don't check task->destroy flag */
 void _starpu_task_destroy(struct starpu_task *task);
 
-/* In order to implement starpu_task_wait_for_all, we keep track of the number of
- * task currently submitted */
-void _starpu_decrement_nsubmitted_tasks(void);
-/* In order to implement starpu_task_wait_for_no_ready, we keep track of the number of
- * task currently ready */
-void _starpu_increment_nready_tasks(void);
-void _starpu_decrement_nready_tasks(void);
-
 /* A pthread key is used to store the task currently executed on the thread.
  * _starpu_initialize_current_task_key initializes this pthread key and
  * _starpu_set_current_task updates its current value. */
@@ -80,4 +72,7 @@ char *_starpu_task_get_cpu_name_nth_implementation(struct starpu_codelet *cl, un
 #define _STARPU_TASK_SET_INTERFACE(task, interface, i) do { if (task->dyn_handles) task->dyn_interfaces[i] = interface; else task->interfaces[i] = interface;} while(0)
 #define _STARPU_TASK_GET_INTERFACES(task) ((task->dyn_handles) ? task->dyn_interfaces : task->interfaces)
 
+void _starpu_watchdog_init(void);
+void _starpu_watchdog_shutdown(void);
+
 #endif // __CORE_TASK_H__

+ 22 - 13
src/core/topology.c

@@ -943,9 +943,9 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 		if (ncpu == -1)
 		if (ncpu == -1)
 		{
 		{
 			unsigned mic_busy_cpus = 0;
 			unsigned mic_busy_cpus = 0;
-			unsigned i = 0;
-			for (i = 0; i < STARPU_MAXMICDEVS; i++)
-				mic_busy_cpus += (topology->nmiccores[i] ? 1 : 0);
+			unsigned j = 0;
+			for (j = 0; j < STARPU_MAXMICDEVS; j++)
+				mic_busy_cpus += (topology->nmiccores[j] ? 1 : 0);

 			unsigned already_busy_cpus = mic_busy_cpus + topology->ncudagpus
 				+ topology->nopenclgpus + topology->nsccdevices;
@@ -1195,8 +1195,8 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 				unsigned worker2;
 				for (worker2 = 0; worker2 < worker; worker2++)
 				{
-					struct _starpu_worker *workerarg = &config->workers[worker];
-					if (workerarg->arch == STARPU_CUDA_WORKER)
+					struct _starpu_worker *workerarg2 = &config->workers[worker2];
+					if (workerarg2->arch == STARPU_CUDA_WORKER)
 					{
 						unsigned memory_node2 = starpu_worker_get_memory_node(worker2);
 						_starpu_register_bus(memory_node2, memory_node);
@@ -1291,16 +1291,17 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 #ifdef STARPU_HAVE_HWLOC
 		/* Put the worker descriptor in the userdata field of the
 		 * hwloc object describing the CPU */
-		hwloc_obj_t worker_obj;
-		worker_obj =
-			hwloc_get_obj_by_depth (config->topology.hwtopology,
-						config->cpu_depth,
-						workerarg->bindid);
-		worker_obj->userdata = &config->workers[worker];
+		hwloc_obj_t worker_obj = hwloc_get_obj_by_depth(config->topology.hwtopology,
+								config->cpu_depth,
+								workerarg->bindid);
+		if (worker_obj->userdata == NULL)
+		{
+			worker_obj->userdata = _starpu_worker_list_new();
+		}
+		_starpu_worker_list_push_front(worker_obj->userdata, workerarg);

 		/* Clear the cpu set and set the cpu */
-		workerarg->hwloc_cpu_set =
-			hwloc_bitmap_dup (worker_obj->cpuset);
+		workerarg->hwloc_cpu_set = hwloc_bitmap_dup (worker_obj->cpuset);
 #endif
 	}
 #ifdef STARPU_SIMGRID
@@ -1388,6 +1389,14 @@ _starpu_destroy_topology (
 #ifdef STARPU_HAVE_HWLOC
 		struct _starpu_worker *workerarg = &config->workers[worker];
 		hwloc_bitmap_free(workerarg->hwloc_cpu_set);
+		hwloc_obj_t worker_obj = hwloc_get_obj_by_depth(config->topology.hwtopology,
+								config->cpu_depth,
+								workerarg->bindid);
+		if (worker_obj->userdata)
+		{
+			_starpu_worker_list_delete(worker_obj->userdata);
+			worker_obj->userdata = NULL;
+		}
 #endif
 	}


+ 5 - 0
src/core/workers.c

@@ -483,6 +483,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 {
 	pconfig->running = 1;
 	pconfig->submitting = 1;
+	STARPU_HG_DISABLE_CHECKING(pconfig->watchdog_ok);

 	STARPU_PTHREAD_KEY_CREATE(&worker_key, NULL);

@@ -1041,6 +1042,8 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 	if (!is_a_sink)
 		_starpu_launch_drivers(&config);

+	_starpu_watchdog_init();
+
 	STARPU_PTHREAD_MUTEX_LOCK(&init_mutex);
 	initialized = INITIALIZED;
 	/* Tell everybody that we initialized */
@@ -1215,6 +1218,8 @@ void starpu_shutdown(void)

 	_starpu_deinitialize_registered_performance_models();

+	_starpu_watchdog_shutdown();
+
 	/* wait for their termination */
 	_starpu_terminate_workers(&config);


+ 4 - 3
src/core/workers.h

@@ -53,8 +53,7 @@
 #include <starpu_parameters.h>

 /* This is initialized from in _starpu_worker_init */
-struct _starpu_worker
-{
+LIST_TYPE(_starpu_worker,
 	struct _starpu_machine_config *config;
         starpu_pthread_mutex_t mutex;
 	enum starpu_worker_archtype arch; /* what is the type of worker ? */
@@ -116,7 +115,7 @@ struct _starpu_worker
 	/* hwloc_obj_t of the device controled by the worker*/
 	hwloc_obj_t hw_obj;
 #endif
-};
+);

 struct _starpu_combined_worker
 {
@@ -313,6 +312,8 @@ struct _starpu_machine_config

 	/* this flag is set until the application is finished submitting tasks */
 	unsigned submitting;
+
+	int watchdog_ok;
 };

 /* Three functions to manage argv, argc */

+ 20 - 1
src/datawizard/coherency.c

@@ -24,6 +24,7 @@
 #include <profiling/profiling.h>
 #include <math.h>
 #include <core/task.h>
+#include <starpu_scheduler.h>

 static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned src_node, unsigned dst_node, unsigned *handling_node);
 unsigned _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
@@ -428,12 +429,30 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 	/* find someone who already has the data */
 	unsigned src_node = 0;

-	/* if the data is in write only mode, there is no need for a source */
 	if (mode & STARPU_R)
 	{
 		src_node = _starpu_select_src_node(handle, requesting_node);
 		STARPU_ASSERT(src_node != requesting_node);
 	}
+	else
+	{
+		/* if the data is in write only mode, there is no need for a source */
+		if (requesting_node == STARPU_MAIN_RAM) {
+			/* And this is the main RAM, really no need for a
+			 * request, just allocate */
+			if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch) == 0)
+			{
+				_starpu_update_data_state(handle, dst_replicate, mode);
+
+				_starpu_spin_unlock(&handle->header_lock);
+
+				if (callback_func)
+					callback_func(callback_arg);
+				_STARPU_LOG_OUT_TAG("data immediately allocated");
+				return NULL;
+			}
+		}
+	}

 	/* We can safely assume that there won't be more than 2 hops in the
 	 * current implementation */
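Note: the branch added above lets a write-only request targeting main RAM be satisfied by a plain allocation, with no source replicate and no transfer. A hedged illustration from the application side, using only the public StarPU API (the vector size and fill value are arbitrary):

```c
#include <starpu.h>

static void fill_cpu(void *buffers[], void *arg)
{
	(void)arg;
	float *ptr = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
	unsigned i, n = STARPU_VECTOR_GET_NX(buffers[0]);
	for (i = 0; i < n; i++)
		ptr[i] = 1.0f;
}

static struct starpu_codelet fill_cl =
{
	.cpu_funcs = { fill_cpu, NULL },
	.nbuffers = 1,
	.modes = { STARPU_W },	/* write-only: no previous contents are fetched */
};

int submit_fill(unsigned n)
{
	starpu_data_handle_t handle;
	/* Home node -1: StarPU allocates the buffer itself when first needed,
	 * which is precisely the case the change above short-circuits. */
	starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, n, sizeof(float));
	int ret = starpu_task_insert(&fill_cl, STARPU_W, handle, 0);
	starpu_task_wait_for_all();
	starpu_data_unregister(handle);
	return ret;
}
```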

+ 52 - 23
src/datawizard/data_request.c

@@ -24,8 +24,8 @@
  * Data interfaces should also have to declare how many asynchronous requests
  * they have actually started (think of e.g. csr).
  */
-#define MAX_PENDING_REQUESTS_PER_NODE 400
-#define MAX_PENDING_PREFETCH_REQUESTS_PER_NODE 200
+#define MAX_PENDING_REQUESTS_PER_NODE 20
+#define MAX_PENDING_PREFETCH_REQUESTS_PER_NODE 10

 /* requests that have not been treated at all */
 static struct _starpu_data_request_list *data_requests[STARPU_MAXNODES];
@@ -356,17 +356,12 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 {
 	starpu_data_handle_t handle = r->handle;

-	if (prefetch) {
-		if (_starpu_spin_trylock(&handle->header_lock))
-			return -EBUSY;
-		if (_starpu_spin_trylock(&r->lock))
-		{
-			_starpu_spin_unlock(&handle->header_lock);
-			return -EBUSY;
-		}
-	} else {
-		_starpu_spin_lock(&handle->header_lock);
-		_starpu_spin_lock(&r->lock);
+	if (_starpu_spin_trylock(&handle->header_lock))
+		return -EBUSY;
+	if (_starpu_spin_trylock(&r->lock))
+	{
+		_starpu_spin_unlock(&handle->header_lock);
+		return -EBUSY;
 	}

 	struct _starpu_data_replicate *src_replicate = r->src_replicate;
@@ -420,19 +415,25 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 	return 0;
 }

-void _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)
+int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)
 {
 	struct _starpu_data_request *r;
 	struct _starpu_data_request_list *new_data_requests;
+	struct _starpu_data_request_list *empty_list;
+	int ret = 0;

 	/* Here helgrind would should that this is an un protected access.
 	 * We however don't care about missing an entry, we will get called
 	 * again sooner or later. */
 	if (_starpu_data_request_list_empty(data_requests[src_node]))
-		return;
+		return 0;
+
+	empty_list = _starpu_data_request_list_new();

 	/* take all the entries from the request list */
-        STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
+	if (STARPU_PTHREAD_MUTEX_TRYLOCK(&data_requests_list_mutex[src_node]))
+		/* List is busy, do not bother with it */
+		return -EBUSY;

 	struct _starpu_data_request_list *local_list = data_requests[src_node];

@@ -441,13 +442,14 @@ void _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)
 		/* there is no request */
                 STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);

-		return;
+		_starpu_data_request_list_delete(empty_list);
+		return 0;
 	}

 	/* There is an entry: we create a new empty list to replace the list of
 	 * requests, and we handle the request(s) one by one in the former
 	 * list, without concurrency issues.*/
-	data_requests[src_node] = _starpu_data_request_list_new();
+	data_requests[src_node] = empty_list;
 	STARPU_HG_DISABLE_CHECKING(data_requests[src_node]->_head);

 	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
@@ -463,6 +465,7 @@ void _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)
 		{
 			/* Too many requests at the same time, skip pushing
 			 * more for now */
+			ret = -EBUSY;
 			break;
 		}

@@ -471,6 +474,8 @@ void _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)
 		res = starpu_handle_data_request(r, may_alloc, 0);
 		if (res != 0 && res != -EAGAIN)
 		{
+			/* handle is busy, or not enough memory, postpone for now */
+			ret = res;
 			_starpu_data_request_list_push_back(new_data_requests, r);
 			break;
 		}
@@ -491,6 +496,8 @@ void _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)

 	_starpu_data_request_list_delete(new_data_requests);
 	_starpu_data_request_list_delete(local_list);
+
+	return ret;
 }

 void _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc)
@@ -498,12 +505,17 @@ void _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc
 	struct _starpu_data_request *r;
 	struct _starpu_data_request_list *new_data_requests;
 	struct _starpu_data_request_list *new_prefetch_requests;
+	struct _starpu_data_request_list *empty_list;

 	if (_starpu_data_request_list_empty(prefetch_requests[src_node]))
 		return;

+	empty_list = _starpu_data_request_list_new();
+
 	/* take all the entries from the request list */
-        STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
+	if (STARPU_PTHREAD_MUTEX_TRYLOCK(&data_requests_list_mutex[src_node]))
+		/* List is busy, do not bother with it */
+		return;

 	struct _starpu_data_request_list *local_list = prefetch_requests[src_node];

@@ -511,13 +523,14 @@ void _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc
 	{
 		/* there is no request */
                 STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
+		_starpu_data_request_list_delete(empty_list);
 		return;
 	}

 	/* There is an entry: we create a new empty list to replace the list of
 	 * requests, and we handle the request(s) one by one in the former
 	 * list, without concurrency issues.*/
-	prefetch_requests[src_node] = _starpu_data_request_list_new();
+	prefetch_requests[src_node] = empty_list;

 	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);

@@ -582,12 +595,16 @@ static void _handle_pending_node_data_requests(unsigned src_node, unsigned force
 //	_STARPU_DEBUG("_starpu_handle_pending_node_data_requests ...\n");
 //
 	struct _starpu_data_request_list *new_data_requests_pending;
+	struct _starpu_data_request_list *empty_list;
 	unsigned taken, kept;

 	if (_starpu_data_request_list_empty(data_requests_pending[src_node]))
 		return;

-	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);
+	empty_list = _starpu_data_request_list_new();
+	if (STARPU_PTHREAD_MUTEX_TRYLOCK(&data_requests_pending_list_mutex[src_node]) && !force)
+		/* List is busy, do not bother with it */
+		return;

 	/* for all entries of the list */
 	struct _starpu_data_request_list *local_list = data_requests_pending[src_node];
@@ -595,9 +612,10 @@ static void _handle_pending_node_data_requests(unsigned src_node, unsigned force
 	{
 		/* there is no request */
 		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
+		_starpu_data_request_list_delete(empty_list);
 		return;
 	}
-	data_requests_pending[src_node] = _starpu_data_request_list_new();
+	data_requests_pending[src_node] = empty_list;

 	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);

@@ -613,8 +631,19 @@ static void _handle_pending_node_data_requests(unsigned src_node, unsigned force

 		starpu_data_handle_t handle = r->handle;

-		_starpu_spin_lock(&handle->header_lock);
+		if (force)
+			/* Have to wait for the handle, whatever it takes */
+			_starpu_spin_lock(&handle->header_lock);
+		else
+			if (_starpu_spin_trylock(&handle->header_lock))
+			{
+				/* Handle is busy, retry this later */
+				_starpu_data_request_list_push_back(new_data_requests_pending, r);
+				kept++;
+				continue;
+			}

+		/* This shouldn't be too hard to acquire */
 		_starpu_spin_lock(&r->lock);

 		/* wait until the transfer is terminated */
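Note: the changes above replace unconditional locking with trylocks so the progress path never blocks on a busy list or handle; callers simply retry on the next progression call. A standalone, hypothetical sketch of that pattern in plain POSIX (not StarPU internals):

```c
#include <errno.h>
#include <pthread.h>

struct work_queue
{
	pthread_mutex_t mutex;
	/* ... pending items ... */
};

/* Try to service the queue; return -EBUSY instead of blocking, so the
 * caller (a driver progression loop) can just try again later. */
static int try_drain(struct work_queue *q)
{
	if (pthread_mutex_trylock(&q->mutex))
		return -EBUSY;	/* somebody else is already draining it */

	/* ... pop and handle items, stopping early if a resource is busy ... */

	pthread_mutex_unlock(&q->mutex);
	return 0;
}
```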

+ 2 - 1
src/datawizard/data_request.h

@@ -109,7 +109,8 @@ LIST_TYPE(_starpu_data_requester,
 void _starpu_init_data_request_lists(void);
 void _starpu_deinit_data_request_lists(void);
 void _starpu_post_data_request(struct _starpu_data_request *r, unsigned handling_node);
-void _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc);
+/* returns 0 if we have pushed all requests, -EBUSY or -ENOMEM otherwise */
+int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc);
 void _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc);

 void _starpu_handle_pending_node_data_requests(unsigned src_node);

+ 4 - 2
src/datawizard/datawizard.c

@@ -36,8 +36,10 @@ void _starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc)

 	/* in case some other driver requested data */
 	_starpu_handle_pending_node_data_requests(memory_node);
-	_starpu_handle_node_data_requests(memory_node, may_alloc);
-	_starpu_handle_node_prefetch_requests(memory_node, may_alloc);
+	if (_starpu_handle_node_data_requests(memory_node, may_alloc) == 0)
+		/* We pushed all pending requests, we can afford pushing
+		 * prefetch requests */
+		_starpu_handle_node_prefetch_requests(memory_node, may_alloc);
 	_starpu_execute_registered_progression_hooks();
 }


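Note: the intent of the change above is that prefetch traffic is issued only once every demand (non-prefetch) request of the node could be pushed. A simplified, hypothetical sketch of that policy in isolation (the helper functions are stand-ins, not StarPU symbols):

```c
/* Hypothetical stand-ins for the per-node request handlers. */
static void handle_pending(unsigned node) { (void)node; }
static int  handle_demand(unsigned node, unsigned may_alloc) { (void)node; (void)may_alloc; return 0; }
static void handle_prefetch(unsigned node, unsigned may_alloc) { (void)node; (void)may_alloc; }

/* Demand requests are always serviced; prefetches only when every demand
 * request could be pushed, so they never delay data a task is waiting for. */
static void progress_node(unsigned node, unsigned may_alloc)
{
	handle_pending(node);
	if (handle_demand(node, may_alloc) == 0)
		handle_prefetch(node, may_alloc);
}
```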
+ 3 - 1
src/datawizard/filters.c

@@ -258,7 +258,7 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 		child->footprint = _starpu_compute_data_footprint(child);

 		void *ptr;
-		ptr = starpu_data_handle_to_pointer(child, 0);
+		ptr = starpu_data_handle_to_pointer(child, STARPU_MAIN_RAM);
 		if (ptr != NULL)
 			_starpu_data_register_ram_pointer(child, ptr);
 	}
@@ -281,6 +281,7 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 	unsigned node;
 	unsigned sizes[root_handle->nchildren];

+	_STARPU_TRACE_START_UNPARTITION(root_handle, gathering_node);
 	_starpu_spin_lock(&root_handle->header_lock);

 	STARPU_ASSERT_MSG(root_handle->nchildren != 0, "data %p is not partitioned, can not unpartition it", root_handle);
@@ -435,6 +436,7 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin

 	/* now the parent may be used again so we release the lock */
 	_starpu_spin_unlock(&root_handle->header_lock);
+	_STARPU_TRACE_END_UNPARTITION(root_handle, gathering_node);
 }

 /* each child may have his own interface type */
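Note: the new START/END_UNPARTITION trace events bracket starpu_data_unpartition(). For reference, a minimal partition/unpartition sequence on the public API (sizes and the block filter choice are only illustrative) is:

```c
#include <starpu.h>

void scale_by_blocks(float *data, unsigned n, unsigned nparts)
{
	starpu_data_handle_t handle;
	starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)data, n, sizeof(float));

	struct starpu_data_filter f =
	{
		.filter_func = starpu_vector_filter_block,
		.nchildren = nparts,
	};
	starpu_data_partition(handle, &f);

	/* ... submit one task per child via starpu_data_get_sub_data(handle, 1, i) ... */

	/* This is the call now traced as the "Unpartitioning" worker state. */
	starpu_data_unpartition(handle, STARPU_MAIN_RAM);
	starpu_data_unregister(handle);
}
```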

+ 1 - 0
src/datawizard/footprint.c

@@ -18,6 +18,7 @@
 #include <datawizard/footprint.h>
 #include <starpu_hash.h>
 #include <core/task.h>
+#include <starpu_scheduler.h>

 uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, unsigned nimpl, struct _starpu_job *j)
 {

+ 5 - 5
src/datawizard/interfaces/block_interface.c

@@ -589,9 +589,9 @@ static int copy_opencl_common(void *src_interface, unsigned src_node, void *dst_
 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		{
 			ret = starpu_opencl_copy_async_sync(src_block->dev_handle, src_block->offset, src_node,
-								dst_block->dev_handle, dst_block->offset, dst_node,
-							       src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
-							       event);
+							    dst_block->dev_handle, dst_block->offset, dst_node,
+							    src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
+							    event);
                 }
 		else
 		{
@@ -615,8 +615,8 @@ static int copy_opencl_common(void *src_interface, unsigned src_node, void *dst_
 								    dst_block->dev_handle,
 								    dst_block->offset + layer*dst_block->ldz*dst_block->elemsize + j*dst_block->ldy*dst_block->elemsize,
 								    dst_node,
-								       src_block->nx*src_block->elemsize,
-								       event);
+								    src_block->nx*src_block->elemsize,
+								    event);
                         }
                 }
         }

+ 5 - 4
src/datawizard/interfaces/data_interface.c

@@ -283,7 +283,7 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 	/* now the data is available ! */
 	_starpu_spin_unlock(&handle->header_lock);

-	ptr = starpu_data_handle_to_pointer(handle, 0);
+	ptr = starpu_data_handle_to_pointer(handle, STARPU_MAIN_RAM);
 	if (ptr != NULL)
 	{
 		_starpu_data_register_ram_pointer(handle, ptr);
@@ -442,7 +442,8 @@ int starpu_data_set_tag(starpu_data_handle_t handle, int tag)
 	return 0;
 }

-int starpu_data_release_tag(starpu_data_handle_t handle)
+static
+int _starpu_data_release_tag(starpu_data_handle_t handle)
 {
 	struct handle_tag_entry *tag_entry;

@@ -476,7 +477,7 @@ void _starpu_data_free_interfaces(starpu_data_handle_t handle)
 	unsigned worker;
 	unsigned nworkers = starpu_worker_get_count();

-	ram_ptr = starpu_data_handle_to_pointer(handle, 0);
+	ram_ptr = starpu_data_handle_to_pointer(handle, STARPU_MAIN_RAM);

 	for (node = 0; node < STARPU_MAXNODES; node++)
 		free(handle->per_node[node].data_interface);
@@ -734,7 +735,7 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 	STARPU_PTHREAD_COND_DESTROY(&handle->busy_cond);
 	STARPU_PTHREAD_MUTEX_DESTROY(&handle->sequential_consistency_mutex);

-	starpu_data_release_tag(handle);
+	_starpu_data_release_tag(handle);

 	free(handle);
 }

+ 15 - 9
src/datawizard/interfaces/matrix_interface.c

@@ -404,7 +404,11 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 		(char *)src_matrix->ptr, src_matrix->ld*elemsize,
 		src_matrix->nx*elemsize, src_matrix->ny, kind);
 	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
+	{
+		int ret = copy_any_to_any(src_interface, src_node, dst_interface, dst_node, (void*)(uintptr_t)is_async);
+		if (ret == -EAGAIN) return ret;
+		if (ret) STARPU_CUDA_REPORT_ERROR(cures);
+	}
 #endif

 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
@@ -644,15 +648,17 @@ static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_int
 			ret = -EAGAIN;
 	}
 	else
-	for (y = 0; y < ny; y++)
 	{
-		uint32_t src_offset = y*ld_src*elemsize;
-		uint32_t dst_offset = y*ld_dst*elemsize;
-
-		if (starpu_interface_copy(src_matrix->dev_handle, src_matrix->offset + src_offset, src_node,
-		                          dst_matrix->dev_handle, dst_matrix->offset + dst_offset, dst_node,
-		                          nx*elemsize, async_data))
-			ret = -EAGAIN;
+	     for (y = 0; y < ny; y++)
+	     {
+		     uint32_t src_offset = y*ld_src*elemsize;
+		     uint32_t dst_offset = y*ld_dst*elemsize;
+
+		     if (starpu_interface_copy(src_matrix->dev_handle, src_matrix->offset + src_offset, src_node,
+					       dst_matrix->dev_handle, dst_matrix->offset + dst_offset, dst_node,
+					       nx*elemsize, async_data))
+			     ret = -EAGAIN;
+	     }
 	}

 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)nx*ny*elemsize);

+ 37 - 17
src/datawizard/malloc.c

@@ -24,6 +24,7 @@
 #include <starpu.h>
 #include <drivers/opencl/driver_opencl.h>
 #include <datawizard/memory_manager.h>
+#include <datawizard/malloc.h>

 static size_t _malloc_align = sizeof(void*);

@@ -91,7 +92,7 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)

 	if (flags & STARPU_MALLOC_COUNT)
 	{
-		if (_starpu_memory_manager_can_allocate_size(dim, 0) == 0)
+		if (_starpu_memory_manager_can_allocate_size(dim, STARPU_MAIN_RAM) == 0)
 		{
 			size_t freed;
 			size_t reclaim = 2 * dim;
@@ -108,9 +109,15 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 		}
 	}

-#ifndef STARPU_SIMGRID
 	if (flags & STARPU_MALLOC_PINNED)
 	{
+#ifdef STARPU_SIMGRID
+		/* FIXME: CUDA seems to be taking 650µs every 1MiB.
+		 * Ideally we would simulate this batching in 1MiB requests
+		 * instead of computing an average value.
+		 */
+		MSG_process_sleep((float) dim * 0.000650 / 1048576.);
+#else /* STARPU_SIMGRID */
 		if (_starpu_can_submit_cuda_task())
 		{
 #ifdef STARPU_USE_CUDA
@@ -176,8 +183,8 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 //			goto end;
 //#endif /* STARPU_USE_OPENCL */
 //		}
-	}
 #endif /* STARPU_SIMGRID */
+	}

 	if (_starpu_can_submit_scc_task())
 	{
@@ -338,7 +345,7 @@ int starpu_free_flags(void *A, size_t dim, int flags)
 out:
 	if (flags & STARPU_MALLOC_COUNT)
 	{
-		_starpu_memory_manager_deallocate_size(dim, 0);
+		_starpu_memory_manager_deallocate_size(dim, STARPU_MAIN_RAM);
 	}

 	return 0;
@@ -381,14 +388,20 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size)
 		}
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 		case STARPU_CUDA_RAM:
+		{
 #ifdef STARPU_SIMGRID
+			static uintptr_t last[STARPU_MAXNODES];
 #ifdef STARPU_DEVEL
 #warning TODO: record used memory, using a simgrid property to know the available memory
 #endif
-			/* Sleep 10µs for the allocation */
+			/* Sleep for the allocation */
 			STARPU_PTHREAD_MUTEX_LOCK(&cuda_alloc_mutex);
-			MSG_process_sleep(0.000010);
-			addr = 1;
+			MSG_process_sleep(0.000175);
+			if (!last[dst_node])
+				last[dst_node] = 1<<10;
+			addr = last[dst_node];
+			last[dst_node]+=size;
+			STARPU_ASSERT(last[dst_node] >= addr);
 			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
 #else
 			status = cudaMalloc((void **)&addr, size);
@@ -400,15 +413,21 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size)
 			}
 #endif
 			break;
+		}
 #endif
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
 	        case STARPU_OPENCL_RAM:
-			{
+		{
 #ifdef STARPU_SIMGRID
-				/* Sleep 10µs for the allocation */
+				static uintptr_t last[STARPU_MAXNODES];
+				/* Sleep for the allocation */
 				STARPU_PTHREAD_MUTEX_LOCK(&opencl_alloc_mutex);
-				MSG_process_sleep(0.000010);
-				addr = 1;
+				MSG_process_sleep(0.000175);
+				if (!last[dst_node])
+					last[dst_node] = 1<<10;
+				addr = last[dst_node];
+				last[dst_node]+=size;
+				STARPU_ASSERT(last[dst_node] >= addr);
 				STARPU_PTHREAD_MUTEX_UNLOCK(&opencl_alloc_mutex);
 #else
                                 int ret;
@@ -425,14 +444,14 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size)
 				}
 				break;
 #endif
-			}
+		}
 #endif
 	        case STARPU_DISK_RAM:
 		{
 			addr = (uintptr_t) _starpu_disk_alloc(dst_node, size);
 			break;
 		}
-			
+
 #ifdef STARPU_USE_MIC
 		case STARPU_MIC_RAM:
 			if (_starpu_mic_allocate_memory((void **)(&addr), size, dst_node))
@@ -481,8 +500,8 @@ _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 		{
 #ifdef STARPU_SIMGRID
 			STARPU_PTHREAD_MUTEX_LOCK(&cuda_alloc_mutex);
-			/* Sleep 10µs for the free */
-			MSG_process_sleep(0.000010);
+			/* Sleep for the free */
+			MSG_process_sleep(0.000125);
 			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
 #else
 			cudaError_t err;
@@ -498,8 +517,8 @@ _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 		{
 #ifdef STARPU_SIMGRID
 			STARPU_PTHREAD_MUTEX_LOCK(&opencl_alloc_mutex);
-			/* Sleep 10µs for the free */
-			MSG_process_sleep(0.000010);
+			/* Sleep for the free */
+			MSG_process_sleep(0.000125);
 			STARPU_PTHREAD_MUTEX_UNLOCK(&opencl_alloc_mutex);
 #else
 			cl_int err;
@@ -774,6 +793,7 @@ starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 	{
 		STARPU_ASSERT(prevblock >= 0 && prevblock <= CHUNK_NBLOCKS);
 		nextblock = bitmap[prevblock].next;
+		STARPU_ASSERT_MSG(nextblock != block, "It seems data 0x%lx (size %u) on node %u is being freed a second time\n", (unsigned long) addr, (unsigned) size, dst_node);
 		if (nextblock > block || nextblock == -1)
 			break;
 	}
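Note: in simulation mode the code above no longer returns the constant fake address 1 but hands out distinct addresses from a per-node counter, so later frees and sanity checks remain meaningful. A hedged, standalone sketch of that bump scheme (MAXNODES is an illustrative bound standing in for STARPU_MAXNODES):

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define MAXNODES 16

/* Hand out fake, monotonically increasing device addresses per node.
 * Nothing is really allocated; only bookkeeping (and simulated time) matters. */
static uintptr_t sim_alloc(unsigned node, size_t size)
{
	static uintptr_t last[MAXNODES];

	if (!last[node])
		last[node] = 1 << 10;	/* keep 0 free to mean "no address" */
	uintptr_t addr = last[node];
	last[node] += size;
	assert(last[node] >= addr);	/* catch wrap-around */
	return addr;
}
```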

+ 2 - 0
src/datawizard/malloc.h

@@ -20,4 +20,6 @@
 void _starpu_malloc_init(unsigned dst_node);
 void _starpu_malloc_shutdown(unsigned dst_node);

+void _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size);
+
 #endif

+ 72 - 65
src/datawizard/memalloc.c

@@ -24,7 +24,7 @@

 /* This per-node RW-locks protect mc_list and memchunk_cache entries */
 /* Note: handle header lock is always taken before this */
-static starpu_pthread_rwlock_t mc_rwlock[STARPU_MAXNODES];
+static struct _starpu_spinlock mc_lock[STARPU_MAXNODES];

 /* Potentially in use memory chunks */
 static struct _starpu_mem_chunk_list *mc_list[STARPU_MAXNODES];
@@ -34,6 +34,7 @@ struct mc_cache_entry
 {
 	UT_hash_handle hh;
 	struct _starpu_mem_chunk_list *list;
+	uint32_t footprint;
 };
 static struct mc_cache_entry *mc_cache[STARPU_MAXNODES];

@@ -49,7 +50,7 @@ void _starpu_init_mem_chunk_lists(void)
 	unsigned i;
 	for (i = 0; i < STARPU_MAXNODES; i++)
 	{
-		STARPU_PTHREAD_RWLOCK_INIT(&mc_rwlock[i], NULL);
+		_starpu_spin_init(&mc_lock[i]);
 		mc_list[i] = _starpu_mem_chunk_list_new();
 	}
 }
@@ -67,7 +68,7 @@ void _starpu_deinit_mem_chunk_lists(void)
 			_starpu_mem_chunk_list_delete(entry->list);
 			free(entry);
 		}
-		STARPU_PTHREAD_RWLOCK_DESTROY(&mc_rwlock[i]);
+		_starpu_spin_destroy(&mc_lock[i]);
 	}
 }

@@ -399,7 +400,7 @@ static size_t try_to_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
 }

 #ifdef STARPU_USE_ALLOCATION_CACHE
-/* We assume that mc_rwlock[node] is taken. is_already_in_mc_list indicates
+/* We assume that mc_lock[node] is taken. is_already_in_mc_list indicates
  * that the mc is already in the list of buffers that are possibly used, and
  * therefore not in the cache. */
 static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_replicate, struct _starpu_mem_chunk *mc, unsigned is_already_in_mc_list)
@@ -408,14 +409,6 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re
 	 * of the "to free" list, and reassign it to the new
 	 * piece of data */

-	if (!is_already_in_mc_list)
-	{
-		uint32_t footprint = _starpu_compute_data_footprint(new_replicate->handle);
-		struct mc_cache_entry *entry;
-		HASH_FIND(hh, mc_cache[node], &footprint, sizeof(footprint), entry);
-		_starpu_mem_chunk_list_erase(entry->list, mc);
-	}
-
 	struct _starpu_data_replicate *old_replicate = mc->replicate;
 	old_replicate->allocated = 0;
 	old_replicate->automatically_allocated = 0;
@@ -427,7 +420,7 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re

 	STARPU_ASSERT(new_replicate->data_interface);
 	STARPU_ASSERT(mc->chunk_interface);
-	memcpy(new_replicate->data_interface, mc->chunk_interface, old_replicate->handle->ops->interface_size);
+	memcpy(new_replicate->data_interface, mc->chunk_interface, mc->size_interface);

 	mc->data = new_replicate->handle;
 	/* mc->ops, mc->footprint and mc->interface should be
@@ -484,7 +477,7 @@ static int _starpu_data_interface_compare(void *data_interface_a, struct starpu_
 	return ret;
 }

-/* This function must be called with mc_rwlock[node] taken in write mode */
+/* This function must be called with mc_lock[node] taken */
 static struct _starpu_mem_chunk *_starpu_memchunk_cache_lookup_locked(unsigned node, starpu_data_handle_t handle, uint32_t footprint)
 {
 	/* go through all buffers in the cache */
@@ -517,7 +510,7 @@ static struct _starpu_mem_chunk *_starpu_memchunk_cache_lookup_locked(unsigned n

 /* this function looks for a memory chunk that matches a given footprint in the
  * list of mem chunk that need to be freed. This function must be called with
- * mc_rwlock[node] taken in write mode. */
+ * mc_lock[node] taken. */
 static unsigned try_to_find_reusable_mem_chunk(unsigned node, starpu_data_handle_t data, struct _starpu_data_replicate *replicate, uint32_t footprint)
 {
 	struct _starpu_mem_chunk *mc, *next_mc;
@@ -558,7 +551,7 @@ static unsigned try_to_find_reusable_mem_chunk(unsigned node, starpu_data_handle

 /*
  * Free the memory chuncks that are explicitely tagged to be freed. The
- * mc_rwlock[node] rw-lock should be taken prior to calling this function.
+ * mc_lock[node] rw-lock should be taken prior to calling this function.
  */
 static size_t flush_memchunk_cache(unsigned node, size_t reclaim)
 {
@@ -568,7 +561,7 @@ static size_t flush_memchunk_cache(unsigned node, size_t reclaim)

 	size_t freed = 0;

-	STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
+	_starpu_spin_lock(&mc_lock[node]);
 	HASH_ITER(hh, mc_cache[node], entry, tmp)
 	{
 		busy_mc_cache = _starpu_mem_chunk_list_new();
@@ -600,7 +593,7 @@ static size_t flush_memchunk_cache(unsigned node, size_t reclaim)
 		if (reclaim && freed >= reclaim)
 			break;
 	}
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+	_starpu_spin_unlock(&mc_lock[node]);
 	return freed;
 }

@@ -608,7 +601,7 @@ static size_t flush_memchunk_cache(unsigned node, size_t reclaim)
  * Try to free the buffers currently in use on the memory node. If the force
  * flag is set, the memory is freed regardless of coherency concerns (this
  * should only be used at the termination of StarPU for instance). The
- * mc_rwlock[node] rw-lock should be taken prior to calling this function.
+ * mc_lock[node] should be taken prior to calling this function.
  */
 static size_t free_potentially_in_use_mc(unsigned node, unsigned force, size_t reclaim)
 {
@@ -617,7 +610,7 @@ static size_t free_potentially_in_use_mc(unsigned node, unsigned force, size_t r
 	struct _starpu_mem_chunk *mc, *next_mc;

 	/*
-	 * We have to unlock mc_rwlock before locking header_lock, so we have
+	 * We have to unlock mc_lock before locking header_lock, so we have
 	 * to be careful with the list.  We try to do just one pass, by
 	 * remembering the next mc to be tried. If it gets dropped, we restart
 	 * from zero. So we continue until we go through the whole list without
@@ -625,7 +618,7 @@ static size_t free_potentially_in_use_mc(unsigned node, unsigned force, size_t r
 	 */

 restart:
-	STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
+	_starpu_spin_lock(&mc_lock[node]);

 	for (mc = _starpu_mem_chunk_list_begin(mc_list[node]);
 	     mc != _starpu_mem_chunk_list_end(mc_list[node]);
@@ -652,7 +645,7 @@ restart:
 				 * still locking the handle. That's not
 				 * supposed to happen, but better be safe by
 				 * letting it go through. */
-				STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+				_starpu_spin_unlock(&mc_lock[node]);
 				goto restart;
 			}

@@ -664,7 +657,7 @@ restart:
 			_starpu_spin_unlock(&handle->header_lock);
 		}
 	}
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+	_starpu_spin_unlock(&mc_lock[node]);

 	return freed;
 }
@@ -721,6 +714,7 @@ static struct _starpu_mem_chunk *_starpu_memchunk_init(struct _starpu_data_repli

 	/* Save a copy of the interface */
 	mc->chunk_interface = malloc(interface_size);
+	mc->size_interface = interface_size;
 	STARPU_ASSERT(mc->chunk_interface);
 	memcpy(mc->chunk_interface, replicate->data_interface, interface_size);

@@ -739,11 +733,11 @@ static void register_mem_chunk(struct _starpu_data_replicate *replicate, unsigne
 	/* Put this memchunk in the list of memchunk in use */
 	mc = _starpu_memchunk_init(replicate, interface_size, automatically_allocated);

-	STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[dst_node]);
+	_starpu_spin_lock(&mc_lock[dst_node]);

 	_starpu_mem_chunk_list_push_back(mc_list[dst_node], mc);

-	STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[dst_node]);
+	_starpu_spin_unlock(&mc_lock[dst_node]);
 }

 /* This function is called when the handle is destroyed (eg. when calling
@@ -766,13 +760,13 @@ void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _star
 	replicate->allocated = 0;
 	replicate->automatically_allocated = 0;

-	STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
+	_starpu_spin_lock(&mc_lock[node]);

 	mc->data = NULL;
 	/* remove it from the main list */
 	_starpu_mem_chunk_list_erase(mc_list[node], mc);

-	STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+	_starpu_spin_unlock(&mc_lock[node]);

 	/* We would only flush the RAM nodes cache if memory gets tight, either
 	 * because StarPU automatically knows the total memory size of the
@@ -798,15 +792,16 @@ void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _star
 		/* put it in the list of buffers to be removed */
 		uint32_t footprint = mc->footprint;
 		struct mc_cache_entry *entry;
-		STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
+		_starpu_spin_lock(&mc_lock[node]);
 		HASH_FIND(hh, mc_cache[node], &footprint, sizeof(footprint), entry);
 		if (!entry) {
 			entry = malloc(sizeof(*entry));
 			entry->list = _starpu_mem_chunk_list_new();
-			HASH_ADD_KEYPTR(hh, mc_cache[node], &footprint, sizeof(footprint), entry);
+			entry->footprint = footprint;
+			HASH_ADD(hh, mc_cache[node], footprint, sizeof(entry->footprint), entry);
 		}
 		_starpu_mem_chunk_list_push_front(entry->list, mc);
-		STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+		_starpu_spin_unlock(&mc_lock[node]);
 	}
 }

@@ -838,26 +833,34 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 	uint32_t footprint = _starpu_compute_data_footprint(handle);

 	_STARPU_TRACE_START_ALLOC_REUSE(dst_node, data_size);
-	STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[dst_node]);
+	_starpu_spin_lock(&mc_lock[dst_node]);

 	if (try_to_find_reusable_mem_chunk(dst_node, handle, replicate, footprint))
 	{
-		STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[dst_node]);
+		_starpu_spin_unlock(&mc_lock[dst_node]);
 		_starpu_allocation_cache_hit(dst_node);
 		return data_size;
 	}

-	STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[dst_node]);
+	_starpu_spin_unlock(&mc_lock[dst_node]);
 	_STARPU_TRACE_END_ALLOC_REUSE(dst_node);
 #endif
+	STARPU_ASSERT(handle->ops);
+	STARPU_ASSERT(handle->ops->allocate_data_on_node);
+	STARPU_ASSERT(replicate->data_interface);
+
+	char data_interface[handle->ops->interface_size];
+
+	memcpy(data_interface, replicate->data_interface, handle->ops->interface_size);
+
+	/* Take temporary reference on the replicate */
+	replicate->refcnt++;
+	handle->busy_count++;
+	_starpu_spin_unlock(&handle->header_lock);

 	do
 	{
-		STARPU_ASSERT(handle->ops);
-		STARPU_ASSERT(handle->ops->allocate_data_on_node);
-
 		_STARPU_TRACE_START_ALLOC(dst_node, data_size);
-		STARPU_ASSERT(replicate->data_interface);

 #if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 		if (starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM)
@@ -870,7 +873,7 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 		}
 #endif

-		allocated_memory = handle->ops->allocate_data_on_node(replicate->data_interface, dst_node);
+		allocated_memory = handle->ops->allocate_data_on_node(data_interface, dst_node);
 		_STARPU_TRACE_END_ALLOC(dst_node);

 		if (allocated_memory == -ENOMEM)
@@ -880,11 +883,6 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 			if (starpu_memstrategy_data_size_coefficient*handle_size > reclaim)
 				reclaim = starpu_memstrategy_data_size_coefficient*handle_size;

-			/* Take temporary reference on the replicate */
-			replicate->refcnt++;
-			handle->busy_count++;
-			_starpu_spin_unlock(&handle->header_lock);
-
 			_STARPU_TRACE_START_MEMRECLAIM(dst_node,is_prefetch);
 			if (is_prefetch)
 			{
@@ -893,26 +891,35 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 			else
 				_starpu_memory_reclaim_generic(dst_node, 0, reclaim);
 			_STARPU_TRACE_END_MEMRECLAIM(dst_node,is_prefetch);
+		}
+	}
+	while((allocated_memory == -ENOMEM) && attempts++ < 2);

-			int cpt = 0;
-			while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
-			{
-				cpt++;
-				_starpu_datawizard_progress(_starpu_memory_node_get_local_key(), 0);
-			}
-			if (cpt == STARPU_SPIN_MAXTRY)
-				_starpu_spin_lock(&handle->header_lock);
+	int cpt = 0;
+	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
+	{
+		cpt++;
+		_starpu_datawizard_progress(_starpu_memory_node_get_local_key(), 0);
+	}
+	if (cpt == STARPU_SPIN_MAXTRY)
+		_starpu_spin_lock(&handle->header_lock);

-			replicate->refcnt--;
-			STARPU_ASSERT(replicate->refcnt >= 0);
-			STARPU_ASSERT(handle->busy_count > 0);
-			handle->busy_count--;
-			ret = _starpu_data_check_not_busy(handle);
-			STARPU_ASSERT(ret == 0);
-		}
+	replicate->refcnt--;
+	STARPU_ASSERT(replicate->refcnt >= 0);
+	STARPU_ASSERT(handle->busy_count > 0);
+	handle->busy_count--;
+	ret = _starpu_data_check_not_busy(handle);
+	STARPU_ASSERT(ret == 0);

+	if (replicate->allocated)
+	{
+		/* Argl, somebody allocated it in between already, drop this one */
+		handle->ops->free_data_on_node(data_interface, dst_node);
+		allocated_memory = 0;
 	}
-	while((allocated_memory == -ENOMEM) && attempts++ < 2);
+	else
+		/* Install allocated interface */
+		memcpy(replicate->data_interface, data_interface, handle->ops->interface_size);

 	return allocated_memory;
 }
@@ -941,9 +948,9 @@ int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_
 	replicate->allocated = 1;
 	replicate->automatically_allocated = 1;

-	if (dst_node == 0)
+	if (dst_node == STARPU_MAIN_RAM)
 	{
-		void *ptr = starpu_data_handle_to_pointer(handle, 0);
+		void *ptr = starpu_data_handle_to_pointer(handle, STARPU_MAIN_RAM);
 		if (ptr != NULL)
 		{
 			_starpu_data_register_ram_pointer(handle, ptr);
@@ -965,16 +972,16 @@ void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node)
 	if (!mc)
 		/* user-allocated memory */
 		return;
-	STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
+	_starpu_spin_lock(&mc_lock[node]);
 	_starpu_mem_chunk_list_erase(mc_list[node], mc);
 	_starpu_mem_chunk_list_push_back(mc_list[node], mc);
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+	_starpu_spin_unlock(&mc_lock[node]);
 }

 #ifdef STARPU_MEMORY_STATS
 void _starpu_memory_display_stats_by_node(int node)
 {
-	STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
+	_starpu_spin_lock(&mc_lock[node]);

 	if (!_starpu_mem_chunk_list_empty(mc_list[node]))
 	{
@@ -993,7 +1000,7 @@ void _starpu_memory_display_stats_by_node(int node)

 	}

-	STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+	_starpu_spin_unlock(&mc_lock[node]);
 }
 #endif


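Note: the restructuring above performs the (possibly slow) driver allocation on a private copy of the interface, outside the handle lock, and only installs it afterwards if nobody allocated the replicate in the meantime. A simplified, hypothetical sketch of that publish-or-discard step (all names are stand-ins, not StarPU symbols):

```c
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct replicate
{
	pthread_mutex_t lock;	/* stands in for the handle header lock */
	int allocated;
	void *iface;		/* interface description, fixed size */
	size_t iface_size;
};

/* dev_alloc()/dev_free() stand in for the per-interface allocation hooks. */
static int dev_alloc(void *iface) { (void)iface; return 0; }
static void dev_free(void *iface) { (void)iface; }

static int allocate_outside_lock(struct replicate *r)
{
	void *scratch = malloc(r->iface_size);
	memcpy(scratch, r->iface, r->iface_size);	/* private copy */

	int ret = dev_alloc(scratch);			/* slow part, lock not held */

	pthread_mutex_lock(&r->lock);
	if (ret == 0 && r->allocated)
		dev_free(scratch);			/* raced: drop our allocation */
	else if (ret == 0)
	{
		memcpy(r->iface, scratch, r->iface_size);	/* publish */
		r->allocated = 1;
	}
	pthread_mutex_unlock(&r->lock);
	free(scratch);
	return ret;
}
```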
+ 2 - 1
src/datawizard/memalloc.h

@@ -41,9 +41,10 @@ LIST_TYPE(_starpu_mem_chunk,
 	 */
 	struct starpu_data_interface_ops *ops;
 	void *chunk_interface;
+	size_t size_interface;
 	unsigned automatically_allocated;

-	/* the size is only set when calling _starpu_request_mem_chunk_removal(),
+	/* the size of the data is only set when calling _starpu_request_mem_chunk_removal(),
          * it is needed by free_memory_on_node() which is called when
          * the handle is no longer valid. It should not be used otherwise.
 	 */

+ 1 - 1
src/datawizard/user_interactions.c

@@ -191,7 +191,7 @@ int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, unsigned node,
 int starpu_data_acquire_cb(starpu_data_handle_t handle,
 			   enum starpu_data_access_mode mode, void (*callback)(void *), void *arg)
 {
-	return starpu_data_acquire_on_node_cb(handle, 0, mode, callback, arg);
+	return starpu_data_acquire_on_node_cb(handle, STARPU_MAIN_RAM, mode, callback, arg);
 }

 int starpu_data_acquire_cb_sequential_consistency(starpu_data_handle_t handle,
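Note: starpu_data_acquire_cb() now forwards to the main-RAM node explicitly; its use from an application is unchanged. A short usage reminder (the variable, callback body and printed text are only illustrative):

```c
#include <stdio.h>
#include <starpu.h>

static float value;
static starpu_data_handle_t value_handle;

static void read_it(void *arg)
{
	(void)arg;
	/* Inside the callback the main-RAM copy is valid and may be read. */
	printf("value = %f\n", value);
	starpu_data_release(value_handle);
}

void print_when_ready(void)
{
	starpu_variable_data_register(&value_handle, STARPU_MAIN_RAM,
				      (uintptr_t)&value, sizeof(value));
	/* Asynchronous: the callback runs once all previously submitted tasks
	 * on the handle are done and the data is available in main memory. */
	starpu_data_acquire_cb(value_handle, STARPU_R, read_it, NULL);
}
```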

+ 5 - 0
src/debug/traces/starpu_fxt.c

@@ -1380,9 +1380,14 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 				handle_worker_status(&ev, options, "P");
 				break;

+			case _STARPU_FUT_START_UNPARTITION:
+				handle_worker_status(&ev, options, "U");
+				break;
+
 			case _STARPU_FUT_END_FETCH_INPUT:
 			case _STARPU_FUT_END_PROGRESS:
 			case _STARPU_FUT_END_PUSH_OUTPUT:
+			case _STARPU_FUT_END_UNPARTITION:
 				handle_worker_status(&ev, options, "B");
 				break;


+ 7 - 3
src/debug/traces/starpu_paje.c

@@ -163,6 +163,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 	poti_DefineEntityValue("B", "S", "Overhead", ".5 .18 .0");
 	poti_DefineEntityValue("Sl", "S", "Sleeping", ".9 .1 .0");
 	poti_DefineEntityValue("P", "S", "Progressing", ".4 .1 .6");
+	poti_DefineEntityValue("U", "S", "Unpartitioning", ".0 .0 1.0");
 
 
 	/* Types for the MPI Communication Thread of the Memory Node */
 	/* Types for the MPI Communication Thread of the Memory Node */
 	poti_DefineEventType("MPIev", "MPICt", "MPI event type");
 	poti_DefineEventType("MPIev", "MPICt", "MPI event type");
@@ -190,6 +191,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 		poti_DefineEntityValue("B", ctx, "Overhead", ".5 .18 .0");
 		poti_DefineEntityValue("B", ctx, "Overhead", ".5 .18 .0");
 		poti_DefineEntityValue("Sl", ctx, "Sleeping", ".9 .1 .0");
 		poti_DefineEntityValue("Sl", ctx, "Sleeping", ".9 .1 .0");
 		poti_DefineEntityValue("P", ctx, "Progressing", ".4 .1 .6");
 		poti_DefineEntityValue("P", ctx, "Progressing", ".4 .1 .6");
+		poti_DefineEntityValue("U", ctx, "Unpartitioning", ".0 .0 1.0");
 	}
 	}
 
 
 	/* Types for the Scheduler */
 	/* Types for the Scheduler */
@@ -228,7 +230,8 @@ void _starpu_fxt_write_paje_header(FILE *file)
 6       C       S       Callback       \".0 .3 .8\"            \n\
 6       C       S       Callback       \".0 .3 .8\"            \n\
 6       B       S       Overhead         \".5 .18 .0\"		\n\
 6       B       S       Overhead         \".5 .18 .0\"		\n\
 6       Sl       S      Sleeping         \".9 .1 .0\"		\n\
 6       Sl       S      Sleeping         \".9 .1 .0\"		\n\
-6       P       S       Progressing         \".4 .1 .6\"		\n");
+6       P       S       Progressing         \".4 .1 .6\"		\n\
+6       U       S       Unpartitioning      \".0 .0 1.0\"		\n");
 	fprintf(file, "\
 	fprintf(file, "\
 6       P       CtS       Processing         \"0 0 0\"		\n\
 6       P       CtS       Processing         \"0 0 0\"		\n\
 6       Sl       CtS      Sleeping         \".9 .1 .0\"		\n\
 6       Sl       CtS      Sleeping         \".9 .1 .0\"		\n\
@@ -247,8 +250,9 @@ void _starpu_fxt_write_paje_header(FILE *file)
 6       C       Ctx%u       Callback       \".0 .3 .8\"            \n\
 6       C       Ctx%u       Callback       \".0 .3 .8\"            \n\
 6       B       Ctx%u       Overhead         \".5 .18 .0\"		\n\
 6       B       Ctx%u       Overhead         \".5 .18 .0\"		\n\
 6       Sl       Ctx%u      Sleeping         \".9 .1 .0\"		\n\
 6       Sl       Ctx%u      Sleeping         \".9 .1 .0\"		\n\
-6       P       Ctx%u       Progressing         \".4 .1 .6\"		\n",
-		i, i, i, i, i, i, i, i);
+6       P       Ctx%u       Progressing         \".4 .1 .6\"		\n\
+6       U       Ctx%u       Unpartitioning         \".0 .0 1.0\"		\n",
+		i, i, i, i, i, i, i, i, i);
 	fprintf(file, "\
 	fprintf(file, "\
 6       A       MS      Allocating         \".4 .1 .0\"		\n\
 6       A       MS      Allocating         \".4 .1 .0\"		\n\
 6       Ar       MS      AllocatingReuse       \".1 .1 .8\"		\n\
 6       Ar       MS      AllocatingReuse       \".1 .1 .8\"		\n\

+ 2 - 3
src/drivers/cpu/driver_cpu.c

@@ -190,7 +190,7 @@ _starpu_get_worker_from_driver(struct starpu_driver *d)
 	return _starpu_get_worker_struct(n);
 }
 
-static size_t _starpu_cpu_get_global_mem_size(int nodeid, struct _starpu_machine_config *config)
+static size_t _starpu_cpu_get_global_mem_size(int nodeid STARPU_ATTRIBUTE_UNUSED, struct _starpu_machine_config *config)
 {
 	size_t global_mem;
 	starpu_ssize_t limit;
@@ -201,12 +201,11 @@ static size_t _starpu_cpu_get_global_mem_size(int nodeid, struct _starpu_machine
 #endif
 
 #if defined(STARPU_HAVE_HWLOC)
-        int depth_node;
 	struct _starpu_machine_topology *topology = &config->topology;
 
 #if 0
 	/* Do not limit ourself to a single NUMA node yet, as we don't have real NUMA support for now */
-        depth_node = hwloc_get_type_depth(topology->hwtopology, HWLOC_OBJ_NODE);
+        int depth_node = hwloc_get_type_depth(topology->hwtopology, HWLOC_OBJ_NODE);
 
 	if (depth_node == HWLOC_TYPE_DEPTH_UNKNOWN)
 	     global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;

+ 7 - 6
src/drivers/cuda/driver_cuda.c

@@ -80,7 +80,8 @@ static void _starpu_cuda_limit_gpu_mem_if_needed(unsigned devid)
 	char name[30];
 
 #ifdef STARPU_USE_CUDA
-	global_mem[devid] = props[devid].totalGlobalMem;
+	/* Find the size of the memory on the device */
+	totalGlobalMem = props[devid].totalGlobalMem;
 #endif
 
 	limit = starpu_get_env_number("STARPU_LIMIT_CUDA_MEM");
@@ -89,17 +90,17 @@ static void _starpu_cuda_limit_gpu_mem_if_needed(unsigned devid)
 		sprintf(name, "STARPU_LIMIT_CUDA_%u_MEM", devid);
 		limit = starpu_get_env_number(name);
 	}
+#ifdef STARPU_USE_CUDA
 	if (limit == -1)
 	{
-		return;
+		/* Use 90% of the available memory by default.  */
+		limit = totalGlobalMem / (1024*1024) * 0.9;
 	}
+#endif
 
 	global_mem[devid] = limit * 1024*1024;
 
 #ifdef STARPU_USE_CUDA
-	/* Find the size of the memory on the device */
-	totalGlobalMem = props[devid].totalGlobalMem;
-
 	/* How much memory to waste ? */
 	to_waste = totalGlobalMem - global_mem[devid];
 
@@ -201,7 +202,7 @@ static void init_context(unsigned devid)
 	starpu_cuda_set_device(devid);
 
 #ifdef HAVE_CUDA_MEMCPY_PEER
-	if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") > 0)
+	if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") != 0)
 	{
 		int nworkers = starpu_worker_get_count();
 		for (workerid = 0; workerid < nworkers; workerid++)

+ 1 - 0
src/drivers/disk/driver_disk.c

@@ -17,6 +17,7 @@
 #include <starpu.h>
 #include <core/disk.h>
 #include <starpu_profiling.h>
+#include <drivers/disk/driver_disk.h>
 
 int _starpu_disk_copy_src_to_disk(void * src, unsigned src_node, void * dst, size_t dst_offset, unsigned dst_node, size_t size, void * async_channel)
 {

+ 6 - 5
src/drivers/opencl/driver_opencl.c

@@ -81,14 +81,15 @@ static void _starpu_opencl_limit_gpu_mem_if_needed(unsigned devid)
 		sprintf(name, "STARPU_LIMIT_OPENCL_%u_MEM", devid);
 		limit = starpu_get_env_number(name);
 	}
+#ifdef STARPU_USE_OPENCL
 	if (limit == -1)
 	{
-		global_mem[devid] = totalGlobalMem;
-	}
-	else
-	{
-		global_mem[devid] = limit * 1024*1024;
+		/* Use 90% of the available memory by default.  */
+		limit = totalGlobalMem / (1024*1024) * 0.9;
 	}
+#endif
+
+	global_mem[devid] = limit * 1024*1024;
 
 #ifdef STARPU_USE_OPENCL
 	/* How much memory to waste ? */

+ 2 - 1
src/profiling/profiling.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -69,5 +69,6 @@ void _starpu_profiling_set_task_push_end_time(struct starpu_task *task);
 void _starpu_profiling_init(void);
 
 void _starpu_profiling_terminate(void);
+void _starpu_profiling_reset_counters();
 
 #endif // __PROFILING_H__

+ 1 - 0
src/sched_policies/eager_central_priority_policy.c

@@ -230,6 +230,7 @@ static struct starpu_task *_starpu_priority_pop_task(unsigned sched_ctx_id)
 						taskq->ntasks[priolevel]--;
 						taskq->total_ntasks--;
 						_STARPU_TRACE_JOB_POP(task, 0);
+						break;
 					} else skipped = 1;
 				}
 			}

+ 1 - 2
src/sched_policies/parallel_eager.c

@@ -157,15 +157,14 @@ static int push_task_peager_policy(struct starpu_task *task)
 
         /*if there are no tasks block */
         /* wake people waiting for a task */
-        int worker = -1;
         struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
 
         struct starpu_sched_ctx_iterator it;
         if(workers->init_iterator)
                 workers->init_iterator(workers, &it);
 
-
 #ifndef STARPU_NON_BLOCKING_DRIVERS
+        int worker = -1;
 	while(workers->has_next(workers, &it))
 	{
 		worker = workers->get_next(workers, &it);

+ 12 - 0
src/sched_policies/parallel_heft.c

@@ -33,6 +33,10 @@
 #define DBL_MAX __DBL_MAX__
 #endif
 
+/* if no priority is set when creating the scheduling context, we use the following ones */
+#define DEFAULT_MIN_PRIORITY 0
+#define DEFAULT_MAX_PRIORITY 1
+
 //static unsigned ncombinedworkers;
 //static enum starpu_perfmodel_archtype applicable_perf_archtypes[STARPU_NARCH_VARIATIONS];
 //static unsigned napplicable_perf_archtypes = 0;
@@ -552,6 +556,14 @@ static void initialize_parallel_heft_policy(unsigned sched_ctx_id)
 	hd->_gamma = _STARPU_SCHED_GAMMA_DEFAULT;
 	hd->idle_power = 0.0;
 
+	if (starpu_sched_ctx_min_priority_is_set(sched_ctx_id) == 0)
+		starpu_sched_ctx_set_min_priority(sched_ctx_id, DEFAULT_MIN_PRIORITY);
+	if (starpu_sched_ctx_max_priority_is_set(sched_ctx_id) == 0)
+		starpu_sched_ctx_set_max_priority(sched_ctx_id, DEFAULT_MAX_PRIORITY);
+	STARPU_ASSERT_MSG(starpu_sched_ctx_get_min_priority(sched_ctx_id) < starpu_sched_ctx_get_max_priority(sched_ctx_id),
+			  "Priority min %d should be lower than priority max %d\n",
+			  starpu_sched_ctx_get_min_priority(sched_ctx_id), starpu_sched_ctx_get_max_priority(sched_ctx_id));
+
 	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)hd);
 
 	const char *strval_alpha = getenv("STARPU_SCHED_ALPHA");

+ 1 - 0
src/sched_policies/work_stealing_policy.c

@@ -332,6 +332,7 @@ static struct starpu_task *ws_pop_task(unsigned sched_ctx_id)
 	return task;
 }
 
+static
 int ws_push_task(struct starpu_task *task)
 {
 	unsigned sched_ctx_id = task->sched_ctx;

+ 1 - 0
src/util/starpu_task_insert.c

@@ -64,6 +64,7 @@ void starpu_codelet_unpack_args(void *_cl_arg, ...)
 	va_end(varg_list);
 }
 
+static
 int _starpu_task_insert_v(struct starpu_codelet *cl, va_list varg_list)
 {
 	void *arg_buffer = NULL;

+ 0 - 158
src/util/starpu_task_list_inline.h

@@ -1,158 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#ifndef __STARPU_TASK_LIST_INLINE_H
-#define __STARPU_TASK_LIST_INLINE_H
-
-#include <starpu_task.h>
-
-#ifndef STARPU_INLINE
-#ifdef __GNUC_GNU_INLINE__
-#define STARPU_INLINE extern inline
-#else
-#define STARPU_INLINE static inline
-#endif
-#endif
-
-STARPU_INLINE
-void starpu_task_list_init(struct starpu_task_list *list)
-{
-	list->head = NULL;
-	list->tail = NULL;
-}
-
-STARPU_INLINE
-void starpu_task_list_push_front(struct starpu_task_list *list,
-				struct starpu_task *task)
-{
-	if (list->tail == NULL)
-	{
-		list->tail = task;
-	}
-	else
-	{
-		list->head->prev = task;
-	}
-
-	task->prev = NULL;
-	task->next = list->head;
-	list->head = task;
-}
-
-STARPU_INLINE
-void starpu_task_list_push_back(struct starpu_task_list *list,
-				struct starpu_task *task)
-{
-	if (list->head == NULL)
-	{
-		list->head = task;
-	}
-	else
-	{
-		list->tail->next = task;
-	}
-
-	task->next = NULL;
-	task->prev = list->tail;
-	list->tail = task;
-}
-
-STARPU_INLINE
-struct starpu_task *starpu_task_list_front(struct starpu_task_list *list)
-{
-	return list->head;
-}
-
-STARPU_INLINE
-struct starpu_task *starpu_task_list_back(struct starpu_task_list *list)
-{
-	return list->tail;
-}
-
-STARPU_INLINE
-int starpu_task_list_empty(struct starpu_task_list *list)
-{
-	return (list->head == NULL);
-}
-
-STARPU_INLINE
-void starpu_task_list_erase(struct starpu_task_list *list,
-				struct starpu_task *task)
-{
-	struct starpu_task *p = task->prev;
-
-	if (p)
-	{
-		p->next = task->next;
-	}
-	else
-	{
-		list->head = task->next;
-	}
-
-	if (task->next)
-	{
-		task->next->prev = p;
-	}
-	else
-	{
-		list->tail = p;
-	}
-
-	task->prev = NULL;
-	task->next = NULL;
-}
-
-STARPU_INLINE
-struct starpu_task *starpu_task_list_pop_front(struct starpu_task_list *list)
-{
-	struct starpu_task *task = list->head;
-
-	if (task)
-		starpu_task_list_erase(list, task);
-
-	return task;
-}
-
-STARPU_INLINE
-struct starpu_task *starpu_task_list_pop_back(struct starpu_task_list *list)
-{
-	struct starpu_task *task = list->tail;
-
-	if (task)
-		starpu_task_list_erase(list, task);
-
-	return task;
-}
-
-STARPU_INLINE
-struct starpu_task *starpu_task_list_begin(struct starpu_task_list *list)
-{
-	return list->head;
-}
-
-STARPU_INLINE
-struct starpu_task *starpu_task_list_end(struct starpu_task_list *list STARPU_ATTRIBUTE_UNUSED)
-{
-	return NULL;
-}
-
-STARPU_INLINE
-struct starpu_task *starpu_task_list_next(struct starpu_task *task)
-{
-	return task->next;
-}
-#endif /* __STARPU_TASK_LIST_INLINE_H */

+ 2 - 1
tests/Makefile.am

@@ -23,6 +23,7 @@ AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@ $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFL
 EXTRA_DIST =					\
 	helper.h				\
 	datawizard/scal.h			\
+	datawizard/mpi_like.h			\
 	microbenchs/tasks_size_overhead.sh	\
 	microbenchs/tasks_size_overhead.gp	\
 	datawizard/scratch_opencl_kernel.cl     \
@@ -61,7 +62,7 @@ if STARPU_USE_CUDA
 # TODO define NVCCFLAGS
 NVCC ?= nvcc
 
-NVCCFLAGS += -I$(top_srcdir)/include/ -I$(top_srcdir)/src -I$(top_builddir)/src -I$(top_builddir)/include $(HWLOC_CFLAGS) $(SIMGRID_CFLAGS)
+NVCCFLAGS += -I$(top_srcdir)/include/ -I$(top_srcdir)/src -I$(top_builddir)/src -I$(top_builddir)/src/common -I$(top_builddir)/include $(HWLOC_CFLAGS) $(SIMGRID_CFLAGS)
 
 .cu.cubin:
 	$(MKDIR_P) `dirname $@`

+ 1 - 0
tests/datawizard/acquire_cb.c

@@ -20,6 +20,7 @@
 unsigned token = 0;
 starpu_data_handle_t token_handle;
 
+static
 void callback(void *arg STARPU_ATTRIBUTE_UNUSED)
 {
 	token = 42;

+ 1 - 0
tests/datawizard/acquire_cb_insert.c

@@ -58,6 +58,7 @@ struct starpu_codelet work =
 static int x;
 static starpu_data_handle_t x_handle, f_handle;
 
+static
 void callback(void *arg)
 {
 	starpu_task_insert(&work, STARPU_W, starpu_data_get_sub_data(f_handle, 1, x), 0);

+ 2 - 0
tests/datawizard/acquire_release.c

@@ -55,6 +55,7 @@ static struct starpu_codelet increment_cl =
 unsigned token = 0;
 starpu_data_handle_t token_handle;
 
+static
 int increment_token(void)
 {
 	int ret;
@@ -66,6 +67,7 @@ int increment_token(void)
 	return ret;
 }
 
+static
 void callback(void *arg STARPU_ATTRIBUTE_UNUSED)
 {
         starpu_data_release(token_handle);

+ 2 - 0
tests/datawizard/acquire_release2.c

@@ -55,6 +55,7 @@ static struct starpu_codelet increment_cl =
 unsigned token = 0;
 starpu_data_handle_t token_handle;
 
+static
 int increment_token(int synchronous)
 {
 	struct starpu_task *task = starpu_task_create();
@@ -64,6 +65,7 @@ int increment_token(int synchronous)
 	return starpu_task_submit(task);
 }
 
+static
 void callback(void *arg STARPU_ATTRIBUTE_UNUSED)
 {
         starpu_data_release(token_handle);

+ 3 - 3
tests/datawizard/allocate.c

@@ -31,6 +31,7 @@ int main(int argc, char **argv)
 }
 #else
 
+static
 int test_prefetch(unsigned memnodes)
 {
 	int ret;
@@ -114,14 +115,13 @@ int test_prefetch(unsigned memnodes)
 	{
 		available_size = starpu_memory_get_available(i);
 		FPRINTF(stderr, "Available memory size on node %u: %ld\n", i, available_size);
-#ifndef STARPU_USE_ALLOCATION_CACHE
-		STARPU_CHECK_RETURN_VALUE_IS((int)available_size, SIZE_ALLOC*1024*1024, "starpu_memory_get_available (node %u)", i);
-#endif
+		/* STARPU_CHECK_RETURN_VALUE_IS((int)available_size, SIZE_ALLOC*1024*1024, "starpu_memory_get_available (node %u)", i); */
 	}
 
 	return 0;
 }
 
+static
 void test_malloc()
 {
 	int ret;

+ 1 - 7
tests/datawizard/commute.c

@@ -32,8 +32,6 @@ static struct starpu_codelet codelet_begin =
 	.nbuffers = 1,
 };
 
-
-
 void commute1(void *descr[], void *_args STARPU_ATTRIBUTE_UNUSED)
 {
 	int *x = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
@@ -49,8 +47,6 @@ static struct starpu_codelet codelet_commute1 =
 	.modes = {STARPU_RW | STARPU_COMMUTE}
 };
 
-
-
 void commute2(void *descr[], void *_args STARPU_ATTRIBUTE_UNUSED)
 {
 	int *x = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
@@ -78,8 +74,6 @@ static struct starpu_codelet codelet_commute3 =
 	.modes = {STARPU_RW | STARPU_COMMUTE}
 };
 
-
-
 static struct starpu_codelet codelet_end;
 void end(void *descr[], void *_args STARPU_ATTRIBUTE_UNUSED)
 {
@@ -107,7 +101,7 @@ static void test(enum starpu_data_access_mode begin_mode, enum starpu_data_acces
 	int ret;
 
 	codelet_begin.modes[0] = begin_mode;
-	codelet_end.modes[0] = end_mode;	
+	codelet_end.modes[0] = end_mode;
 
 	begin_t = starpu_task_create();
 	begin_t->cl = &codelet_begin;

+ 1 - 1
tests/datawizard/critical_section_with_void_interface.c

@@ -27,7 +27,7 @@ starpu_data_handle_t void_handle;
 
 int critical_var;
 
-static void critical_section(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+void critical_section(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 {
 	/* We do not protect this variable because it is only accessed when the
 	 * "void_handle" piece of data is accessed. */

+ 2 - 1
tests/datawizard/cuda_codelet_unsigned_inc.cu

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,6 +16,7 @@
  */
 
 #include <starpu.h>
+#include "./mpi_like.h"
 
 static __global__ void _cuda_unsigned_inc(unsigned *val)
 {

+ 1 - 1
tests/datawizard/data_implicit_deps.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
tests/datawizard/data_invalidation.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012  Université de Bordeaux 1
- * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 2 - 1
tests/datawizard/dining_philosophers.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -38,6 +38,7 @@ static struct starpu_codelet eating_cl =
 	.nbuffers = 2
 };
 
+static
 int submit_one_task(unsigned p)
 {
 	struct starpu_task *task = starpu_task_create();

+ 1 - 3
tests/datawizard/increment_redux.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -98,8 +98,6 @@ static void neutral_opencl_kernel(void *descr[], void *arg)
 }
 #endif
 
-
-
 void redux_cpu_kernel(void *descr[], void *arg)
 {
 	STARPU_SKIP_IF_VALGRIND;

+ 1 - 3
tests/datawizard/increment_redux_lazy.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -88,8 +88,6 @@ static void neutral_opencl_kernel(void *descr[], void *arg)
 }
 #endif
 
-
-
 void redux_cpu_kernel(void *descr[], void *arg)
 {
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);

+ 0 - 2
tests/datawizard/increment_redux_v2.c

@@ -97,8 +97,6 @@ static void neutral_opencl_kernel(void *descr[], void *arg)
 }
 #endif
 
-
-
 void redux_cpu_kernel(void *descr[], void *arg)
 {
 	STARPU_SKIP_IF_VALGRIND;

+ 0 - 1
tests/datawizard/interfaces/block/block_interface.c

@@ -33,7 +33,6 @@ extern void test_block_cuda_func(void *buffers[], void *_args);
 extern void test_block_opencl_func(void *buffers[], void *args);
 #endif
 
-
 static starpu_data_handle_t _block_handle;
 static starpu_data_handle_t _block2_handle;
 

+ 2 - 0
tests/datawizard/interfaces/multiformat/advanced/generic.c

@@ -73,6 +73,7 @@ void opencl_func(void *buffers[], void *args)
 	global_stats.opencl++;
 }
 
+static
 void cpu_to_opencl_func(void *buffers[], void *args)
 {
 	STARPU_SKIP_IF_VALGRIND;
@@ -80,6 +81,7 @@ void cpu_to_opencl_func(void *buffers[], void *args)
 	global_stats.cpu_to_opencl++;
 }
 
+static
 void opencl_to_cpu_func(void *buffers[], void *args)
 {
 	STARPU_SKIP_IF_VALGRIND;

+ 2 - 2
tests/datawizard/lazy_unregister.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,7 +21,7 @@
 
 #include "../helper.h"
 
-static void dummy_func(void ** buffers, void * args)
+void dummy_func(void ** buffers, void * args)
 {
 	(void) buffers;
 	(void) args;

+ 2 - 2
tests/datawizard/manual_reduction.c

@@ -75,7 +75,7 @@ static void initialize_per_worker_handle(void *arg STARPU_ATTRIBUTE_UNUSED)
  *	Implement reduction method
  */
 
-static void cpu_redux_func(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
+void cpu_redux_func(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
 {
 	unsigned *a = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *b = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
@@ -98,7 +98,7 @@ static struct starpu_codelet reduction_codelet =
  *	Use per-worker local copy
  */
 
-static void cpu_func_incr(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
+void cpu_func_incr(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
 {
 	unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	*val = *val + 1;

+ 4 - 8
tests/datawizard/mpi_like.c

@@ -18,8 +18,9 @@
 #include <config.h>
 #include <starpu.h>
 #include <errno.h>
-#include "../helper.h"
 #include <common/thread.h>
+#include "../helper.h"
+#include "./mpi_like.h"
 
 #define NTHREADS	4
 #define NITER		2
@@ -46,12 +47,6 @@ static struct thread_data problem_data[NTHREADS];
 /* We implement some ring transfer, every thread will try to receive a piece of
  * data from its neighbour and increment it before transmitting it to its
  * successor. */
-#ifdef STARPU_USE_CUDA
-void cuda_codelet_unsigned_inc(void *descr[], STARPU_ATTRIBUTE_UNUSED void *cl_arg);
-#endif
-#ifdef STARPU_USE_OPENCL
-void opencl_codelet_unsigned_inc(void *buffers[], void *args);
-#endif
 
 void increment_handle_cpu_kernel(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
 {
@@ -73,7 +68,8 @@ static struct starpu_codelet increment_handle_cl =
 	.nbuffers = 1
 };
 
-static void increment_handle(struct thread_data *thread_data)
+static
+void increment_handle(struct thread_data *thread_data)
 {
 	struct starpu_task *task = starpu_task_create();
 	task->cl = &increment_handle_cl;

+ 16 - 2
src/util/starpu_inlines.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -14,5 +15,18 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#define STARPU_INLINE
+#include <config.h>
 #include <starpu.h>
+
+#ifdef STARPU_USE_CUDA
+#ifdef __CUDACC__
+extern "C"
+#endif
+void cuda_codelet_unsigned_inc(void *descr[], STARPU_ATTRIBUTE_UNUSED void *cl_arg);
+#endif
+
+#ifdef STARPU_USE_OPENCL
+void opencl_codelet_unsigned_inc(void *buffers[], void *args);
+#endif
+
+

+ 0 - 0
tests/datawizard/readers_and_writers.c


Some files were not shown because too many files changed in this diff