Marc Sergent 13 лет назад
Родитель
Commit
a007c4d009
46 измененных файлов с 617 добавлено и 259 удалено
  1. 3 0
      ChangeLog
  2. 8 0
      configure.ac
  3. 7 1
      doc/chapters/basic-api.texi
  4. 5 1
      doc/chapters/configuration.texi
  5. 6 5
      examples/tag_example/tag_restartable.c
  6. 9 2
      include/starpu_profiling.h
  7. 5 0
      include/starpu_task.h
  8. 9 1
      include/starpu_util.h
  9. 6 6
      mpi/examples/Makefile.am
  10. 5 3
      mpi/examples/stencil/stencil5.c
  11. 18 18
      mpi/src/starpu_mpi.c
  12. 25 35
      mpi/src/starpu_mpi_collective.c
  13. 3 0
      mpi/tests/mpi_probe.c
  14. 8 8
      mpi/tests/user_defined_datatype.c
  15. 108 30
      sched_ctx_hypervisor/src/hypervisor_policies/lp2_policy.c
  16. 3 0
      src/common/starpu_spinlock.c
  17. 49 1
      src/common/utils.h
  18. 4 1
      src/core/dependencies/implicit_data_deps.c
  19. 12 5
      src/core/dependencies/tags.c
  20. 12 0
      src/core/perfmodel/perfmodel_history.c
  21. 1 1
      src/core/sched_ctx.c
  22. 8 0
      src/core/task.c
  23. 7 1
      src/core/workers.c
  24. 11 0
      src/datawizard/data_request.c
  25. 3 1
      src/datawizard/datawizard.c
  26. 13 11
      src/datawizard/filters.c
  27. 1 1
      src/datawizard/interfaces/block_interface.c
  28. 51 10
      src/datawizard/interfaces/data_interface.c
  29. 1 1
      src/datawizard/interfaces/matrix_interface.c
  30. 1 1
      src/datawizard/interfaces/vector_interface.c
  31. 10 8
      src/datawizard/malloc.c
  32. 31 43
      src/datawizard/memalloc.c
  33. 1 1
      src/datawizard/memalloc.h
  34. 1 0
      src/datawizard/memory_nodes.c
  35. 10 4
      src/datawizard/user_interactions.c
  36. 4 2
      src/debug/traces/starpu_fxt.c
  37. 1 1
      src/drivers/cpu/driver_cpu.c
  38. 3 3
      src/drivers/cuda/driver_cuda.c
  39. 3 2
      src/drivers/driver_common/driver_common.c
  40. 12 4
      src/drivers/opencl/driver_opencl.c
  41. 25 15
      src/profiling/profiling.c
  42. 1 0
      tests/Makefile.am
  43. 1 1
      tests/loader.c
  44. 92 0
      tests/main/deploop.c
  45. 1 1
      tests/microbenchs/tasks_overhead.c
  46. 19 30
      tools/valgrind/starpu.suppr

+ 3 - 0
ChangeLog

@@ -116,6 +116,9 @@ Small features:
   * File STARPU-REVISION --- containing the SVN revision number from which
     StarPU was compiled --- is installed in the share/doc/starpu directory
   * starpu_perfmodel_plot can now directly draw GFlops curves.
+  * New configure option --enable-mpi-progression-hook to enable the
+    activity polling method for StarPU-MPI.
+  * Permit to disable sequential consistency for a given task.
 
 Changes:
   * Fix the block filter functions.

+ 8 - 0
configure.ac

@@ -221,6 +221,7 @@ AM_CONDITIONAL([STARPU_LONG_CHECK], [test "x$enable_long_check" = "xyes"])
 AC_CHECK_HEADERS([malloc.h], [AC_DEFINE([STARPU_HAVE_MALLOC_H], [1], [Define to 1 if you have the <malloc.h> header file.])])
 
 AC_CHECK_HEADERS([valgrind/valgrind.h], [AC_DEFINE([STARPU_HAVE_VALGRIND_H], [1], [Define to 1 if you have the <valgrind/valgrind.h> header file.])])
+AC_CHECK_HEADERS([valgrind/helgrind.h], [AC_DEFINE([STARPU_HAVE_HELGRIND_H], [1], [Define to 1 if you have the <valgrind/helgrind.h> header file.])])
 
 # This defines HAVE_SYNC_VAL_COMPARE_AND_SWAP
 STARPU_CHECK_SYNC_VAL_COMPARE_AND_SWAP
@@ -1234,6 +1235,13 @@ if test x$use_mpi = xyes; then
 	AC_DEFINE(STARPU_USE_MPI,[],[whether the StarPU MPI library is available])
 fi
 
+AC_ARG_ENABLE(mpi-progression-hook, [AS_HELP_STRING([--enable-mpi-progression-hook],
+				   [Enable StarPU MPI activity polling method])],
+				   enable_mpi_progression_hook=$enableval, enable_mpi_progression_hook=no)
+if  test x$enable_mpi_progression_hook = xyes; then
+	AC_DEFINE(STARPU_MPI_ACTIVITY, [1], [enable StarPU MPI activity polling method])
+fi
+
 ###############################################################################
 #                                                                             #
 #                               StarPU-Top                                    #

+ 7 - 1
doc/chapters/basic-api.texi

@@ -1805,9 +1805,15 @@ contained in the @code{tag_id} field. Tag allow the application to synchronize
 with the task and to express task dependencies easily.
 
 @item @code{starpu_tag_t tag_id}
-This fields contains the tag associated to the task if the @code{use_tag} field
+This field contains the tag associated to the task if the @code{use_tag} field
 was set, it is ignored otherwise.
 
+@item @code{unsigned sequential_consistency}
+If this flag is set (which is the default), sequential consistency is enforced
+for the data parameters of this task for which sequential consistency is
+enabled. Clearing this flag permits to disable sequential consistency for this
+task, even if data have it enabled.
+
 @item @code{unsigned synchronous}
 If this flag is set, the @code{starpu_task_submit} function is blocking and
 returns only when the task has been executed (or if no worker is able to

+ 5 - 1
doc/chapters/configuration.texi

@@ -209,10 +209,14 @@ enabled when the GCC compiler provides a plug-in support.
 @end defvr
 
 @defvr {Configure option} --with-mpicc=@var{path}
-Use the @command{mpicc} compiler at @var{path}, for starpumpi
+Use the @command{mpicc} compiler at @var{path}, for StarPU-MPI.
 (@pxref{StarPU MPI support}).
 @end defvr
 
+@defvr {Configure option} --enable-mpi-progression-hook
+Enable the activity polling method for StarPU-MPI.
+@end defvr
+
 @node Advanced configuration
 @subsection Advanced configuration
 

+ 6 - 5
examples/tag_example/tag_restartable.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -106,7 +106,7 @@ static int start_task_grid(unsigned iter)
 	return 0;
 }
 
-void cpu_codelet(void *descr[], void *_args __attribute__((unused)))
+void cpu_codelet(void *descr[] __attribute__((unused)), void *_args __attribute__((unused)))
 {
 /*	int i = (uintptr_t) _args;
 	printf("doing %x\n", i);
@@ -117,7 +117,7 @@ void cpu_codelet(void *descr[], void *_args __attribute__((unused)))
 
 int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 {
-	unsigned i;
+	unsigned i, j;
 	int ret;
 
 	ret = starpu_init(NULL);
@@ -161,8 +161,9 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 	FPRINTF(stderr, "TEST DONE ...\n");
 
 enodev:
-	for (i = 0; i < Nrolls; i++)
-	{
+	for (i = 0; i < Nrolls; i++) {
+		for (j = 0; j < ni; j++)
+			starpu_task_destroy(tasks[i][j]);
 		free(tasks[i]);
 	}
 

+ 9 - 2
include/starpu_profiling.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -101,8 +101,15 @@ int starpu_profiling_status_set(int status);
  * error. */
 int starpu_profiling_status_get(void);
 #ifdef BUILDING_STARPU
+#include <common/utils.h>
 extern int _starpu_profiling;
-#define starpu_profiling_status_get() _starpu_profiling
+#define starpu_profiling_status_get() ({ \
+	int __ret; \
+	ANNOTATE_HAPPENS_AFTER(&_starpu_profiling); \
+	__ret = _starpu_profiling; \
+	ANNOTATE_HAPPENS_BEFORE(&_starpu_profiling); \
+	__ret; \
+})
 #endif
 
 /* Get the profiling info associated to a worker, and reset the profiling

+ 5 - 0
include/starpu_task.h

@@ -129,9 +129,14 @@ struct starpu_task
 	void (*callback_func)(void *);
 	void *callback_arg;
 
+	/* Whether tag_id should be considered */
 	unsigned use_tag;
+	/* Tag associated with this task */
 	starpu_tag_t tag_id;
 
+	/* Whether we should enforce sequential consistency for this task */
+	unsigned sequential_consistency;
+
 	/* options for the task execution */
 	unsigned synchronous; /* if set, a call to push is blocking */
 	int priority; /* STARPU_MAX_PRIO = most important; STARPU_MIN_PRIO = least important */

+ 9 - 1
include/starpu_util.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -186,6 +186,14 @@ STARPU_ATOMIC_SOMETHING(or, old | value)
 #define STARPU_WMB() STARPU_SYNCHRONIZE()
 #endif
 
+/* This is needed in some places to make valgrind yield to another thread to be
+ * able to progress.  */
+#if defined(__i386__) || defined(__x86_64__)
+#define STARPU_UYIELD() __asm__ __volatile("rep; nop")
+#else
+#define STARPU_UYIELD() ((void)0)
+#endif
+
 #ifdef __cplusplus
 }
 #endif

+ 6 - 6
mpi/examples/Makefile.am

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2009-2013  Université de Bordeaux 1
 # Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -88,7 +88,7 @@ examplebin_PROGRAMS +=				\
 	stencil/stencil5
 
 stencil_stencil5_LDADD =		\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la -lm
 
 starpu_mpi_EXAMPLES	+=	\
 	stencil/stencil5
@@ -106,7 +106,7 @@ examplebin_PROGRAMS += 			\
 mpi_lu_plu_example_float_LDADD =	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	$(STARPU_LIBNUMA_LDFLAGS)				\
-	$(STARPU_BLAS_LDFLAGS)
+	$(STARPU_BLAS_LDFLAGS) -lm
 
 mpi_lu_plu_example_float_SOURCES =	\
 	mpi_lu/plu_example_float.c	\
@@ -118,7 +118,7 @@ mpi_lu_plu_example_float_SOURCES =	\
 mpi_lu_plu_example_double_LDADD =	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	$(STARPU_LIBNUMA_LDFLAGS)				\
-	$(STARPU_BLAS_LDFLAGS)
+	$(STARPU_BLAS_LDFLAGS) -lm
 
 mpi_lu_plu_example_double_SOURCES =	\
 	mpi_lu/plu_example_double.c	\
@@ -148,7 +148,7 @@ matrix_decomposition_mpi_cholesky_SOURCES	=		\
 
 matrix_decomposition_mpi_cholesky_LDADD =			\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
-	$(STARPU_BLAS_LDFLAGS)
+	$(STARPU_BLAS_LDFLAGS) -lm
 
 matrix_decomposition_mpi_cholesky_distributed_SOURCES =	\
 	matrix_decomposition/mpi_cholesky_distributed.c	\
@@ -161,7 +161,7 @@ matrix_decomposition_mpi_cholesky_distributed_SOURCES =	\
 
 matrix_decomposition_mpi_cholesky_distributed_LDADD =	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
-	$(STARPU_BLAS_LDFLAGS)
+	$(STARPU_BLAS_LDFLAGS) -lm
 
 starpu_mpi_EXAMPLES +=				\
 	matrix_decomposition/mpi_cholesky			\

+ 5 - 3
mpi/examples/stencil/stencil5.c

@@ -37,12 +37,14 @@ struct starpu_codelet stencil5_cl =
 };
 
 #ifdef STARPU_QUICK_CHECK
-#  define NITER_DEF	10
+#  define NITER_DEF	5
+#  define X         	3
+#  define Y         	3
 #else
 #  define NITER_DEF	500
+#  define X         	20
+#  define Y         	20
 #endif
-#define X         20
-#define Y         20
 
 int display = 0;
 int niter = NITER_DEF;

+ 18 - 18
mpi/src/starpu_mpi.c

@@ -23,11 +23,7 @@
 #include <starpu_profiling.h>
 #include <starpu_mpi_stats.h>
 #include <starpu_mpi_insert_task.h>
-
-#ifdef STARPU_DEVEL
-#  warning TODO find a better way to select the polling method (perhaps during the configuration)
-#endif
-//#define USE_STARPU_ACTIVITY	1
+#include <common/config.h>
 
 static void _starpu_mpi_submit_new_mpi_request(void *arg);
 static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req);
@@ -643,6 +639,7 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 		MPI_Status status;
 		memset(&status, 0, sizeof(MPI_Status));
 		req->ret = MPI_Recv(req->ptr, req->count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &status);
+		STARPU_ASSERT(req->ret == MPI_SUCCESS);
 	}
 
 	if (req->request_type == RECV_REQ || req->request_type == SEND_REQ || req->request_type == PROBE_REQ)
@@ -699,7 +696,7 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 	_STARPU_MPI_LOG_OUT();
 }
 
-#ifdef USE_STARPU_ACTIVITY
+#ifdef STARPU_MPI_ACTIVITY
 static unsigned _starpu_mpi_progression_hook_func(void *arg __attribute__((unused)))
 {
 	unsigned may_block = 1;
@@ -714,7 +711,7 @@ static unsigned _starpu_mpi_progression_hook_func(void *arg __attribute__((unuse
 
 	return may_block;
 }
-#endif
+#endif /* STARPU_MPI_ACTIVITY */
 
 static void _starpu_mpi_test_detached_requests(void)
 {
@@ -885,9 +882,9 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		/* shall we block ? */
 		unsigned block = _starpu_mpi_req_list_empty(new_requests);
 
-#ifndef USE_STARPU_ACTIVITY
+#ifndef STARPU_MPI_ACTIVITY
 		block = block && _starpu_mpi_req_list_empty(detached_requests);
-#endif
+#endif /* STARPU_MPI_ACTIVITY */
 
 		if (block)
 		{
@@ -946,20 +943,22 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 /*                                                      */
 /********************************************************/
 
-#ifdef USE_STARPU_ACTIVITY
+#ifdef STARPU_MPI_ACTIVITY
 static int hookid = - 1;
-#endif
+#endif /* STARPU_MPI_ACTIVITY */
 
 static void _starpu_mpi_add_sync_point_in_fxt(void)
 {
 #ifdef STARPU_USE_FXT
 	int rank;
 	int worldsize;
+	int ret;
+
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
 
-	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
-	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+	ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
 
 	/* We generate a "unique" key so that we can make sure that different
 	 * FxT traces come from the same MPI run. */
@@ -973,7 +972,8 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 		random_number = rand();
 	}
 
-	MPI_Bcast(&random_number, 1, MPI_INT, 0, MPI_COMM_WORLD);
+	ret = MPI_Bcast(&random_number, 1, MPI_INT, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
 
 	TRACE_MPI_BARRIER(rank, worldsize, random_number);
 
@@ -1006,10 +1006,10 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi)
 		_STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
-#ifdef USE_STARPU_ACTIVITY
+#ifdef STARPU_MPI_ACTIVITY
 	hookid = starpu_progression_hook_register(progression_hook_func, NULL);
 	STARPU_ASSERT(hookid >= 0);
-#endif
+#endif /* STARPU_MPI_ACTIVITY */
 
 	_starpu_mpi_add_sync_point_in_fxt();
 	_starpu_mpi_comm_amounts_init(MPI_COMM_WORLD);
@@ -1058,9 +1058,9 @@ int starpu_mpi_shutdown(void)
 
 	pthread_join(progress_thread, &value);
 
-#ifdef USE_STARPU_ACTIVITY
+#ifdef STARPU_MPI_ACTIVITY
 	starpu_progression_hook_deregister(hookid);
-#endif
+#endif /* STARPU_MPI_ACTIVITY */
 
 	TRACE_MPI_STOP(rank, world_size);
 

+ 25 - 35
mpi/src/starpu_mpi_collective.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -42,26 +42,23 @@ int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, i
 {
 	int rank;
 	int x;
-	struct _callback_arg *callback_arg;
-	void (*callback_func)(void *);
+	struct _callback_arg *callback_arg = NULL;
+	void (*callback_func)(void *) = NULL;
+	void (*callback)(void *);
 
 	MPI_Comm_rank(comm, &rank);
 
-	callback_func = _callback_collective;
-	callback_arg = malloc(sizeof(struct _callback_arg));
-	callback_arg->count = 0;
-	callback_arg->nb = 0;
-	callback_arg->callback = (rank == root) ? scallback : rcallback;
-	callback_arg->arg = (rank == root) ? sarg : rarg;
-	if (callback_arg->callback == NULL)
+	callback = (rank == root) ? scallback : rcallback;
+	if (callback)
 	{
-		free(callback_arg);
-		callback_arg = NULL;
-		callback_func = NULL;
-	}
+		callback_func = _callback_collective;
+		callback_arg = malloc(sizeof(struct _callback_arg));
+		callback_arg->count = 0;
+		callback_arg->nb = 0;
+		callback_arg->callback = (rank == root) ? scallback : rcallback;
+		callback_arg->arg = (rank == root) ? sarg : rarg;
+		if (callback_arg->callback == NULL)
 
-	if (callback_arg)
-	{
 		for(x = 0; x < count ; x++)
 		{
 			if (data_handles[x])
@@ -107,29 +104,23 @@ int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, in
 {
 	int rank;
 	int x;
-	struct _callback_arg *callback_arg;
-	void (*callback_func)(void *);
+	struct _callback_arg *callback_arg = NULL;
+	void (*callback_func)(void *) = NULL;
+	void (*callback)(void *);
 
 	MPI_Comm_rank(comm, &rank);
 
-#ifdef STARPU_DEVEL
-#warning TODO: callback_arg needs to be free-ed
-#endif
-	callback_func = _callback_collective;
-	callback_arg = malloc(sizeof(struct _callback_arg));
-	callback_arg->count = 0;
-	callback_arg->nb = 0;
-	callback_arg->callback = (rank == root) ? scallback : rcallback;
-	callback_arg->arg = (rank == root) ? sarg : rarg;
-	if (callback_arg->callback == NULL)
+	callback = (rank == root) ? scallback : rcallback;
+	if (callback)
 	{
-		free(callback_arg);
-		callback_arg = NULL;
-		callback_func = NULL;
-	}
+		callback_func = _callback_collective;
+
+		callback_arg = malloc(sizeof(struct _callback_arg));
+		callback_arg->count = 0;
+		callback_arg->nb = 0;
+		callback_arg->callback = callback;
+		callback_arg->arg = (rank == root) ? sarg : rarg;
 
-	if (callback_arg)
-	{
 		for(x = 0; x < count ; x++)
 		{
 			if (data_handles[x])
@@ -170,4 +161,3 @@ int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, in
 	}
 	return 0;
 }
-

+ 3 - 0
mpi/tests/mpi_probe.c

@@ -45,6 +45,8 @@ void callback(void *arg __attribute__((unused)))
 
 int main(int argc, char **argv)
 {
+	return 77;
+	/*
 	int ret, rank, size;
 
 	MPI_Init(NULL, NULL);
@@ -99,4 +101,5 @@ int main(int argc, char **argv)
 	MPI_Finalize();
 
 	return 0;
+	*/
 }

+ 8 - 8
mpi/tests/user_defined_datatype.c

@@ -24,36 +24,36 @@
 #  define ELEMENTS 1000
 #endif
 
-typedef void (*test_func)(starpu_data_handle_t *, int, int);
+typedef void (*test_func)(starpu_data_handle_t *, int, int, int);
 
-void test_handle_irecv_isend_detached(starpu_data_handle_t *handles, int nb_handles, int rank)
+void test_handle_irecv_isend_detached(starpu_data_handle_t *handles, int nb_handles, int rank, int tag)
 {
 	int i;
 
 	for(i=0 ; i<nb_handles ; i++)
 	{
 		starpu_data_set_rank(handles[i], 1);
-		starpu_data_set_tag(handles[i], i+100);
+		starpu_data_set_tag(handles[i], i+tag);
 	}
 
 	for(i=0 ; i<nb_handles ; i++)
 		starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, handles[i], 0, NULL, NULL);
 }
 
-void test_handle_recv_send(starpu_data_handle_t *handles, int nb_handles, int rank)
+void test_handle_recv_send(starpu_data_handle_t *handles, int nb_handles, int rank, int tag)
 {
 	int i;
 
 	if (rank == 1)
 	{
 		for(i=0 ; i<nb_handles ; i++)
-			starpu_mpi_send(handles[i], 0, i+100, MPI_COMM_WORLD);
+			starpu_mpi_send(handles[i], 0, i+tag, MPI_COMM_WORLD);
 	}
 	else if (rank == 0)
 	{
 		MPI_Status statuses[nb_handles];
 		for(i=0 ; i<nb_handles ; i++)
-			starpu_mpi_recv(handles[i], 1, i+100, MPI_COMM_WORLD, &statuses[i]);
+			starpu_mpi_recv(handles[i], 1, i+tag, MPI_COMM_WORLD, &statuses[i]);
 	}
 }
 
@@ -126,8 +126,8 @@ int main(int argc, char **argv)
 				starpu_variable_data_register(&handle_vars[i], 0, (uintptr_t)&foo[i], sizeof(double));
 			}
 
-			f(handle_vars, ELEMENTS, rank);
-			f(handle_complex, ELEMENTS, rank);
+			f(handle_vars, ELEMENTS, rank, ELEMENTS);
+			f(handle_complex, ELEMENTS, rank, 4*ELEMENTS);
 
 			for(i=0 ; i<ELEMENTS ; i++)
 			{

+ 108 - 30
sched_ctx_hypervisor/src/hypervisor_policies/lp2_policy.c

@@ -21,8 +21,10 @@
 static struct bound_task_pool *task_pools = NULL;
 
 static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
-static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double tmax, double w_in_s[ns][nw], int *in_sched_ctxs, int *workers, unsigned interger);
-static unsigned _compute_task_distribution_over_ctxs(int ns, int nw, int nt, double w_in_s[ns][nw], double tasks[nw][nt], int *sched_ctxs, int *workers)
+static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double tmax, double w_in_s[ns][nw], int *in_sched_ctxs, int *workers, unsigned interger,
+			   struct bound_task_pool *tmp_task_pools, unsigned size_ctxs);
+static unsigned _compute_task_distribution_over_ctxs(int ns, int nw, int nt, double w_in_s[ns][nw], double tasks[nw][nt], 
+						     int *sched_ctxs, int *workers, struct bound_task_pool *tmp_task_pools, unsigned size_ctxs)
 {
 	double draft_tasks[nw][nt];
 	double draft_w_in_s[ns][nw];
@@ -45,7 +47,7 @@ static unsigned _compute_task_distribution_over_ctxs(int ns, int nw, int nt, dou
 	/* smallest possible tmax, difficult to obtain as we
 	   compute the nr of flops and not the tasks */
 	double possible_tmax = _lp_get_tmax(nw, workers);
-	double smallest_tmax = possible_tmax / 2;
+	double smallest_tmax = possible_tmax / 3;
 	double tmax = possible_tmax * ns;
 	double res = 1.0;
 	unsigned has_sol = 0;
@@ -53,6 +55,7 @@ static unsigned _compute_task_distribution_over_ctxs(int ns, int nw, int nt, dou
 	double old_tmax = 0.0;
 	unsigned found_sol = 0;
 
+//	printf("tmin = %lf tmax = %lf \n", tmin, tmax);
 	struct timeval start_time;
 	struct timeval end_time;
 	int nd = 0;
@@ -65,7 +68,7 @@ static unsigned _compute_task_distribution_over_ctxs(int ns, int nw, int nt, dou
 		/* find solution and save the values in draft tables
 		   only if there is a solution for the system we save them
 		   in the proper table */
-		res = _glp_resolve(ns, nw, nt, draft_tasks, tmax, draft_w_in_s, sched_ctxs, workers, 1);
+		res = _glp_resolve(ns, nw, nt, draft_tasks, tmax, draft_w_in_s, sched_ctxs, workers, 1, tmp_task_pools, size_ctxs);
 		if(res != 0.0)
 		{
 			for(w = 0; w < nw; w++)
@@ -129,7 +132,7 @@ static void _size_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nwor
 
 	double w_in_s[ns][nw];
 	double tasks[nw][nt];
-	unsigned found_sol = _compute_task_distribution_over_ctxs(ns, nw, nt, w_in_s, tasks, sched_ctxs, workers);
+	unsigned found_sol = _compute_task_distribution_over_ctxs(ns, nw, nt, w_in_s, tasks, sched_ctxs, workers, task_pools, 1);
 	pthread_mutex_unlock(&mutex);
 	/* if we did find at least one solution redistribute the resources */
 	if(found_sol)
@@ -194,7 +197,6 @@ static void lp2_handle_submitted_job(struct starpu_task *task, uint32_t footprin
 static void _remove_task_from_pool(struct starpu_task *task, uint32_t footprint)
 {
 	/* count the tasks of the same type */
-	pthread_mutex_lock(&mutex);
 	struct bound_task_pool *tp = NULL;
 
 	for (tp = task_pools; tp; tp = tp->next)
@@ -209,20 +211,36 @@ static void _remove_task_from_pool(struct starpu_task *task, uint32_t footprint)
 			tp->n--;
 		else
 		{
-			struct bound_task_pool *prev_tp = NULL;
-			for (prev_tp = task_pools; prev_tp; prev_tp = prev_tp->next)
+			if(tp == task_pools)
 			{
-				if (prev_tp->next == tp)
-					prev_tp->next = tp->next;
+				struct bound_task_pool *next_tp = NULL;
+				if(task_pools->next)
+					next_tp = task_pools->next;
+
+				free(tp);
+				tp = NULL;
+				
+				if(next_tp)
+					task_pools = next_tp;
+				
+			}
+			else
+			{
+				struct bound_task_pool *prev_tp = NULL;
+				for (prev_tp = task_pools; prev_tp; prev_tp = prev_tp->next)
+				{
+					if (prev_tp->next == tp)
+						prev_tp->next = tp->next;
+				}
+				
+				free(tp);
+				tp = NULL;
 			}
-
-			free(tp);
 		}
 	}
-	pthread_mutex_unlock(&mutex);
 }
 
-static void _get_tasks_times(int nw, int nt, double times[nw][nt], int *workers)
+static void _get_tasks_times(int nw, int nt, double times[nw][nt], int *workers, unsigned size_ctxs)
 {
         struct bound_task_pool *tp;
         int w, t;
@@ -230,14 +248,33 @@ static void _get_tasks_times(int nw, int nt, double times[nw][nt], int *workers)
         {
                 for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
                 {
-                        enum starpu_perf_archtype arch = workers == NULL ? starpu_worker_get_perf_archtype(w) :
-				starpu_worker_get_perf_archtype(workers[w]);
+			int worker = workers == NULL ? w : workers[w];
+                        enum starpu_perf_archtype arch = starpu_worker_get_perf_archtype(worker);
                         double length = starpu_history_based_expected_perf(tp->cl->model, arch, tp->footprint);
 
                         if (isnan(length))
                                 times[w][t] = NAN;
-                       else
+			else
+			{
                                 times[w][t] = length / 1000.;
+
+				double transfer_time = 0.0;
+				enum starpu_archtype arch = starpu_worker_get_type(worker);
+				if(arch == STARPU_CUDA_WORKER)
+				{
+					unsigned worker_in_ctx = starpu_sched_ctx_contains_worker(worker, tp->sched_ctx_id);
+					if(!worker_in_ctx && !size_ctxs)
+					{
+						double transfer_velocity = starpu_get_bandwidth_RAM_CUDA(worker);
+						transfer_time +=  (tp->footprint / transfer_velocity) / 1000. ;
+					}
+					double latency = starpu_get_latency_RAM_CUDA(worker);
+					transfer_time += latency/1000.;
+
+				}
+//				printf("%d/%d %s x %d time = %lf transfer_time = %lf\n", w, tp->sched_ctx_id, tp->cl->model->symbol, tp->n, times[w][t], transfer_time);
+				times[w][t] += transfer_time;
+			}
                 }
         }
 }
@@ -247,9 +284,10 @@ static void _get_tasks_times(int nw, int nt, double times[nw][nt], int *workers)
  */
 #ifdef STARPU_HAVE_GLPK_H
 #include <glpk.h>
-static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double tmax, double w_in_s[ns][nw], int *in_sched_ctxs, int *workers, unsigned integer)
+static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double tmax, double w_in_s[ns][nw], int *in_sched_ctxs, int *workers, unsigned integer,
+			   struct bound_task_pool *tmp_task_pools, unsigned size_ctxs)
 {
-	if(task_pools == NULL)
+	if(tmp_task_pools == NULL)
 		return 0.0;
 	struct bound_task_pool * tp;
 	int t, w, s;
@@ -270,7 +308,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 		int ia[ne], ja[ne];
 		double ar[ne];
 
-		_get_tasks_times(nw, nt, times, workers);
+		_get_tasks_times(nw, nt, times, workers, size_ctxs);
 
 		/* Variables: number of tasks i assigned to worker j, and tmax */
 		glp_add_cols(lp, nw*nt+ns*nw);
@@ -280,7 +318,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 				glp_set_obj_coef(lp, nw*nt+s*nw+w+1, 1.);
 
 		for (w = 0; w < nw; w++)
-			for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+			for (t = 0; t < nt; t++)
 			{
 				char name[32];
 				snprintf(name, sizeof(name), "w%dt%dn", w, t);
@@ -313,7 +351,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 		int curr_row_idx = 0;
 		/* Total worker execution time */
 		glp_add_rows(lp, nw*ns);
-		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+		for (t = 0; t < nt; t++)
 		{
 			int someone = 0;
 			for (w = 0; w < nw; w++)
@@ -336,7 +374,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 				starpu_worker_get_name(w, name, sizeof(name));
 				snprintf(title, sizeof(title), "worker %s", name);
 				glp_set_row_name(lp, curr_row_idx+s*nw+w+1, title);
-				for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+				for (t = 0, tp = tmp_task_pools; tp; t++, tp = tp->next)
 				{
 					if((int)tp->sched_ctx_id == sched_ctxs[s])
 					{
@@ -362,7 +400,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 
 		/* Total task completion */
 		glp_add_rows(lp, nt);
-		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+		for (t = 0, tp = tmp_task_pools; tp; t++, tp = tp->next)
 		{
 			char name[32], title[64];
 			starpu_worker_get_name(w, name, sizeof(name));
@@ -411,6 +449,12 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 	glp_init_smcp(&parm);
 	parm.msg_lev = GLP_MSG_OFF;
 	int ret = glp_simplex(lp, &parm);
+
+/* 	char str[50]; */
+/* 	sprintf(str, "outpu_lp_%g", tmax); */
+
+/* 	glp_print_sol(lp, str); */
+
 	if (ret)
 	{
 		printf("error in simplex\n");
@@ -449,7 +493,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 
 	double res = glp_get_obj_val(lp);
 	for (w = 0; w < nw; w++)
-		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+		for (t = 0; t < nt; t++)
 /* 			if (integer) */
 /* 				tasks[w][t] = (double)glp_mip_col_val(lp, colnum(w, t)); */
 /*                         else */
@@ -471,10 +515,18 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 	return res;
 }
 
+static struct bound_task_pool* _clone_linked_list(struct bound_task_pool *tp)
+{
+	if(tp == NULL) return NULL;
+
+	struct bound_task_pool *tmp_tp = (struct bound_task_pool*)malloc(sizeof(struct bound_task_pool));
+	memcpy(tmp_tp, tp, sizeof(struct bound_task_pool));
+	tmp_tp->next = _clone_linked_list(tp->next);
+	return tmp_tp;
+}
 
 static void lp2_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
 {
-	_remove_task_from_pool(task, footprint);
 	struct sched_ctx_hypervisor_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
 
 	int ret = pthread_mutex_trylock(&act_hypervisor_mutex);
@@ -491,24 +543,50 @@ static void lp2_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_
 			int ns = sched_ctx_hypervisor_get_nsched_ctxs();
 			int nw = starpu_worker_get_count(); /* Number of different workers */
 			int nt = 0; /* Number of different kinds of tasks */
-			pthread_mutex_lock(&mutex);
-			struct bound_task_pool * tp;
+
+//			pthread_mutex_lock(&mutex);
+
+			/* we don't take the mutex bc a correct value of the number of tasks is
+			   not required but we do a copy in order to be sure
+			   that the linear progr won't segfault if the list of 
+			   submitted task will change during the exec */
+
+			struct bound_task_pool *tp = NULL;
+			struct bound_task_pool *tmp_task_pools = _clone_linked_list(task_pools);
+
 			for (tp = task_pools; tp; tp = tp->next)
 				nt++;
 
+
 			double w_in_s[ns][nw];
 			double tasks_per_worker[nw][nt];
 
-			unsigned found_sol = _compute_task_distribution_over_ctxs(ns, nw, nt, w_in_s, tasks_per_worker, NULL, NULL);
-			pthread_mutex_unlock(&mutex);
+			unsigned found_sol = _compute_task_distribution_over_ctxs(ns, nw, nt, w_in_s, tasks_per_worker, NULL, NULL, tmp_task_pools, 0);
+//			pthread_mutex_unlock(&mutex);
+
 			/* if we did find at least one solution redistribute the resources */
 			if(found_sol)
 				_lp_place_resources_in_ctx(ns, nw, w_in_s, NULL, NULL, 0);
 
+			struct bound_task_pool *next = NULL;
+			struct bound_task_pool *tmp_tp = tmp_task_pools;
+			while(tmp_task_pools)
+			{
+				next = tmp_tp->next;
+				free(tmp_tp);
+				tmp_tp = next;
+				tmp_task_pools = next;
+			}
+			
 
 		}
 		pthread_mutex_unlock(&act_hypervisor_mutex);
 	}
+	/* too expensive to take this mutex and correct value of the number of tasks is not compulsory */
+//	pthread_mutex_lock(&mutex);
+	_remove_task_from_pool(task, footprint);
+//	pthread_mutex_unlock(&mutex);
+
 }
 
 

+ 3 - 0
src/common/starpu_spinlock.c

@@ -82,6 +82,7 @@ int _starpu_spin_lock(struct _starpu_spinlock *lock)
 		/* Give hand to another thread, hopefully the one which has the
 		 * spinlock and probably just has also a short-lived mutex. */
 		MSG_process_sleep(0.000001);
+		STARPU_UYIELD();
 	}
 #elif defined(STARPU_SPINLOCK_CHECK)
 	int ret = pthread_mutex_lock(&lock->errcheck_lock);
@@ -96,6 +97,8 @@ int _starpu_spin_lock(struct _starpu_spinlock *lock)
 	do
 	{
 		prev = STARPU_TEST_AND_SET(&lock->taken, 1);
+		if (prev)
+			STARPU_UYIELD();
 	}
 	while (prev);
 	return 0;

+ 49 - 1
src/common/utils.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -31,6 +31,54 @@
 #include <msg/msg.h>
 #endif
 
+#ifdef STARPU_HAVE_HELGRIND_H
+#include <valgrind/helgrind.h>
+#endif
+
+#ifndef VALGRIND_HG_MUTEX_LOCK_PRE
+#define VALGRIND_HG_MUTEX_LOCK_PRE(mutex, istrylock) ((void)0)
+#endif
+#ifndef VALGRIND_HG_MUTEX_LOCK_POST
+#define VALGRIND_HG_MUTEX_LOCK_POST(mutex) ((void)0)
+#endif
+#ifndef VALGRIND_HG_MUTEX_UNLOCK_PRE
+#define VALGRIND_HG_MUTEX_UNLOCK_PRE(mutex) ((void)0)
+#endif
+#ifndef VALGRIND_HG_MUTEX_UNLOCK_POST
+#define VALGRIND_HG_MUTEX_UNLOCK_POST(mutex) ((void)0)
+#endif
+#ifndef DO_CREQ_v_WW
+#define DO_CREQ_v_WW(_creqF, _ty1F, _arg1F, _ty2F, _arg2F) ((void)0)
+#endif
+#ifndef DO_CREQ_v_W
+#define DO_CREQ_v_W(_creqF, _ty1F, _arg1F) ((void)0)
+#endif
+#ifndef ANNOTATE_HAPPENS_BEFORE
+#define ANNOTATE_HAPPENS_BEFORE(obj) ((void)0)
+#endif
+#ifndef ANNOTATE_HAPPENS_AFTER
+#define ANNOTATE_HAPPENS_AFTER(obj) ((void)0)
+#endif
+#ifndef ANNOTATE_RWLOCK_ACQUIRED
+#define ANNOTATE_RWLOCK_ACQUIRED(lock, is_w) ((void)0)
+#endif
+#ifndef ANNOTATE_RWLOCK_RELEASED
+#define ANNOTATE_RWLOCK_RELEASED(lock, is_w) ((void)0)
+#endif
+
+#define _STARPU_VALGRIND_HG_SPIN_LOCK_PRE(lock) \
+	DO_CREQ_v_WW(_VG_USERREQ__HG_PTHREAD_SPIN_LOCK_PRE, \
+			struct _starpu_spinlock *, lock, long, 0)
+#define _STARPU_VALGRIND_HG_SPIN_LOCK_POST(lock) \
+	DO_CREQ_v_W(_VG_USERREQ__HG_PTHREAD_SPIN_LOCK_POST, \
+			struct _starpu_spinlock *, lock)
+#define _STARPU_VALGRIND_HG_SPIN_UNLOCK_PRE(lock) \
+	DO_CREQ_v_W(_VG_USERREQ__HG_PTHREAD_SPIN_INIT_OR_UNLOCK_PRE, \
+			struct _starpu_spinlock *, lock)
+#define _STARPU_VALGRIND_HG_SPIN_UNLOCK_POST(lock) \
+	DO_CREQ_v_W(_VG_USERREQ__HG_PTHREAD_SPIN_INIT_OR_UNLOCK_POST, \
+			struct _starpu_spinlock *, lock)
+
 #ifdef STARPU_VERBOSE
 #  define _STARPU_DEBUG(fmt, args ...) do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, "[starpu][%s] " fmt ,__func__ ,##args); fflush(stderr); }} while(0)
 #else

+ 4 - 1
src/core/dependencies/implicit_data_deps.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -307,6 +307,9 @@ void _starpu_detect_implicit_data_deps(struct starpu_task *task)
 	STARPU_ASSERT(task->cl);
         _STARPU_LOG_IN();
 
+	if (!task->sequential_consistency)
+		return;
+
 	/* We don't want to enforce a sequential consistency for tasks that are
 	 * not visible to the application. */
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);

+ 12 - 5
src/core/dependencies/tags.c

@@ -178,10 +178,8 @@ void _starpu_tag_clear(void)
 	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
 }
 
-static struct _starpu_tag *gettag_struct(starpu_tag_t id)
+static struct _starpu_tag *_gettag_struct(starpu_tag_t id)
 {
-	_STARPU_PTHREAD_RWLOCK_WRLOCK(&tag_global_rwlock);
-
 	/* search if the tag is already declared or not */
 	struct _starpu_tag_table *entry;
 	struct _starpu_tag *tag;
@@ -212,8 +210,15 @@ static struct _starpu_tag *gettag_struct(starpu_tag_t id)
 #endif
 	}
 
-	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
+	return tag;
+}
 
+static struct _starpu_tag *gettag_struct(starpu_tag_t id)
+{
+	struct _starpu_tag *tag;
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&tag_global_rwlock);
+	tag = _gettag_struct(id);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
 	return tag;
 }
 
@@ -432,10 +437,11 @@ int starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id)
 		return -EDEADLK;
 	}
 
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&tag_global_rwlock);
 	/* only wait the tags that are not done yet */
 	for (i = 0, current = 0; i < ntags; i++)
 	{
-		struct _starpu_tag *tag = gettag_struct(id[i]);
+		struct _starpu_tag *tag = _gettag_struct(id[i]);
 
 		_starpu_spin_lock(&tag->lock);
 
@@ -450,6 +456,7 @@ int starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id)
 			current++;
 		}
 	}
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
 
 	if (current == 0)
 	{

+ 12 - 0
src/core/perfmodel/perfmodel_history.c

@@ -942,6 +942,7 @@ int starpu_perfmodel_list(FILE *output)
 
 /* This function is intended to be used by external tools that should read the
  * performance model files */
+/* TODO: write a cleanup function to free the symbol and the history */
 int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *model)
 {
 	model->symbol = strdup(symbol);
@@ -1064,6 +1065,10 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 		HASH_FIND_UINT32_T(history, &key, entry);
 		_STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);
 
+		/* We do not care about racing access to the mean, we only want a
+		 * good-enough estimation, thus simulate taking the rdlock */
+		ANNOTATE_RWLOCK_ACQUIRED(&model->model_rwlock, 0);
+
 		if (entry && entry->history_entry && entry->history_entry->nsample >= _STARPU_CALIBRATION_MINIMUM)
 			exp = entry->history_entry->mean;
 		else if (!model->benchmarking)
@@ -1075,6 +1080,7 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 			_starpu_set_calibrate_flag(1);
 			model->benchmarking = 1;
 		}
+		ANNOTATE_RWLOCK_RELEASED(&model->model_rwlock, 0);
 	}
 
 	return exp;
@@ -1097,6 +1103,10 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, e
 	entry = (elt == NULL) ? NULL : elt->history_entry;
 	_STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);
 
+	/* We do not care about racing access to the mean, we only want a
+	 * good-enough estimation, thus simulate taking the rdlock */
+	ANNOTATE_RWLOCK_ACQUIRED(&model->model_rwlock, 0);
+
 	exp = entry?entry->mean:NAN;
 
 	if (entry && entry->nsample < _STARPU_CALIBRATION_MINIMUM)
@@ -1115,6 +1125,8 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, e
 		model->benchmarking = 1;
 	}
 
+	ANNOTATE_RWLOCK_RELEASED(&model->model_rwlock, 0);
+
 	return exp;
 }
 

+ 1 - 1
src/core/sched_ctx.c

@@ -490,7 +490,7 @@ void starpu_sched_ctx_delete(unsigned sched_ctx_id)
 	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
 	unsigned nworkers = config->topology.nworkers;
 
-	if(nworkers_ctx > 0 && inheritor_sched_ctx_id != STARPU_NMAX_SCHED_CTXS && 
+	if(nworkers_ctx > 0 && inheritor_sched_ctx && inheritor_sched_ctx->id != STARPU_NMAX_SCHED_CTXS && 
 	   !(nworkers_ctx == nworkers && nworkers_ctx == inheritor_sched_ctx->workers->nworkers))
 	{
 		starpu_sched_ctx_add_workers(workerids, nworkers_ctx, inheritor_sched_ctx_id);

+ 8 - 0
src/core/task.c

@@ -57,6 +57,8 @@ void starpu_task_init(struct starpu_task *task)
 	 * everywhere */
 	memset(task, 0, sizeof(struct starpu_task));
 
+	task->sequential_consistency = 1;
+
 	/* Now we can initialise fields which recquire custom value */
 #if STARPU_DEFAULT_PRIO != 0
 	task->priority = STARPU_DEFAULT_PRIO;
@@ -707,7 +709,11 @@ void _starpu_decrement_nsubmitted_tasks(void)
 	if (--nsubmitted == 0)
 	{
 		if (!config->submitting)
+		{
+			ANNOTATE_HAPPENS_AFTER(&config->running);
 			config->running = 0;
+			ANNOTATE_HAPPENS_BEFORE(&config->running);
+		}
 		_STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
 	}
 
@@ -727,7 +733,9 @@ starpu_drivers_request_termination(void)
 	config->submitting = 0;
 	if (nsubmitted == 0)
 	{
+		ANNOTATE_HAPPENS_AFTER(&config->running);
 		config->running = 0;
+		ANNOTATE_HAPPENS_BEFORE(&config->running);
 		_STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
 	}
 

+ 7 - 1
src/core/workers.c

@@ -893,9 +893,13 @@ out:
 
 unsigned _starpu_machine_is_running(void)
 {
+	unsigned ret;
 	/* running is just protected by a memory barrier */
 	STARPU_RMB();
-	return config.running;
+	ANNOTATE_HAPPENS_AFTER(&config.running);
+	ret = config.running;
+	ANNOTATE_HAPPENS_BEFORE(&config.running);
+	return ret;
 }
 
 unsigned _starpu_worker_can_block(unsigned memnode STARPU_ATTRIBUTE_UNUSED)
@@ -923,8 +927,10 @@ unsigned _starpu_worker_can_block(unsigned memnode STARPU_ATTRIBUTE_UNUSED)
 static void _starpu_kill_all_workers(struct _starpu_machine_config *pconfig)
 {
 	/* set the flag which will tell workers to stop */
+	ANNOTATE_HAPPENS_AFTER(&config.running);
 	pconfig->running = 0;
 	/* running is just protected by a memory barrier */
+	ANNOTATE_HAPPENS_BEFORE(&config.running);
 	STARPU_WMB();
 	starpu_wake_all_blocked_workers();
 }

+ 11 - 0
src/datawizard/data_request.c

@@ -17,6 +17,7 @@
 
 #include <starpu.h>
 #include <common/config.h>
+#include <common/utils.h>
 #include <datawizard/datawizard.h>
 
 /* requests that have not been treated at all */
@@ -391,8 +392,18 @@ void _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)
 	struct _starpu_data_request *r;
 	struct _starpu_data_request_list *new_data_requests;
 
+	/* Note: we here tell valgrind that list_empty (reading a pointer) is
+	 * as safe as if we had the lock held */
+	VALGRIND_HG_MUTEX_LOCK_PRE(&data_requests_list_mutex[src_node], 0);
+	VALGRIND_HG_MUTEX_LOCK_POST(&data_requests_list_mutex[src_node]);
 	if (_starpu_data_request_list_empty(data_requests[src_node]))
+	{
+		VALGRIND_HG_MUTEX_UNLOCK_PRE(&data_requests_list_mutex[src_node]);
+		VALGRIND_HG_MUTEX_UNLOCK_POST(&data_requests_list_mutex[src_node]);
 		return;
+	}
+	VALGRIND_HG_MUTEX_UNLOCK_PRE(&data_requests_list_mutex[src_node]);
+	VALGRIND_HG_MUTEX_UNLOCK_POST(&data_requests_list_mutex[src_node]);
 
 	/* take all the entries from the request list */
         _STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);

+ 3 - 1
src/datawizard/datawizard.c

@@ -26,12 +26,14 @@
 
 void _starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc)
 {
-#ifdef STARPU_SIMGRID
 #if STARPU_DEVEL
 #warning FIXME
 #endif
+#ifdef STARPU_SIMGRID
 	MSG_process_sleep(0.000010);
 #endif
+	STARPU_UYIELD();
+
 	/* in case some other driver requested data */
 	_starpu_handle_pending_node_data_requests(memory_node);
 	_starpu_handle_node_data_requests(memory_node, may_alloc);

+ 13 - 11
src/datawizard/filters.c

@@ -345,13 +345,14 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 	/* still valid ? */
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
+		struct _starpu_data_replicate *local;
 		/* until an issue is found the data is assumed to be valid */
 		unsigned isvalid = 1;
 
 		for (child = 0; child < root_handle->nchildren; child++)
 		{
 			starpu_data_handle_t child_handle = starpu_data_get_child(root_handle, child);
-			struct _starpu_data_replicate *local = &child_handle->per_node[node];
+			local = &child_handle->per_node[node];
 
 			if (local->state == STARPU_INVALID)
 			{
@@ -359,24 +360,21 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 				isvalid = 0;
 			}
 
-			if (local->allocated && local->automatically_allocated)
-			{
+			if (local->mc && local->allocated && local->automatically_allocated)
 				/* free the child data copy in a lazy fashion */
-#ifdef STARPU_DEVEL
-#warning FIXME!! this needs access to the child interface, which was freed above!
-#endif
-				_starpu_request_mem_chunk_removal(child_handle, node, sizes[child]);
-			}
+				_starpu_request_mem_chunk_removal(child_handle, local, node, sizes[child]);
 		}
 
-		if (!root_handle->per_node[node].allocated)
+		local = &root_handle->per_node[node];
+
+		if (!local->allocated)
 			/* Even if we have all the bits, if we don't have the
 			 * whole data, it's not valid */
 			isvalid = 0;
 
-		if (!isvalid && root_handle->per_node[node].allocated && root_handle->per_node[node].automatically_allocated)
+		if (!isvalid && local->mc && local->allocated && local->automatically_allocated)
 			/* free the data copy in a lazy fashion */
-			_starpu_request_mem_chunk_removal(root_handle, node, _starpu_data_get_size(root_handle));
+			_starpu_request_mem_chunk_removal(root_handle, local, node, _starpu_data_get_size(root_handle));
 
 		/* if there was no invalid copy, the node still has a valid copy */
 		still_valid[node] = isvalid;
@@ -400,6 +398,10 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 		starpu_data_handle_t child_handle = starpu_data_get_child(root_handle, child);
 		_starpu_spin_unlock(&child_handle->header_lock);
 		_starpu_spin_destroy(&child_handle->header_lock);
+
+		_STARPU_PTHREAD_MUTEX_DESTROY(&child_handle->busy_mutex);
+		_STARPU_PTHREAD_COND_DESTROY(&child_handle->busy_cond);
+		_STARPU_PTHREAD_MUTEX_DESTROY(&child_handle->sequential_consistency_mutex);
 	}
 
 	/* there is no child anymore */

+ 1 - 1
src/datawizard/interfaces/block_interface.c

@@ -315,7 +315,7 @@ static void free_block_buffer_on_node(void *data_interface, unsigned node)
 	uint32_t nz = block_interface->nz;
 	size_t elemsize = block_interface->elemsize;
 
-	starpu_free_on_node(node, block_interface->ptr, nx*ny*nz*elemsize);
+	starpu_free_on_node(node, block_interface->dev_handle, nx*ny*nz*elemsize);
 }
 
 #ifdef STARPU_USE_CUDA

+ 51 - 10
src/datawizard/interfaces/data_interface.c

@@ -480,6 +480,8 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 					_STARPU_PTHREAD_COND_WAIT(&arg.cond, &arg.mutex);
 				_STARPU_PTHREAD_MUTEX_UNLOCK(&arg.mutex);
 			}
+			_STARPU_PTHREAD_MUTEX_DESTROY(&arg.mutex);
+			_STARPU_PTHREAD_COND_DESTROY(&arg.cond);
 			_starpu_release_data_on_node(handle, 0, &handle->per_node[home_node]);
 		}
 
@@ -546,23 +548,49 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 
 	/* Wait for all requests to finish (notably WT requests) */
 	_STARPU_PTHREAD_MUTEX_LOCK(&handle->busy_mutex);
-	while (handle->busy_count)
+	while (1) {
+		int busy;
+		/* Note: we here tell valgrind that reading busy_count is as
+		 * safe as if we had the lock held */
+		_STARPU_VALGRIND_HG_SPIN_LOCK_PRE(&handle->header_lock);
+		_STARPU_VALGRIND_HG_SPIN_LOCK_POST(&handle->header_lock);
+		busy = handle->busy_count;
+		_STARPU_VALGRIND_HG_SPIN_UNLOCK_PRE(&handle->header_lock);
+		_STARPU_VALGRIND_HG_SPIN_UNLOCK_POST(&handle->header_lock);
+		if (!busy)
+			break;
+		/* This is woken by _starpu_data_check_not_busy, always called
+		 * after decrementing busy_count */
 		_STARPU_PTHREAD_COND_WAIT(&handle->busy_cond, &handle->busy_mutex);
+	}
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->busy_mutex);
 
 	/* Wait for finished requests to release the handle */
 	_starpu_spin_lock(&handle->header_lock);
 
+	size_t size = _starpu_data_get_size(handle);
+
+	_starpu_data_free_interfaces(handle);
+
 	/* Destroy the data now */
 	unsigned node;
-	size_t size = _starpu_data_get_size(handle);
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
+		struct _starpu_data_replicate *local = &handle->per_node[node];
 		/* free the data copy in a lazy fashion */
-		_starpu_request_mem_chunk_removal(handle, node, size);
+		if (local->allocated && local->automatically_allocated)
+			_starpu_request_mem_chunk_removal(handle, local, node, size);
+	}
+	unsigned worker;
+	unsigned nworkers = starpu_worker_get_count();
+	for (worker = 0; worker < nworkers; worker++)
+	{
+		struct _starpu_data_replicate *local = &handle->per_worker[worker];
+		/* free the data copy in a lazy fashion */
+		if (local->allocated && local->automatically_allocated)
+			_starpu_request_mem_chunk_removal(handle, local, starpu_worker_get_memory_node(worker), size);
 	}
 
-	_starpu_data_free_interfaces(handle);
 	_starpu_memory_stats_free(handle);
 	_starpu_data_requester_list_delete(handle->req_list);
 	_starpu_data_requester_list_delete(handle->reduction_req_list);
@@ -570,6 +598,10 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 	_starpu_spin_unlock(&handle->header_lock);
 	_starpu_spin_destroy(&handle->header_lock);
 
+	_STARPU_PTHREAD_MUTEX_DESTROY(&handle->busy_mutex);
+	_STARPU_PTHREAD_COND_DESTROY(&handle->busy_cond);
+	_STARPU_PTHREAD_MUTEX_DESTROY(&handle->sequential_consistency_mutex);
+
 	free(handle);
 }
 
@@ -604,13 +636,22 @@ static void _starpu_data_invalidate(void *data)
 	{
 		struct _starpu_data_replicate *local = &handle->per_node[node];
 
-		if (local->allocated && local->automatically_allocated)
-		{
+		if (local->mc && local->allocated && local->automatically_allocated)
 			/* free the data copy in a lazy fashion */
-			_starpu_request_mem_chunk_removal(handle, node, size);
-			local->allocated = 0;
-			local->automatically_allocated = 0;
-		}
+			_starpu_request_mem_chunk_removal(handle, local, node, size);
+
+		local->state = STARPU_INVALID;
+	}
+
+	unsigned worker;
+	unsigned nworkers = starpu_worker_get_count();
+	for (worker = 0; worker < nworkers; worker++)
+	{
+		struct _starpu_data_replicate *local = &handle->per_worker[worker];
+
+		if (local->mc && local->allocated && local->automatically_allocated)
+			/* free the data copy in a lazy fashion */
+			_starpu_request_mem_chunk_removal(handle, local, starpu_worker_get_memory_node(worker), size);
 
 		local->state = STARPU_INVALID;
 	}

+ 1 - 1
src/datawizard/interfaces/matrix_interface.c

@@ -291,7 +291,7 @@ static void free_matrix_buffer_on_node(void *data_interface, unsigned node)
 	uint32_t ny = matrix_interface->ny;
 	size_t elemsize = matrix_interface->elemsize;
 
-	starpu_free_on_node(node, matrix_interface->ptr, nx*ny*elemsize);
+	starpu_free_on_node(node, matrix_interface->dev_handle, nx*ny*elemsize);
 }
 
 #ifdef STARPU_USE_CUDA

+ 1 - 1
src/datawizard/interfaces/vector_interface.c

@@ -212,7 +212,7 @@ static void free_vector_buffer_on_node(void *data_interface, unsigned node)
 	uint32_t nx = vector_interface->nx;
 	size_t elemsize = vector_interface->elemsize;
 
-	starpu_free_on_node(node, vector_interface->ptr, nx*elemsize);
+	starpu_free_on_node(node, vector_interface->dev_handle, nx*elemsize);
 }
 
 static int copy_any_to_any(void *src_interface, unsigned src_node,

+ 10 - 8
src/datawizard/malloc.c

@@ -251,11 +251,6 @@ static struct starpu_codelet free_pinned_cl =
 
 int starpu_free_flags(void *A, size_t dim, int flags)
 {
-	if (flags & STARPU_MALLOC_COUNT)
-	{
-		_starpu_memory_manager_deallocate_size(dim, 0);
-	}
-
 #ifndef STARPU_SIMGRID
 	if (flags & STARPU_MALLOC_PINNED)
 	{
@@ -272,7 +267,7 @@ int starpu_free_flags(void *A, size_t dim, int flags)
 				cudaError_t err = cudaFreeHost(A);
 				if (STARPU_UNLIKELY(err))
 					STARPU_CUDA_REPORT_ERROR(err);
-				return 0;
+				goto out;
 #ifndef HAVE_CUDA_MEMCPY_PEER
 			}
 			else
@@ -293,7 +288,7 @@ int starpu_free_flags(void *A, size_t dim, int flags)
 
 				push_res = _starpu_task_submit_internally(task);
 				STARPU_ASSERT(push_res != -ENODEV);
-				return 0;
+				goto out;
 			}
 #endif /* HAVE_CUDA_MEMCPY_PEER */
 #endif /* STARPU_USE_CUDA */
@@ -317,13 +312,20 @@ int starpu_free_flags(void *A, size_t dim, int flags)
 //
 //		push_res = starpu_task_submit(task);
 //		STARPU_ASSERT(push_res != -ENODEV);
-//		return 0;
+//		goto out;
 //	}
 //#endif
 	}
 #endif /* STARPU_SIMGRID */
 
 	free(A);
+
+out:
+	if (flags & STARPU_MALLOC_COUNT)
+	{
+		_starpu_memory_manager_deallocate_size(dim, 0);
+	}
+
 	return 0;
 }
 

+ 31 - 43
src/datawizard/memalloc.c

@@ -62,6 +62,8 @@ void _starpu_deinit_mem_chunk_lists(void)
 		_starpu_mem_chunk_list_delete(mc_list[i]);
 		_starpu_mem_chunk_list_delete(memchunk_cache[i]);
 		_starpu_mem_chunk_lru_list_delete(starpu_lru_list[i]);
+		_starpu_spin_destroy(&lru_rwlock[i]);
+		_STARPU_PTHREAD_RWLOCK_DESTROY(&mc_rwlock[i]);
 	}
 }
 
@@ -694,59 +696,45 @@ static void register_mem_chunk(struct _starpu_data_replicate *replicate, unsigne
  * unregister or unpartition). It puts all the memchunks that refer to the
  * specified handle into the cache.
  */
-void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, unsigned node, size_t size)
+void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned node, size_t size)
 {
-	_starpu_spin_checklocked(&handle->header_lock);
+	struct _starpu_mem_chunk *mc = replicate->mc;
 
-	_STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
+	STARPU_ASSERT(mc->data == handle);
 
-	/* TODO: expensive, handle should have its own list of chunks? */
-	/* iterate over the list of memory chunks and remove the entry */
-	struct _starpu_mem_chunk *mc, *next_mc;
-	for (mc = _starpu_mem_chunk_list_begin(mc_list[node]);
-	     mc != _starpu_mem_chunk_list_end(mc_list[node]);
-	     mc = next_mc)
-	{
-		next_mc = _starpu_mem_chunk_list_next(mc);
+	/* Record the allocated size, so that later in memory
+	 * reclaiming we can estimate how much memory we free
+	 * by freeing this.  */
+	mc->size = size;
 
-		if (mc->data == handle)
-		{
-			/* we found the data */
+	/* This memchunk doesn't have to do with the data any more. */
+	replicate->mc = NULL;
+	replicate->allocated = 0;
+	replicate->automatically_allocated = 0;
 
-			/* Record the allocated size, so that later in memory
-			 * reclaiming we can estimate how much memory we free
-			 * by freeing this.  */
-			mc->size = size;
-			/* This memchunk doesn't have to do with the data any more. */
-			mc->data = NULL;
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
 
-			/* remove it from the main list */
-			_starpu_mem_chunk_list_erase(mc_list[node], mc);
+	mc->data = NULL;
+	/* remove it from the main list */
+	_starpu_mem_chunk_list_erase(mc_list[node], mc);
 
-			/* We would never flush the node 0 cache, unless
-			 * malloc() returns NULL, which is very unlikely... */
-			/* This is particularly important when
-			 * STARPU_USE_ALLOCATION_CACHE is not enabled, as we
-			 * wouldn't even re-use these allocations! */
-			if (starpu_node_get_kind(node) == STARPU_CPU_RAM)
-			{
-				free_memory_on_node(mc, node);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
 
-				free(mc->chunk_interface);
-				_starpu_mem_chunk_delete(mc);
-			}
-			else
-				/* put it in the list of buffers to be removed */
-				_starpu_mem_chunk_list_push_front(memchunk_cache[node], mc);
+	/* We would never flush the node 0 cache, unless
+	 * malloc() returns NULL, which is very unlikely... */
+	/* This is particularly important when
+	 * STARPU_USE_ALLOCATION_CACHE is not enabled, as we
+	 * wouldn't even re-use these allocations! */
+	if (starpu_node_get_kind(node) == STARPU_CPU_RAM)
+	{
+		free_memory_on_node(mc, node);
 
-			/* Note that we do not stop here because there can be
-			 * multiple replicates associated to the same handle on
-			 * the same memory node.  */
-		}
+		free(mc->chunk_interface);
+		_starpu_mem_chunk_delete(mc);
 	}
-
-	/* there was no corresponding buffer ... */
-	_STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+	else
+		/* put it in the list of buffers to be removed */
+		_starpu_mem_chunk_list_push_front(memchunk_cache[node], mc);
 }
 
 /*

+ 1 - 1
src/datawizard/memalloc.h

@@ -62,7 +62,7 @@ LIST_TYPE(_starpu_mem_chunk_lru,
 
 void _starpu_init_mem_chunk_lists(void);
 void _starpu_deinit_mem_chunk_lists(void);
-void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, unsigned node, size_t size);
+void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned node, size_t size);
 int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned is_prefetch);
 size_t _starpu_free_all_automatically_allocated_buffers(unsigned node);
 void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node);

+ 1 - 0
src/datawizard/memory_nodes.c

@@ -55,6 +55,7 @@ void _starpu_memory_nodes_deinit(void)
 	_starpu_deinit_data_request_lists();
 	_starpu_deinit_mem_chunk_lists();
 
+	_STARPU_PTHREAD_RWLOCK_DESTROY(&descr.conditions_rwlock);
 	_STARPU_PTHREAD_KEY_DELETE(memory_node_key);
 }
 

+ 10 - 4
src/datawizard/user_interactions.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -239,11 +239,12 @@ int starpu_data_acquire_on_node(starpu_data_handle_t handle, unsigned node, enum
 		.handle = handle,
 		.mode = mode,
 		.node = node,
-		.cond = _STARPU_PTHREAD_COND_INITIALIZER,
-		.lock = _STARPU_PTHREAD_MUTEX_INITIALIZER,
 		.finished = 0
 	};
 
+	_STARPU_PTHREAD_COND_INIT(&wrapper.cond, NULL);
+	_STARPU_PTHREAD_MUTEX_INIT(&wrapper.lock, NULL);
+
 //	_STARPU_DEBUG("TAKE sequential_consistency_mutex starpu_data_acquire\n");
 	_STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 	int sequential_consistency = handle->sequential_consistency;
@@ -297,8 +298,9 @@ int starpu_data_acquire_on_node(starpu_data_handle_t handle, unsigned node, enum
 		while (!wrapper.finished)
 			_STARPU_PTHREAD_COND_WAIT(&wrapper.cond, &wrapper.lock);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&wrapper.lock);
-		_STARPU_PTHREAD_MUTEX_DESTROY(&wrapper.lock);
 	}
+	_STARPU_PTHREAD_COND_DESTROY(&wrapper.cond);
+	_STARPU_PTHREAD_MUTEX_DESTROY(&wrapper.lock);
 
 	/* At that moment, the caller holds a reference to the piece of data.
 	 * We enqueue the "post" sync task in the list associated to the handle
@@ -381,6 +383,8 @@ int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigne
 		/* we can immediately proceed */
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
 
+		_STARPU_PTHREAD_COND_DESTROY(&wrapper->cond);
+		_STARPU_PTHREAD_MUTEX_DESTROY(&wrapper->lock);
 		free(wrapper);
 
 		_starpu_fetch_data_on_node(handle, replicate, mode, async, async, NULL, NULL);
@@ -410,6 +414,8 @@ int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigne
 		while (!wrapper->finished)
 			_STARPU_PTHREAD_COND_WAIT(&wrapper->cond, &wrapper->lock);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&wrapper->lock);
+		_STARPU_PTHREAD_COND_DESTROY(&wrapper->cond);
+		_STARPU_PTHREAD_MUTEX_DESTROY(&wrapper->lock);
 		free(wrapper);
 	}
 

+ 4 - 2
src/debug/traces/starpu_fxt.c

@@ -86,7 +86,7 @@ static unsigned get_colour_symbol_blue(char *name)
 }
 
 static double last_codelet_start[STARPU_NMAXWORKERS];
-static char last_codelet_symbol[128][STARPU_NMAXWORKERS];
+static char last_codelet_symbol[STARPU_NMAXWORKERS][128];
 
 /* If more than a period of time has elapsed, we flush the profiling info,
  * otherwise they are accumulated everytime there is a new relevant event. */
@@ -144,6 +144,8 @@ static void register_worker_id(unsigned long tid, int workerid)
 
 	HASH_FIND(hh, worker_ids, &tid, sizeof(tid), entry);
 
+	STARPU_ASSERT_MSG(workerid < STARPU_NMAXWORKERS, "Too many workers in this trace, please increase the maximum number of CPUs and GPUs to the same value as was used for execution");
+
 	/* only register a thread once */
 	STARPU_ASSERT(entry == NULL);
 
@@ -506,7 +508,7 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 	unsigned long has_name = ev->param[3];
 	char *name = has_name?(char *)&ev->param[4]:"unknown";
 
-	snprintf(last_codelet_symbol[worker], 128, "%s", name);
+	snprintf(last_codelet_symbol[worker], sizeof(last_codelet_symbol[worker]), "%s", name);
 
 	double start_codelet_time = get_event_time_stamp(ev, options);
 	last_codelet_start[worker] = start_codelet_time;

+ 1 - 1
src/drivers/cpu/driver_cpu.c

@@ -192,7 +192,7 @@ _starpu_get_worker_from_driver(struct starpu_driver *d)
 static size_t _starpu_cpu_get_global_mem_size(int devid, struct _starpu_machine_config *config)
 {
 	size_t global_mem;
-	int limit;
+	ssize_t limit;
 
 	limit = starpu_get_env_number("STARPU_LIMIT_CPU_MEM");
 #ifdef STARPU_DEVEL

+ 3 - 3
src/drivers/cuda/driver_cuda.c

@@ -73,7 +73,7 @@ _starpu_cuda_discover_devices (struct _starpu_machine_config *config)
  */
 static void _starpu_cuda_limit_gpu_mem_if_needed(unsigned devid)
 {
-	int limit;
+	ssize_t limit;
 	size_t STARPU_ATTRIBUTE_UNUSED totalGlobalMem = 0;
 	size_t STARPU_ATTRIBUTE_UNUSED to_waste = 0;
 	char name[30];
@@ -101,8 +101,8 @@ static void _starpu_cuda_limit_gpu_mem_if_needed(unsigned devid)
 	props[devid].totalGlobalMem -= to_waste;
 #endif /* STARPU_USE_CUDA */
 
-	_STARPU_DEBUG("CUDA device %u: Wasting %ld MB / Limit %d MB / Total %ld MB / Remains %ld MB\n",
-			devid, (long) to_waste/(1024*1024), limit, (long) totalGlobalMem/(1024*1024),
+	_STARPU_DEBUG("CUDA device %u: Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n",
+			devid, (long) to_waste/(1024*1024), (long) limit, (long) totalGlobalMem/(1024*1024),
 			(long) (totalGlobalMem - to_waste)/(1024*1024));
 }
 

+ 3 - 2
src/drivers/driver_common/driver_common.c

@@ -174,11 +174,12 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 
 		if (_starpu_worker_can_block(memnode))
 			_STARPU_PTHREAD_COND_WAIT(&args->sched_cond, &args->sched_mutex);
-#ifdef STARPU_SIMGRID
 		else
 		{
 			if (_starpu_machine_is_running())
 			{
+				STARPU_UYIELD();
+#ifdef STARPU_SIMGRID
 				static int warned;
 				if (!warned)
 				{
@@ -186,9 +187,9 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 					_STARPU_DISP("Has to make simgrid spin for progression hooks\n");
 				}
 				MSG_process_sleep(0.000010);
+#endif
 			}
 		}
-#endif
 
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);
 

+ 12 - 4
src/drivers/opencl/driver_opencl.c

@@ -61,7 +61,7 @@ _starpu_opencl_discover_devices(struct _starpu_machine_config *config)
 
 static void _starpu_opencl_limit_gpu_mem_if_needed(unsigned devid)
 {
-	int limit;
+	ssize_t limit;
 	size_t STARPU_ATTRIBUTE_UNUSED totalGlobalMem = 0;
 	size_t STARPU_ATTRIBUTE_UNUSED to_waste = 0;
 	char name[30];
@@ -90,9 +90,9 @@ static void _starpu_opencl_limit_gpu_mem_if_needed(unsigned devid)
 	to_waste = totalGlobalMem - global_mem[devid];
 #endif
 
-	_STARPU_DEBUG("OpenCL device %d: Wasting %ld MB / Limit %d MB / Total %ld MB / Remains %ld MB\n",
-                      devid, (size_t)to_waste/(1024*1024), limit, (size_t)totalGlobalMem/(1024*1024),
-                      (size_t)(totalGlobalMem - to_waste)/(1024*1024));
+	_STARPU_DEBUG("OpenCL device %d: Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n",
+			devid, (long)to_waste/(1024*1024), (long) limit, (long)totalGlobalMem/(1024*1024),
+			(long)(totalGlobalMem - to_waste)/(1024*1024));
 
 }
 
@@ -701,11 +701,19 @@ int _starpu_opencl_driver_deinit(struct starpu_driver *d)
 	unsigned memnode = args->memory_node;
 
 	_starpu_handle_all_pending_node_data_requests(memnode);
+
+	/* In case there remains some memory that was automatically
+	 * allocated by StarPU, we release it now. Note that data
+	 * coherency is not maintained anymore at that point ! */
+	_starpu_free_all_automatically_allocated_buffers(memnode);
+
 #ifndef STARPU_SIMGRID
 	unsigned devid   = args->devid;
         _starpu_opencl_deinit_context(devid);
 #endif
 
+	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_OPENCL_KEY);
+
 	return 0;
 }
 

+ 25 - 15
src/profiling/profiling.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -68,8 +68,10 @@ int _starpu_profiling =
 
 int starpu_profiling_status_set(int status)
 {
+	ANNOTATE_HAPPENS_AFTER(&_starpu_profiling);
 	int prev_value = _starpu_profiling;
 	_starpu_profiling = status;
+	ANNOTATE_HAPPENS_BEFORE(&_starpu_profiling);
 
 	_STARPU_TRACE_SET_PROFILING(status);
 
@@ -94,12 +96,6 @@ int starpu_profiling_status_set(int status)
 	return prev_value;
 }
 
-#undef starpu_profiling_status_get
-int starpu_profiling_status_get(void)
-{
-	return _starpu_profiling;
-}
-
 void _starpu_profiling_init(void)
 {
 	int worker;
@@ -110,7 +106,11 @@ void _starpu_profiling_init(void)
 		_starpu_worker_reset_profiling_info(worker);
 	}
 	if ((env = getenv("STARPU_PROFILING")) && atoi(env))
+	{
+		ANNOTATE_HAPPENS_AFTER(&_starpu_profiling);
 		_starpu_profiling = STARPU_PROFILING_ENABLE;
+		ANNOTATE_HAPPENS_BEFORE(&_starpu_profiling);
+	}
 }
 
 void _starpu_profiling_terminate(void)
@@ -127,7 +127,7 @@ struct starpu_task_profiling_info *_starpu_allocate_profiling_info_if_needed(str
 	struct starpu_task_profiling_info *info = NULL;
 
 	/* If we are benchmarking, we need room for the power consumption */
-	if (_starpu_profiling || (task->cl && task->cl->power_model && (task->cl->power_model->benchmarking || _starpu_get_calibrate_flag())))
+	if (starpu_profiling_status_get() || (task->cl && task->cl->power_model && (task->cl->power_model->benchmarking || _starpu_get_calibrate_flag())))
 	{
 		info = (struct starpu_task_profiling_info *) calloc(1, sizeof(struct starpu_task_profiling_info));
 		STARPU_ASSERT(info);
@@ -191,7 +191,7 @@ void _starpu_worker_reset_profiling_info(int workerid)
 
 void _starpu_worker_restart_sleeping(int workerid)
 {
-	if (_starpu_profiling)
+	if (starpu_profiling_status_get())
 	{
 		struct timespec sleep_start_time;
 		_starpu_clock_gettime(&sleep_start_time);
@@ -205,7 +205,7 @@ void _starpu_worker_restart_sleeping(int workerid)
 
 void _starpu_worker_stop_sleeping(int workerid)
 {
-	if (_starpu_profiling)
+	if (starpu_profiling_status_get())
 	{
 		struct timespec *sleeping_start, sleep_end_time;
 
@@ -240,7 +240,7 @@ void _starpu_worker_stop_sleeping(int workerid)
 
 void _starpu_worker_register_executing_start_date(int workerid, struct timespec *executing_start)
 {
-	if (_starpu_profiling)
+	if (starpu_profiling_status_get())
 	{
 		_STARPU_PTHREAD_MUTEX_LOCK(&worker_info_mutex[workerid]);
 		worker_registered_executing_start[workerid] = 1;
@@ -252,7 +252,7 @@ void _starpu_worker_register_executing_start_date(int workerid, struct timespec
 
 void _starpu_worker_update_profiling_info_executing(int workerid, struct timespec *executing_time, int executed_tasks, uint64_t used_cycles, uint64_t stall_cycles, double power_consumed)
 {
-	if (_starpu_profiling)
+	if (starpu_profiling_status_get())
 	{
 		_STARPU_PTHREAD_MUTEX_LOCK(&worker_info_mutex[workerid]);
 
@@ -272,7 +272,7 @@ void _starpu_worker_update_profiling_info_executing(int workerid, struct timespe
 
 int starpu_worker_get_profiling_info(int workerid, struct starpu_worker_profiling_info *info)
 {
-	if (!_starpu_profiling)
+	if (!starpu_profiling_status_get())
 	{
 		/* Not thread safe, shouldn't be too much a problem */
 		info->executed_tasks = worker_info[workerid].executed_tasks;
@@ -319,7 +319,7 @@ int starpu_worker_get_profiling_info(int workerid, struct starpu_worker_profilin
 /* When did the task reach the scheduler  ? */
 void _starpu_profiling_set_task_push_start_time(struct starpu_task *task)
 {
-	if (!_starpu_profiling)
+	if (!starpu_profiling_status_get())
 		return;
 
 	struct starpu_task_profiling_info *profiling_info;
@@ -331,7 +331,7 @@ void _starpu_profiling_set_task_push_start_time(struct starpu_task *task)
 
 void _starpu_profiling_set_task_push_end_time(struct starpu_task *task)
 {
-	if (!_starpu_profiling)
+	if (!starpu_profiling_status_get())
 		return;
 
 	struct starpu_task_profiling_info *profiling_info;
@@ -429,3 +429,13 @@ void _starpu_bus_update_profiling_info(int src_node, int dst_node, size_t size)
 	bus_profiling_info[src_node][dst_node].transfer_count++;
 //	fprintf(stderr, "PROFILE %d -> %d : %d (cnt %d)\n", src_node, dst_node, size, bus_profiling_info[src_node][dst_node].transfer_count);
 }
+
+#undef starpu_profiling_status_get
+int starpu_profiling_status_get(void)
+{
+	int ret;
+	ANNOTATE_HAPPENS_AFTER(&_starpu_profiling);
+	ret = _starpu_profiling;
+	ANNOTATE_HAPPENS_BEFORE(&_starpu_profiling);
+	return ret;
+}

+ 1 - 0
tests/Makefile.am

@@ -103,6 +103,7 @@ noinst_PROGRAMS =				\
 	main/deprecated_buffer			\
 	main/driver_api/init_run_deinit         \
 	main/driver_api/run_driver              \
+	main/deploop                            \
 	main/restart				\
 	main/execute_on_a_specific_worker	\
 	main/insert_task			\

+ 1 - 1
tests/loader.c

@@ -120,7 +120,7 @@ static void test_cleaner(int sig)
 	fprintf(stderr, "[error] test %s has been blocked for %d seconds. Mark it as failed\n", test_name, timeout);
 	child_gid = getpgid(child_pid);
 	launch_gdb(test_name);
-	kill(-child_gid, SIGKILL);
+	kill(-child_gid, SIGQUIT);
 	exit(EXIT_FAILURE);
 }
 

+ 92 - 0
tests/main/deploop.c

@@ -0,0 +1,92 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * Create task A and B such that
+ * - B depends on A by tag dependency.
+ * - A would depend on B by data dependency, but we disable that.
+ */
+
+#include <pthread.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <starpu.h>
+#include "../helper.h"
+
+static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attribute__ ((unused)))
+{
+	FPRINTF(stderr,"executing task %p\n", starpu_task_get_current());
+}
+
+static struct starpu_codelet dummy_codelet = 
+{
+	.cpu_funcs = {dummy_func, NULL},
+	.cuda_funcs = {dummy_func, NULL},
+	.opencl_funcs = {dummy_func, NULL},
+	.model = NULL,
+	.nbuffers = 1,
+	.modes = { STARPU_RW }
+};
+
+int main(int argc, char **argv)
+{
+	int ret;
+	starpu_data_handle_t handle;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_void_data_register(&handle);
+
+	struct starpu_task *taskA, *taskB;
+
+	/* Make B depend on A */
+	starpu_tag_declare_deps(1, 1, (starpu_tag_t) 0);
+
+	taskA = starpu_task_create();
+	taskA->cl = &dummy_codelet;
+	taskA->tag_id = 0;
+	taskA->use_tag = 1;
+	taskA->handles[0] = handle;
+	taskA->sequential_consistency = 0;
+	FPRINTF(stderr,"A is %p\n", taskA);
+
+	taskB = starpu_task_create();
+	taskB->cl = &dummy_codelet;
+	taskB->tag_id = 1;
+	taskB->use_tag = 1;
+	taskB->handles[0] = handle;
+	FPRINTF(stderr,"B is %p\n", taskB);
+
+	ret = starpu_task_submit(taskB);
+	if (ret == -ENODEV)
+		return STARPU_TEST_SKIPPED;
+	ret = starpu_task_submit(taskA);
+	if (ret == -ENODEV)
+		return STARPU_TEST_SKIPPED;
+
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+
+	starpu_data_unregister(handle);
+
+	starpu_shutdown();
+
+	return EXIT_SUCCESS;
+}

+ 1 - 1
tests/microbenchs/tasks_overhead.c

@@ -155,7 +155,7 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_tag_wait");
 	gettimeofday(&end_exec, NULL);
 
-	for (i = 1; i < ntasks; i++)
+	for (i = 0; i < ntasks; i++)
 		starpu_task_clean(&tasks[i]);
 
 	for (buffer = 0; buffer < nbuffers; buffer++)

+ 19 - 30
tools/valgrind/starpu.suppr

@@ -1,18 +1,4 @@
 {
-   config.running is not racy from starpu_shutdown
-   Helgrind:Race
-   fun:starpu_shutdown
-   ...
-}
-
-{
-   config.running is not racy from _starpu_machine_is_running
-   Helgrind:Race
-   fun:_starpu_machine_is_running
-   ...
-}
-
-{
    don't care about cache hit stats
    Helgrind:Race
    fun:_starpu_msi_cache_hit
@@ -48,37 +34,31 @@
 }
 
 {
-   We do not care about the race on the entry->mean variable, we only want a good-enough estimation.
-   Helgrind:Race
-   fun: _starpu_history_based_job_expected_perf
-   ...
-}
-
-{
    We do not care about races on profiling statistics
    Helgrind:Race
-   fun: starpu_profiling_status_get
+   fun:_starpu_worker_get_status
+   fun:_starpu_worker_reset_profiling_info_with_lock
    ...
 }
 
 {
    This is racy, but since we'll always put the same values, this is not a problem.
    Helgrind:Race
-   fun: _starpu_codelet_check_deprecated_fields
+   fun:_starpu_codelet_check_deprecated_fields
    ...
 }
 
 {
    This is racy, but we don't care, it's only a statistic
    Helgrind:Race
-   fun: starpu_task_nsubmitted
+   fun:starpu_task_nsubmitted
    ...
 }
 
 {
    This is racy, but we don't care, it's only a statistic
    Helgrind:Race
-   fun: starpu_task_nready
+   fun:starpu_task_nready
    ...
 }
 
@@ -92,18 +72,27 @@
 }
 
 {
-   This is racy, but we don't care, if the function was called a bit earlier we would have had a different value
+   This is racy, but keep it away for now, otherwise it clutters the buildbot log
    Helgrind:Race
-   fun: _starpu_fifo_empty
-   fun: pop_task_eager_policy
+   fun:_starpu_fifo_empty
+   fun:pop_task_eager_policy
    ...
 }
 
 {
    This is the counterpart of the suppression above
    Helgrind:Race
-   fun: _starpu_fifo_push_task
-   fun: push_task_eager_policy
+   fun:_starpu_fifo_push_task
+   fun:push_task_eager_policy
    ...
 }
 
+
+{
+   Same as the push suppression above, but through the sorted-push path
+   Helgrind:Race
+   fun:_starpu_fifo_push_sorted_task
+   fun:_starpu_fifo_push_task
+   fun:push_task_eager_policy
+   ...
+}