Marc Sergent, 13 лет назад
Родитель
Commit
a007c4d009
46 измененных файлов с 617 добавлено и 259 удалено
  1. 3 0
      ChangeLog
  2. 8 0
      configure.ac
  3. 7 1
      doc/chapters/basic-api.texi
  4. 5 1
      doc/chapters/configuration.texi
  5. 6 5
      examples/tag_example/tag_restartable.c
  6. 9 2
      include/starpu_profiling.h
  7. 5 0
      include/starpu_task.h
  8. 9 1
      include/starpu_util.h
  9. 6 6
      mpi/examples/Makefile.am
  10. 5 3
      mpi/examples/stencil/stencil5.c
  11. 18 18
      mpi/src/starpu_mpi.c
  12. 25 35
      mpi/src/starpu_mpi_collective.c
  13. 3 0
      mpi/tests/mpi_probe.c
  14. 8 8
      mpi/tests/user_defined_datatype.c
  15. 108 30
      sched_ctx_hypervisor/src/hypervisor_policies/lp2_policy.c
  16. 3 0
      src/common/starpu_spinlock.c
  17. 49 1
      src/common/utils.h
  18. 4 1
      src/core/dependencies/implicit_data_deps.c
  19. 12 5
      src/core/dependencies/tags.c
  20. 12 0
      src/core/perfmodel/perfmodel_history.c
  21. 1 1
      src/core/sched_ctx.c
  22. 8 0
      src/core/task.c
  23. 7 1
      src/core/workers.c
  24. 11 0
      src/datawizard/data_request.c
  25. 3 1
      src/datawizard/datawizard.c
  26. 13 11
      src/datawizard/filters.c
  27. 1 1
      src/datawizard/interfaces/block_interface.c
  28. 51 10
      src/datawizard/interfaces/data_interface.c
  29. 1 1
      src/datawizard/interfaces/matrix_interface.c
  30. 1 1
      src/datawizard/interfaces/vector_interface.c
  31. 10 8
      src/datawizard/malloc.c
  32. 31 43
      src/datawizard/memalloc.c
  33. 1 1
      src/datawizard/memalloc.h
  34. 1 0
      src/datawizard/memory_nodes.c
  35. 10 4
      src/datawizard/user_interactions.c
  36. 4 2
      src/debug/traces/starpu_fxt.c
  37. 1 1
      src/drivers/cpu/driver_cpu.c
  38. 3 3
      src/drivers/cuda/driver_cuda.c
  39. 3 2
      src/drivers/driver_common/driver_common.c
  40. 12 4
      src/drivers/opencl/driver_opencl.c
  41. 25 15
      src/profiling/profiling.c
  42. 1 0
      tests/Makefile.am
  43. 1 1
      tests/loader.c
  44. 92 0
      tests/main/deploop.c
  45. 1 1
      tests/microbenchs/tasks_overhead.c
  46. 19 30
      tools/valgrind/starpu.suppr

+ 3 - 0
ChangeLog

@@ -116,6 +116,9 @@ Small features:
   * File STARPU-REVISION --- containing the SVN revision number from which
   * File STARPU-REVISION --- containing the SVN revision number from which
     StarPU was compiled --- is installed in the share/doc/starpu directory
     StarPU was compiled --- is installed in the share/doc/starpu directory
   * starpu_perfmodel_plot can now directly draw GFlops curves.
   * starpu_perfmodel_plot can now directly draw GFlops curves.
+  * New configure option --enable-mpi-progression-hook to enable the
+    activity polling method for StarPU-MPI.
+  * Permit to disable sequential consistency for a given task.
 
 
 Changes:
 Changes:
   * Fix the block filter functions.
   * Fix the block filter functions.

+ 8 - 0
configure.ac

@@ -221,6 +221,7 @@ AM_CONDITIONAL([STARPU_LONG_CHECK], [test "x$enable_long_check" = "xyes"])
 AC_CHECK_HEADERS([malloc.h], [AC_DEFINE([STARPU_HAVE_MALLOC_H], [1], [Define to 1 if you have the <malloc.h> header file.])])
 AC_CHECK_HEADERS([malloc.h], [AC_DEFINE([STARPU_HAVE_MALLOC_H], [1], [Define to 1 if you have the <malloc.h> header file.])])
 
 
 AC_CHECK_HEADERS([valgrind/valgrind.h], [AC_DEFINE([STARPU_HAVE_VALGRIND_H], [1], [Define to 1 if you have the <valgrind/valgrind.h> header file.])])
 AC_CHECK_HEADERS([valgrind/valgrind.h], [AC_DEFINE([STARPU_HAVE_VALGRIND_H], [1], [Define to 1 if you have the <valgrind/valgrind.h> header file.])])
+AC_CHECK_HEADERS([valgrind/helgrind.h], [AC_DEFINE([STARPU_HAVE_HELGRIND_H], [1], [Define to 1 if you have the <valgrind/helgrind.h> header file.])])
 
 
 # This defines HAVE_SYNC_VAL_COMPARE_AND_SWAP
 # This defines HAVE_SYNC_VAL_COMPARE_AND_SWAP
 STARPU_CHECK_SYNC_VAL_COMPARE_AND_SWAP
 STARPU_CHECK_SYNC_VAL_COMPARE_AND_SWAP
@@ -1234,6 +1235,13 @@ if test x$use_mpi = xyes; then
 	AC_DEFINE(STARPU_USE_MPI,[],[whether the StarPU MPI library is available])
 	AC_DEFINE(STARPU_USE_MPI,[],[whether the StarPU MPI library is available])
 fi
 fi
 
 
+AC_ARG_ENABLE(mpi-progression-hook, [AS_HELP_STRING([--enable-mpi-progression-hook],
+				   [Enable StarPU MPI activity polling method])],
+				   enable_mpi_progression_hook=$enableval, enable_mpi_progression_hook=no)
+if  test x$enable_mpi_progression_hook = xyes; then
+	AC_DEFINE(STARPU_MPI_ACTIVITY, [1], [enable StarPU MPI activity polling method])
+fi
+
 ###############################################################################
 ###############################################################################
 #                                                                             #
 #                                                                             #
 #                               StarPU-Top                                    #
 #                               StarPU-Top                                    #

+ 7 - 1
doc/chapters/basic-api.texi

@@ -1805,9 +1805,15 @@ contained in the @code{tag_id} field. Tag allow the application to synchronize
 with the task and to express task dependencies easily.
 with the task and to express task dependencies easily.
 
 
 @item @code{starpu_tag_t tag_id}
 @item @code{starpu_tag_t tag_id}
-This fields contains the tag associated to the task if the @code{use_tag} field
+This field contains the tag associated to the task if the @code{use_tag} field
 was set, it is ignored otherwise.
 was set, it is ignored otherwise.
 
 
+@item @code{unsigned sequential_consistency}
+If this flag is set (which is the default), sequential consistency is enforced
+for the data parameters of this task for which sequential consistency is
+enabled. Clearing this flag permits to disable sequential consistency for this
+task, even if data have it enabled.
+
 @item @code{unsigned synchronous}
 @item @code{unsigned synchronous}
 If this flag is set, the @code{starpu_task_submit} function is blocking and
 If this flag is set, the @code{starpu_task_submit} function is blocking and
 returns only when the task has been executed (or if no worker is able to
 returns only when the task has been executed (or if no worker is able to

+ 5 - 1
doc/chapters/configuration.texi

@@ -209,10 +209,14 @@ enabled when the GCC compiler provides a plug-in support.
 @end defvr
 @end defvr
 
 
 @defvr {Configure option} --with-mpicc=@var{path}
 @defvr {Configure option} --with-mpicc=@var{path}
-Use the @command{mpicc} compiler at @var{path}, for starpumpi
+Use the @command{mpicc} compiler at @var{path}, for StarPU-MPI.
 (@pxref{StarPU MPI support}).
 (@pxref{StarPU MPI support}).
 @end defvr
 @end defvr
 
 
+@defvr {Configure option} --enable-mpi-progression-hook
+Enable the activity polling method for StarPU-MPI.
+@end defvr
+
 @node Advanced configuration
 @node Advanced configuration
 @subsection Advanced configuration
 @subsection Advanced configuration
 
 

+ 6 - 5
examples/tag_example/tag_restartable.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -106,7 +106,7 @@ static int start_task_grid(unsigned iter)
 	return 0;
 	return 0;
 }
 }
 
 
-void cpu_codelet(void *descr[], void *_args __attribute__((unused)))
+void cpu_codelet(void *descr[] __attribute__((unused)), void *_args __attribute__((unused)))
 {
 {
 /*	int i = (uintptr_t) _args;
 /*	int i = (uintptr_t) _args;
 	printf("doing %x\n", i);
 	printf("doing %x\n", i);
@@ -117,7 +117,7 @@ void cpu_codelet(void *descr[], void *_args __attribute__((unused)))
 
 
 int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 {
 {
-	unsigned i;
+	unsigned i, j;
 	int ret;
 	int ret;
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
@@ -161,8 +161,9 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 	FPRINTF(stderr, "TEST DONE ...\n");
 	FPRINTF(stderr, "TEST DONE ...\n");
 
 
 enodev:
 enodev:
-	for (i = 0; i < Nrolls; i++)
-	{
+	for (i = 0; i < Nrolls; i++) {
+		for (j = 0; j < ni; j++)
+			starpu_task_destroy(tasks[i][j]);
 		free(tasks[i]);
 		free(tasks[i]);
 	}
 	}
 
 

+ 9 - 2
include/starpu_profiling.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -101,8 +101,15 @@ int starpu_profiling_status_set(int status);
  * error. */
  * error. */
 int starpu_profiling_status_get(void);
 int starpu_profiling_status_get(void);
 #ifdef BUILDING_STARPU
 #ifdef BUILDING_STARPU
+#include <common/utils.h>
 extern int _starpu_profiling;
 extern int _starpu_profiling;
-#define starpu_profiling_status_get() _starpu_profiling
+#define starpu_profiling_status_get() ({ \
+	int __ret; \
+	ANNOTATE_HAPPENS_AFTER(&_starpu_profiling); \
+	__ret = _starpu_profiling; \
+	ANNOTATE_HAPPENS_BEFORE(&_starpu_profiling); \
+	__ret; \
+})
 #endif
 #endif
 
 
 /* Get the profiling info associated to a worker, and reset the profiling
 /* Get the profiling info associated to a worker, and reset the profiling

+ 5 - 0
include/starpu_task.h

@@ -129,9 +129,14 @@ struct starpu_task
 	void (*callback_func)(void *);
 	void (*callback_func)(void *);
 	void *callback_arg;
 	void *callback_arg;
 
 
+	/* Whether tag_id should be considered */
 	unsigned use_tag;
 	unsigned use_tag;
+	/* Tag associated with this task */
 	starpu_tag_t tag_id;
 	starpu_tag_t tag_id;
 
 
+	/* Whether we should enforce sequential consistency for this task */
+	unsigned sequential_consistency;
+
 	/* options for the task execution */
 	/* options for the task execution */
 	unsigned synchronous; /* if set, a call to push is blocking */
 	unsigned synchronous; /* if set, a call to push is blocking */
 	int priority; /* STARPU_MAX_PRIO = most important; STARPU_MIN_PRIO = least important */
 	int priority; /* STARPU_MAX_PRIO = most important; STARPU_MIN_PRIO = least important */

+ 9 - 1
include/starpu_util.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -186,6 +186,14 @@ STARPU_ATOMIC_SOMETHING(or, old | value)
 #define STARPU_WMB() STARPU_SYNCHRONIZE()
 #define STARPU_WMB() STARPU_SYNCHRONIZE()
 #endif
 #endif
 
 
+/* This is needed in some places to make valgrind yield to another thread to be
+ * able to progress.  */
+#if defined(__i386__) || defined(__x86_64__)
+#define STARPU_UYIELD() __asm__ __volatile("rep; nop")
+#else
+#define STARPU_UYIELD() ((void)0)
+#endif
+
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }
 #endif
 #endif

+ 6 - 6
mpi/examples/Makefile.am

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 #
-# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2009-2013  Université de Bordeaux 1
 # Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 # Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
@@ -88,7 +88,7 @@ examplebin_PROGRAMS +=				\
 	stencil/stencil5
 	stencil/stencil5
 
 
 stencil_stencil5_LDADD =		\
 stencil_stencil5_LDADD =		\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la -lm
 
 
 starpu_mpi_EXAMPLES	+=	\
 starpu_mpi_EXAMPLES	+=	\
 	stencil/stencil5
 	stencil/stencil5
@@ -106,7 +106,7 @@ examplebin_PROGRAMS += 			\
 mpi_lu_plu_example_float_LDADD =	\
 mpi_lu_plu_example_float_LDADD =	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	$(STARPU_LIBNUMA_LDFLAGS)				\
 	$(STARPU_LIBNUMA_LDFLAGS)				\
-	$(STARPU_BLAS_LDFLAGS)
+	$(STARPU_BLAS_LDFLAGS) -lm
 
 
 mpi_lu_plu_example_float_SOURCES =	\
 mpi_lu_plu_example_float_SOURCES =	\
 	mpi_lu/plu_example_float.c	\
 	mpi_lu/plu_example_float.c	\
@@ -118,7 +118,7 @@ mpi_lu_plu_example_float_SOURCES =	\
 mpi_lu_plu_example_double_LDADD =	\
 mpi_lu_plu_example_double_LDADD =	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	$(STARPU_LIBNUMA_LDFLAGS)				\
 	$(STARPU_LIBNUMA_LDFLAGS)				\
-	$(STARPU_BLAS_LDFLAGS)
+	$(STARPU_BLAS_LDFLAGS) -lm
 
 
 mpi_lu_plu_example_double_SOURCES =	\
 mpi_lu_plu_example_double_SOURCES =	\
 	mpi_lu/plu_example_double.c	\
 	mpi_lu/plu_example_double.c	\
@@ -148,7 +148,7 @@ matrix_decomposition_mpi_cholesky_SOURCES	=		\
 
 
 matrix_decomposition_mpi_cholesky_LDADD =			\
 matrix_decomposition_mpi_cholesky_LDADD =			\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
-	$(STARPU_BLAS_LDFLAGS)
+	$(STARPU_BLAS_LDFLAGS) -lm
 
 
 matrix_decomposition_mpi_cholesky_distributed_SOURCES =	\
 matrix_decomposition_mpi_cholesky_distributed_SOURCES =	\
 	matrix_decomposition/mpi_cholesky_distributed.c	\
 	matrix_decomposition/mpi_cholesky_distributed.c	\
@@ -161,7 +161,7 @@ matrix_decomposition_mpi_cholesky_distributed_SOURCES =	\
 
 
 matrix_decomposition_mpi_cholesky_distributed_LDADD =	\
 matrix_decomposition_mpi_cholesky_distributed_LDADD =	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
-	$(STARPU_BLAS_LDFLAGS)
+	$(STARPU_BLAS_LDFLAGS) -lm
 
 
 starpu_mpi_EXAMPLES +=				\
 starpu_mpi_EXAMPLES +=				\
 	matrix_decomposition/mpi_cholesky			\
 	matrix_decomposition/mpi_cholesky			\

+ 5 - 3
mpi/examples/stencil/stencil5.c

@@ -37,12 +37,14 @@ struct starpu_codelet stencil5_cl =
 };
 };
 
 
 #ifdef STARPU_QUICK_CHECK
 #ifdef STARPU_QUICK_CHECK
-#  define NITER_DEF	10
+#  define NITER_DEF	5
+#  define X         	3
+#  define Y         	3
 #else
 #else
 #  define NITER_DEF	500
 #  define NITER_DEF	500
+#  define X         	20
+#  define Y         	20
 #endif
 #endif
-#define X         20
-#define Y         20
 
 
 int display = 0;
 int display = 0;
 int niter = NITER_DEF;
 int niter = NITER_DEF;

+ 18 - 18
mpi/src/starpu_mpi.c

@@ -23,11 +23,7 @@
 #include <starpu_profiling.h>
 #include <starpu_profiling.h>
 #include <starpu_mpi_stats.h>
 #include <starpu_mpi_stats.h>
 #include <starpu_mpi_insert_task.h>
 #include <starpu_mpi_insert_task.h>
-
-#ifdef STARPU_DEVEL
-#  warning TODO find a better way to select the polling method (perhaps during the configuration)
-#endif
-//#define USE_STARPU_ACTIVITY	1
+#include <common/config.h>
 
 
 static void _starpu_mpi_submit_new_mpi_request(void *arg);
 static void _starpu_mpi_submit_new_mpi_request(void *arg);
 static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req);
 static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req);
@@ -643,6 +639,7 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 		MPI_Status status;
 		MPI_Status status;
 		memset(&status, 0, sizeof(MPI_Status));
 		memset(&status, 0, sizeof(MPI_Status));
 		req->ret = MPI_Recv(req->ptr, req->count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &status);
 		req->ret = MPI_Recv(req->ptr, req->count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &status);
+		STARPU_ASSERT(req->ret == MPI_SUCCESS);
 	}
 	}
 
 
 	if (req->request_type == RECV_REQ || req->request_type == SEND_REQ || req->request_type == PROBE_REQ)
 	if (req->request_type == RECV_REQ || req->request_type == SEND_REQ || req->request_type == PROBE_REQ)
@@ -699,7 +696,7 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 	_STARPU_MPI_LOG_OUT();
 	_STARPU_MPI_LOG_OUT();
 }
 }
 
 
-#ifdef USE_STARPU_ACTIVITY
+#ifdef STARPU_MPI_ACTIVITY
 static unsigned _starpu_mpi_progression_hook_func(void *arg __attribute__((unused)))
 static unsigned _starpu_mpi_progression_hook_func(void *arg __attribute__((unused)))
 {
 {
 	unsigned may_block = 1;
 	unsigned may_block = 1;
@@ -714,7 +711,7 @@ static unsigned _starpu_mpi_progression_hook_func(void *arg __attribute__((unuse
 
 
 	return may_block;
 	return may_block;
 }
 }
-#endif
+#endif /* STARPU_MPI_ACTIVITY */
 
 
 static void _starpu_mpi_test_detached_requests(void)
 static void _starpu_mpi_test_detached_requests(void)
 {
 {
@@ -885,9 +882,9 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		/* shall we block ? */
 		/* shall we block ? */
 		unsigned block = _starpu_mpi_req_list_empty(new_requests);
 		unsigned block = _starpu_mpi_req_list_empty(new_requests);
 
 
-#ifndef USE_STARPU_ACTIVITY
+#ifndef STARPU_MPI_ACTIVITY
 		block = block && _starpu_mpi_req_list_empty(detached_requests);
 		block = block && _starpu_mpi_req_list_empty(detached_requests);
-#endif
+#endif /* STARPU_MPI_ACTIVITY */
 
 
 		if (block)
 		if (block)
 		{
 		{
@@ -946,20 +943,22 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 /*                                                      */
 /*                                                      */
 /********************************************************/
 /********************************************************/
 
 
-#ifdef USE_STARPU_ACTIVITY
+#ifdef STARPU_MPI_ACTIVITY
 static int hookid = - 1;
 static int hookid = - 1;
-#endif
+#endif /* STARPU_MPI_ACTIVITY */
 
 
 static void _starpu_mpi_add_sync_point_in_fxt(void)
 static void _starpu_mpi_add_sync_point_in_fxt(void)
 {
 {
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
 	int rank;
 	int rank;
 	int worldsize;
 	int worldsize;
+	int ret;
+
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
 	MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
 
 
-	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
-	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+	ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
 
 
 	/* We generate a "unique" key so that we can make sure that different
 	/* We generate a "unique" key so that we can make sure that different
 	 * FxT traces come from the same MPI run. */
 	 * FxT traces come from the same MPI run. */
@@ -973,7 +972,8 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 		random_number = rand();
 		random_number = rand();
 	}
 	}
 
 
-	MPI_Bcast(&random_number, 1, MPI_INT, 0, MPI_COMM_WORLD);
+	ret = MPI_Bcast(&random_number, 1, MPI_INT, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
 
 
 	TRACE_MPI_BARRIER(rank, worldsize, random_number);
 	TRACE_MPI_BARRIER(rank, worldsize, random_number);
 
 
@@ -1006,10 +1006,10 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi)
 		_STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
 		_STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
 
-#ifdef USE_STARPU_ACTIVITY
+#ifdef STARPU_MPI_ACTIVITY
 	hookid = starpu_progression_hook_register(progression_hook_func, NULL);
 	hookid = starpu_progression_hook_register(progression_hook_func, NULL);
 	STARPU_ASSERT(hookid >= 0);
 	STARPU_ASSERT(hookid >= 0);
-#endif
+#endif /* STARPU_MPI_ACTIVITY */
 
 
 	_starpu_mpi_add_sync_point_in_fxt();
 	_starpu_mpi_add_sync_point_in_fxt();
 	_starpu_mpi_comm_amounts_init(MPI_COMM_WORLD);
 	_starpu_mpi_comm_amounts_init(MPI_COMM_WORLD);
@@ -1058,9 +1058,9 @@ int starpu_mpi_shutdown(void)
 
 
 	pthread_join(progress_thread, &value);
 	pthread_join(progress_thread, &value);
 
 
-#ifdef USE_STARPU_ACTIVITY
+#ifdef STARPU_MPI_ACTIVITY
 	starpu_progression_hook_deregister(hookid);
 	starpu_progression_hook_deregister(hookid);
-#endif
+#endif /* STARPU_MPI_ACTIVITY */
 
 
 	TRACE_MPI_STOP(rank, world_size);
 	TRACE_MPI_STOP(rank, world_size);
 
 

+ 25 - 35
mpi/src/starpu_mpi_collective.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -42,26 +42,23 @@ int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, i
 {
 {
 	int rank;
 	int rank;
 	int x;
 	int x;
-	struct _callback_arg *callback_arg;
-	void (*callback_func)(void *);
+	struct _callback_arg *callback_arg = NULL;
+	void (*callback_func)(void *) = NULL;
+	void (*callback)(void *);
 
 
 	MPI_Comm_rank(comm, &rank);
 	MPI_Comm_rank(comm, &rank);
 
 
-	callback_func = _callback_collective;
-	callback_arg = malloc(sizeof(struct _callback_arg));
-	callback_arg->count = 0;
-	callback_arg->nb = 0;
-	callback_arg->callback = (rank == root) ? scallback : rcallback;
-	callback_arg->arg = (rank == root) ? sarg : rarg;
-	if (callback_arg->callback == NULL)
+	callback = (rank == root) ? scallback : rcallback;
+	if (callback)
 	{
 	{
-		free(callback_arg);
-		callback_arg = NULL;
-		callback_func = NULL;
-	}
+		callback_func = _callback_collective;
+		callback_arg = malloc(sizeof(struct _callback_arg));
+		callback_arg->count = 0;
+		callback_arg->nb = 0;
+		callback_arg->callback = (rank == root) ? scallback : rcallback;
+		callback_arg->arg = (rank == root) ? sarg : rarg;
+		if (callback_arg->callback == NULL)
 
 
-	if (callback_arg)
-	{
 		for(x = 0; x < count ; x++)
 		for(x = 0; x < count ; x++)
 		{
 		{
 			if (data_handles[x])
 			if (data_handles[x])
@@ -107,29 +104,23 @@ int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, in
 {
 {
 	int rank;
 	int rank;
 	int x;
 	int x;
-	struct _callback_arg *callback_arg;
-	void (*callback_func)(void *);
+	struct _callback_arg *callback_arg = NULL;
+	void (*callback_func)(void *) = NULL;
+	void (*callback)(void *);
 
 
 	MPI_Comm_rank(comm, &rank);
 	MPI_Comm_rank(comm, &rank);
 
 
-#ifdef STARPU_DEVEL
-#warning TODO: callback_arg needs to be free-ed
-#endif
-	callback_func = _callback_collective;
-	callback_arg = malloc(sizeof(struct _callback_arg));
-	callback_arg->count = 0;
-	callback_arg->nb = 0;
-	callback_arg->callback = (rank == root) ? scallback : rcallback;
-	callback_arg->arg = (rank == root) ? sarg : rarg;
-	if (callback_arg->callback == NULL)
+	callback = (rank == root) ? scallback : rcallback;
+	if (callback)
 	{
 	{
-		free(callback_arg);
-		callback_arg = NULL;
-		callback_func = NULL;
-	}
+		callback_func = _callback_collective;
+
+		callback_arg = malloc(sizeof(struct _callback_arg));
+		callback_arg->count = 0;
+		callback_arg->nb = 0;
+		callback_arg->callback = callback;
+		callback_arg->arg = (rank == root) ? sarg : rarg;
 
 
-	if (callback_arg)
-	{
 		for(x = 0; x < count ; x++)
 		for(x = 0; x < count ; x++)
 		{
 		{
 			if (data_handles[x])
 			if (data_handles[x])
@@ -170,4 +161,3 @@ int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, in
 	}
 	}
 	return 0;
 	return 0;
 }
 }
-

+ 3 - 0
mpi/tests/mpi_probe.c

@@ -45,6 +45,8 @@ void callback(void *arg __attribute__((unused)))
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
+	return 77;
+	/*
 	int ret, rank, size;
 	int ret, rank, size;
 
 
 	MPI_Init(NULL, NULL);
 	MPI_Init(NULL, NULL);
@@ -99,4 +101,5 @@ int main(int argc, char **argv)
 	MPI_Finalize();
 	MPI_Finalize();
 
 
 	return 0;
 	return 0;
+	*/
 }
 }

+ 8 - 8
mpi/tests/user_defined_datatype.c

@@ -24,36 +24,36 @@
 #  define ELEMENTS 1000
 #  define ELEMENTS 1000
 #endif
 #endif
 
 
-typedef void (*test_func)(starpu_data_handle_t *, int, int);
+typedef void (*test_func)(starpu_data_handle_t *, int, int, int);
 
 
-void test_handle_irecv_isend_detached(starpu_data_handle_t *handles, int nb_handles, int rank)
+void test_handle_irecv_isend_detached(starpu_data_handle_t *handles, int nb_handles, int rank, int tag)
 {
 {
 	int i;
 	int i;
 
 
 	for(i=0 ; i<nb_handles ; i++)
 	for(i=0 ; i<nb_handles ; i++)
 	{
 	{
 		starpu_data_set_rank(handles[i], 1);
 		starpu_data_set_rank(handles[i], 1);
-		starpu_data_set_tag(handles[i], i+100);
+		starpu_data_set_tag(handles[i], i+tag);
 	}
 	}
 
 
 	for(i=0 ; i<nb_handles ; i++)
 	for(i=0 ; i<nb_handles ; i++)
 		starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, handles[i], 0, NULL, NULL);
 		starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, handles[i], 0, NULL, NULL);
 }
 }
 
 
-void test_handle_recv_send(starpu_data_handle_t *handles, int nb_handles, int rank)
+void test_handle_recv_send(starpu_data_handle_t *handles, int nb_handles, int rank, int tag)
 {
 {
 	int i;
 	int i;
 
 
 	if (rank == 1)
 	if (rank == 1)
 	{
 	{
 		for(i=0 ; i<nb_handles ; i++)
 		for(i=0 ; i<nb_handles ; i++)
-			starpu_mpi_send(handles[i], 0, i+100, MPI_COMM_WORLD);
+			starpu_mpi_send(handles[i], 0, i+tag, MPI_COMM_WORLD);
 	}
 	}
 	else if (rank == 0)
 	else if (rank == 0)
 	{
 	{
 		MPI_Status statuses[nb_handles];
 		MPI_Status statuses[nb_handles];
 		for(i=0 ; i<nb_handles ; i++)
 		for(i=0 ; i<nb_handles ; i++)
-			starpu_mpi_recv(handles[i], 1, i+100, MPI_COMM_WORLD, &statuses[i]);
+			starpu_mpi_recv(handles[i], 1, i+tag, MPI_COMM_WORLD, &statuses[i]);
 	}
 	}
 }
 }
 
 
@@ -126,8 +126,8 @@ int main(int argc, char **argv)
 				starpu_variable_data_register(&handle_vars[i], 0, (uintptr_t)&foo[i], sizeof(double));
 				starpu_variable_data_register(&handle_vars[i], 0, (uintptr_t)&foo[i], sizeof(double));
 			}
 			}
 
 
-			f(handle_vars, ELEMENTS, rank);
-			f(handle_complex, ELEMENTS, rank);
+			f(handle_vars, ELEMENTS, rank, ELEMENTS);
+			f(handle_complex, ELEMENTS, rank, 4*ELEMENTS);
 
 
 			for(i=0 ; i<ELEMENTS ; i++)
 			for(i=0 ; i<ELEMENTS ; i++)
 			{
 			{

+ 108 - 30
sched_ctx_hypervisor/src/hypervisor_policies/lp2_policy.c

@@ -21,8 +21,10 @@
 static struct bound_task_pool *task_pools = NULL;
 static struct bound_task_pool *task_pools = NULL;
 
 
 static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
 static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
-static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double tmax, double w_in_s[ns][nw], int *in_sched_ctxs, int *workers, unsigned interger);
-static unsigned _compute_task_distribution_over_ctxs(int ns, int nw, int nt, double w_in_s[ns][nw], double tasks[nw][nt], int *sched_ctxs, int *workers)
+static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double tmax, double w_in_s[ns][nw], int *in_sched_ctxs, int *workers, unsigned interger,
+			   struct bound_task_pool *tmp_task_pools, unsigned size_ctxs);
+static unsigned _compute_task_distribution_over_ctxs(int ns, int nw, int nt, double w_in_s[ns][nw], double tasks[nw][nt], 
+						     int *sched_ctxs, int *workers, struct bound_task_pool *tmp_task_pools, unsigned size_ctxs)
 {
 {
 	double draft_tasks[nw][nt];
 	double draft_tasks[nw][nt];
 	double draft_w_in_s[ns][nw];
 	double draft_w_in_s[ns][nw];
@@ -45,7 +47,7 @@ static unsigned _compute_task_distribution_over_ctxs(int ns, int nw, int nt, dou
 	/* smallest possible tmax, difficult to obtain as we
 	/* smallest possible tmax, difficult to obtain as we
 	   compute the nr of flops and not the tasks */
 	   compute the nr of flops and not the tasks */
 	double possible_tmax = _lp_get_tmax(nw, workers);
 	double possible_tmax = _lp_get_tmax(nw, workers);
-	double smallest_tmax = possible_tmax / 2;
+	double smallest_tmax = possible_tmax / 3;
 	double tmax = possible_tmax * ns;
 	double tmax = possible_tmax * ns;
 	double res = 1.0;
 	double res = 1.0;
 	unsigned has_sol = 0;
 	unsigned has_sol = 0;
@@ -53,6 +55,7 @@ static unsigned _compute_task_distribution_over_ctxs(int ns, int nw, int nt, dou
 	double old_tmax = 0.0;
 	double old_tmax = 0.0;
 	unsigned found_sol = 0;
 	unsigned found_sol = 0;
 
 
+//	printf("tmin = %lf tmax = %lf \n", tmin, tmax);
 	struct timeval start_time;
 	struct timeval start_time;
 	struct timeval end_time;
 	struct timeval end_time;
 	int nd = 0;
 	int nd = 0;
@@ -65,7 +68,7 @@ static unsigned _compute_task_distribution_over_ctxs(int ns, int nw, int nt, dou
 		/* find solution and save the values in draft tables
 		/* find solution and save the values in draft tables
 		   only if there is a solution for the system we save them
 		   only if there is a solution for the system we save them
 		   in the proper table */
 		   in the proper table */
-		res = _glp_resolve(ns, nw, nt, draft_tasks, tmax, draft_w_in_s, sched_ctxs, workers, 1);
+		res = _glp_resolve(ns, nw, nt, draft_tasks, tmax, draft_w_in_s, sched_ctxs, workers, 1, tmp_task_pools, size_ctxs);
 		if(res != 0.0)
 		if(res != 0.0)
 		{
 		{
 			for(w = 0; w < nw; w++)
 			for(w = 0; w < nw; w++)
@@ -129,7 +132,7 @@ static void _size_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nwor
 
 
 	double w_in_s[ns][nw];
 	double w_in_s[ns][nw];
 	double tasks[nw][nt];
 	double tasks[nw][nt];
-	unsigned found_sol = _compute_task_distribution_over_ctxs(ns, nw, nt, w_in_s, tasks, sched_ctxs, workers);
+	unsigned found_sol = _compute_task_distribution_over_ctxs(ns, nw, nt, w_in_s, tasks, sched_ctxs, workers, task_pools, 1);
 	pthread_mutex_unlock(&mutex);
 	pthread_mutex_unlock(&mutex);
 	/* if we did find at least one solution redistribute the resources */
 	/* if we did find at least one solution redistribute the resources */
 	if(found_sol)
 	if(found_sol)
@@ -194,7 +197,6 @@ static void lp2_handle_submitted_job(struct starpu_task *task, uint32_t footprin
 static void _remove_task_from_pool(struct starpu_task *task, uint32_t footprint)
 static void _remove_task_from_pool(struct starpu_task *task, uint32_t footprint)
 {
 {
 	/* count the tasks of the same type */
 	/* count the tasks of the same type */
-	pthread_mutex_lock(&mutex);
 	struct bound_task_pool *tp = NULL;
 	struct bound_task_pool *tp = NULL;
 
 
 	for (tp = task_pools; tp; tp = tp->next)
 	for (tp = task_pools; tp; tp = tp->next)
@@ -209,20 +211,36 @@ static void _remove_task_from_pool(struct starpu_task *task, uint32_t footprint)
 			tp->n--;
 			tp->n--;
 		else
 		else
 		{
 		{
-			struct bound_task_pool *prev_tp = NULL;
-			for (prev_tp = task_pools; prev_tp; prev_tp = prev_tp->next)
+			if(tp == task_pools)
 			{
 			{
-				if (prev_tp->next == tp)
-					prev_tp->next = tp->next;
+				struct bound_task_pool *next_tp = NULL;
+				if(task_pools->next)
+					next_tp = task_pools->next;
+
+				free(tp);
+				tp = NULL;
+				
+				if(next_tp)
+					task_pools = next_tp;
+				
+			}
+			else
+			{
+				struct bound_task_pool *prev_tp = NULL;
+				for (prev_tp = task_pools; prev_tp; prev_tp = prev_tp->next)
+				{
+					if (prev_tp->next == tp)
+						prev_tp->next = tp->next;
+				}
+				
+				free(tp);
+				tp = NULL;
 			}
 			}
-
-			free(tp);
 		}
 		}
 	}
 	}
-	pthread_mutex_unlock(&mutex);
 }
 }
 
 
-static void _get_tasks_times(int nw, int nt, double times[nw][nt], int *workers)
+static void _get_tasks_times(int nw, int nt, double times[nw][nt], int *workers, unsigned size_ctxs)
 {
 {
         struct bound_task_pool *tp;
         struct bound_task_pool *tp;
         int w, t;
         int w, t;
@@ -230,14 +248,33 @@ static void _get_tasks_times(int nw, int nt, double times[nw][nt], int *workers)
         {
         {
                 for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
                 for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
                 {
                 {
-                        enum starpu_perf_archtype arch = workers == NULL ? starpu_worker_get_perf_archtype(w) :
-				starpu_worker_get_perf_archtype(workers[w]);
+			int worker = workers == NULL ? w : workers[w];
+                        enum starpu_perf_archtype arch = starpu_worker_get_perf_archtype(worker);
                         double length = starpu_history_based_expected_perf(tp->cl->model, arch, tp->footprint);
                         double length = starpu_history_based_expected_perf(tp->cl->model, arch, tp->footprint);
 
 
                         if (isnan(length))
                         if (isnan(length))
                                 times[w][t] = NAN;
                                 times[w][t] = NAN;
-                       else
+			else
+			{
                                 times[w][t] = length / 1000.;
                                 times[w][t] = length / 1000.;
+
+				double transfer_time = 0.0;
+				enum starpu_archtype arch = starpu_worker_get_type(worker);
+				if(arch == STARPU_CUDA_WORKER)
+				{
+					unsigned worker_in_ctx = starpu_sched_ctx_contains_worker(worker, tp->sched_ctx_id);
+					if(!worker_in_ctx && !size_ctxs)
+					{
+						double transfer_velocity = starpu_get_bandwidth_RAM_CUDA(worker);
+						transfer_time +=  (tp->footprint / transfer_velocity) / 1000. ;
+					}
+					double latency = starpu_get_latency_RAM_CUDA(worker);
+					transfer_time += latency/1000.;
+
+				}
+//				printf("%d/%d %s x %d time = %lf transfer_time = %lf\n", w, tp->sched_ctx_id, tp->cl->model->symbol, tp->n, times[w][t], transfer_time);
+				times[w][t] += transfer_time;
+			}
                 }
                 }
         }
         }
 }
 }
@@ -247,9 +284,10 @@ static void _get_tasks_times(int nw, int nt, double times[nw][nt], int *workers)
  */
  */
 #ifdef STARPU_HAVE_GLPK_H
 #ifdef STARPU_HAVE_GLPK_H
 #include <glpk.h>
 #include <glpk.h>
-static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double tmax, double w_in_s[ns][nw], int *in_sched_ctxs, int *workers, unsigned integer)
+static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double tmax, double w_in_s[ns][nw], int *in_sched_ctxs, int *workers, unsigned integer,
+			   struct bound_task_pool *tmp_task_pools, unsigned size_ctxs)
 {
 {
-	if(task_pools == NULL)
+	if(tmp_task_pools == NULL)
 		return 0.0;
 		return 0.0;
 	struct bound_task_pool * tp;
 	struct bound_task_pool * tp;
 	int t, w, s;
 	int t, w, s;
@@ -270,7 +308,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 		int ia[ne], ja[ne];
 		int ia[ne], ja[ne];
 		double ar[ne];
 		double ar[ne];
 
 
-		_get_tasks_times(nw, nt, times, workers);
+		_get_tasks_times(nw, nt, times, workers, size_ctxs);
 
 
 		/* Variables: number of tasks i assigned to worker j, and tmax */
 		/* Variables: number of tasks i assigned to worker j, and tmax */
 		glp_add_cols(lp, nw*nt+ns*nw);
 		glp_add_cols(lp, nw*nt+ns*nw);
@@ -280,7 +318,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 				glp_set_obj_coef(lp, nw*nt+s*nw+w+1, 1.);
 				glp_set_obj_coef(lp, nw*nt+s*nw+w+1, 1.);
 
 
 		for (w = 0; w < nw; w++)
 		for (w = 0; w < nw; w++)
-			for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+			for (t = 0; t < nt; t++)
 			{
 			{
 				char name[32];
 				char name[32];
 				snprintf(name, sizeof(name), "w%dt%dn", w, t);
 				snprintf(name, sizeof(name), "w%dt%dn", w, t);
@@ -313,7 +351,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 		int curr_row_idx = 0;
 		int curr_row_idx = 0;
 		/* Total worker execution time */
 		/* Total worker execution time */
 		glp_add_rows(lp, nw*ns);
 		glp_add_rows(lp, nw*ns);
-		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+		for (t = 0; t < nt; t++)
 		{
 		{
 			int someone = 0;
 			int someone = 0;
 			for (w = 0; w < nw; w++)
 			for (w = 0; w < nw; w++)
@@ -336,7 +374,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 				starpu_worker_get_name(w, name, sizeof(name));
 				starpu_worker_get_name(w, name, sizeof(name));
 				snprintf(title, sizeof(title), "worker %s", name);
 				snprintf(title, sizeof(title), "worker %s", name);
 				glp_set_row_name(lp, curr_row_idx+s*nw+w+1, title);
 				glp_set_row_name(lp, curr_row_idx+s*nw+w+1, title);
-				for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+				for (t = 0, tp = tmp_task_pools; tp; t++, tp = tp->next)
 				{
 				{
 					if((int)tp->sched_ctx_id == sched_ctxs[s])
 					if((int)tp->sched_ctx_id == sched_ctxs[s])
 					{
 					{
@@ -362,7 +400,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 
 
 		/* Total task completion */
 		/* Total task completion */
 		glp_add_rows(lp, nt);
 		glp_add_rows(lp, nt);
-		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+		for (t = 0, tp = tmp_task_pools; tp; t++, tp = tp->next)
 		{
 		{
 			char name[32], title[64];
 			char name[32], title[64];
 			starpu_worker_get_name(w, name, sizeof(name));
 			starpu_worker_get_name(w, name, sizeof(name));
@@ -411,6 +449,12 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 	glp_init_smcp(&parm);
 	glp_init_smcp(&parm);
 	parm.msg_lev = GLP_MSG_OFF;
 	parm.msg_lev = GLP_MSG_OFF;
 	int ret = glp_simplex(lp, &parm);
 	int ret = glp_simplex(lp, &parm);
+
+/* 	char str[50]; */
+/* 	sprintf(str, "outpu_lp_%g", tmax); */
+
+/* 	glp_print_sol(lp, str); */
+
 	if (ret)
 	if (ret)
 	{
 	{
 		printf("error in simplex\n");
 		printf("error in simplex\n");
@@ -449,7 +493,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 
 
 	double res = glp_get_obj_val(lp);
 	double res = glp_get_obj_val(lp);
 	for (w = 0; w < nw; w++)
 	for (w = 0; w < nw; w++)
-		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+		for (t = 0; t < nt; t++)
 /* 			if (integer) */
 /* 			if (integer) */
 /* 				tasks[w][t] = (double)glp_mip_col_val(lp, colnum(w, t)); */
 /* 				tasks[w][t] = (double)glp_mip_col_val(lp, colnum(w, t)); */
 /*                         else */
 /*                         else */
@@ -471,10 +515,18 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 	return res;
 	return res;
 }
 }
 
 
+static struct bound_task_pool* _clone_linked_list(struct bound_task_pool *tp)
+{
+	if(tp == NULL) return NULL;
+
+	struct bound_task_pool *tmp_tp = (struct bound_task_pool*)malloc(sizeof(struct bound_task_pool));
+	memcpy(tmp_tp, tp, sizeof(struct bound_task_pool));
+	tmp_tp->next = _clone_linked_list(tp->next);
+	return tmp_tp;
+}
 
 
 static void lp2_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
 static void lp2_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
 {
 {
-	_remove_task_from_pool(task, footprint);
 	struct sched_ctx_hypervisor_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
 	struct sched_ctx_hypervisor_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
 
 
 	int ret = pthread_mutex_trylock(&act_hypervisor_mutex);
 	int ret = pthread_mutex_trylock(&act_hypervisor_mutex);
@@ -491,24 +543,50 @@ static void lp2_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_
 			int ns = sched_ctx_hypervisor_get_nsched_ctxs();
 			int ns = sched_ctx_hypervisor_get_nsched_ctxs();
 			int nw = starpu_worker_get_count(); /* Number of different workers */
 			int nw = starpu_worker_get_count(); /* Number of different workers */
 			int nt = 0; /* Number of different kinds of tasks */
 			int nt = 0; /* Number of different kinds of tasks */
-			pthread_mutex_lock(&mutex);
-			struct bound_task_pool * tp;
+
+//			pthread_mutex_lock(&mutex);
+
+			/* we don't take the mutex bc a correct value of the number of tasks is
+			   not required but we do a copy in order to be sure
+			   that the linear progr won't segfault if the list of 
+			   submitted task will change during the exec */
+
+			struct bound_task_pool *tp = NULL;
+			struct bound_task_pool *tmp_task_pools = _clone_linked_list(task_pools);
+
 			for (tp = task_pools; tp; tp = tp->next)
 			for (tp = task_pools; tp; tp = tp->next)
 				nt++;
 				nt++;
 
 
+
 			double w_in_s[ns][nw];
 			double w_in_s[ns][nw];
 			double tasks_per_worker[nw][nt];
 			double tasks_per_worker[nw][nt];
 
 
-			unsigned found_sol = _compute_task_distribution_over_ctxs(ns, nw, nt, w_in_s, tasks_per_worker, NULL, NULL);
-			pthread_mutex_unlock(&mutex);
+			unsigned found_sol = _compute_task_distribution_over_ctxs(ns, nw, nt, w_in_s, tasks_per_worker, NULL, NULL, tmp_task_pools, 0);
+//			pthread_mutex_unlock(&mutex);
+
 			/* if we did find at least one solution redistribute the resources */
 			/* if we did find at least one solution redistribute the resources */
 			if(found_sol)
 			if(found_sol)
 				_lp_place_resources_in_ctx(ns, nw, w_in_s, NULL, NULL, 0);
 				_lp_place_resources_in_ctx(ns, nw, w_in_s, NULL, NULL, 0);
 
 
+			struct bound_task_pool *next = NULL;
+			struct bound_task_pool *tmp_tp = tmp_task_pools;
+			while(tmp_task_pools)
+			{
+				next = tmp_tp->next;
+				free(tmp_tp);
+				tmp_tp = next;
+				tmp_task_pools = next;
+			}
+			
 
 
 		}
 		}
 		pthread_mutex_unlock(&act_hypervisor_mutex);
 		pthread_mutex_unlock(&act_hypervisor_mutex);
 	}
 	}
+	/* too expensive to take this mutex and correct value of the number of tasks is not compulsory */
+//	pthread_mutex_lock(&mutex);
+	_remove_task_from_pool(task, footprint);
+//	pthread_mutex_unlock(&mutex);
+
 }
 }
 
 
 
 

+ 3 - 0
src/common/starpu_spinlock.c

@@ -82,6 +82,7 @@ int _starpu_spin_lock(struct _starpu_spinlock *lock)
 		/* Give hand to another thread, hopefully the one which has the
 		/* Give hand to another thread, hopefully the one which has the
 		 * spinlock and probably just has also a short-lived mutex. */
 		 * spinlock and probably just has also a short-lived mutex. */
 		MSG_process_sleep(0.000001);
 		MSG_process_sleep(0.000001);
+		STARPU_UYIELD();
 	}
 	}
 #elif defined(STARPU_SPINLOCK_CHECK)
 #elif defined(STARPU_SPINLOCK_CHECK)
 	int ret = pthread_mutex_lock(&lock->errcheck_lock);
 	int ret = pthread_mutex_lock(&lock->errcheck_lock);
@@ -96,6 +97,8 @@ int _starpu_spin_lock(struct _starpu_spinlock *lock)
 	do
 	do
 	{
 	{
 		prev = STARPU_TEST_AND_SET(&lock->taken, 1);
 		prev = STARPU_TEST_AND_SET(&lock->taken, 1);
+		if (prev)
+			STARPU_UYIELD();
 	}
 	}
 	while (prev);
 	while (prev);
 	return 0;
 	return 0;

+ 49 - 1
src/common/utils.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -31,6 +31,54 @@
 #include <msg/msg.h>
 #include <msg/msg.h>
 #endif
 #endif
 
 
+#ifdef STARPU_HAVE_HELGRIND_H
+#include <valgrind/helgrind.h>
+#endif
+
+#ifndef VALGRIND_HG_MUTEX_LOCK_PRE
+#define VALGRIND_HG_MUTEX_LOCK_PRE(mutex, istrylock) ((void)0)
+#endif
+#ifndef VALGRIND_HG_MUTEX_LOCK_POST
+#define VALGRIND_HG_MUTEX_LOCK_POST(mutex) ((void)0)
+#endif
+#ifndef VALGRIND_HG_MUTEX_UNLOCK_PRE
+#define VALGRIND_HG_MUTEX_UNLOCK_PRE(mutex) ((void)0)
+#endif
+#ifndef VALGRIND_HG_MUTEX_UNLOCK_POST
+#define VALGRIND_HG_MUTEX_UNLOCK_POST(mutex) ((void)0)
+#endif
+#ifndef DO_CREQ_v_WW
+#define DO_CREQ_v_WW(_creqF, _ty1F, _arg1F, _ty2F, _arg2F) ((void)0)
+#endif
+#ifndef DO_CREQ_v_W
+#define DO_CREQ_v_W(_creqF, _ty1F, _arg1F) ((void)0)
+#endif
+#ifndef ANNOTATE_HAPPENS_BEFORE
+#define ANNOTATE_HAPPENS_BEFORE(obj) ((void)0)
+#endif
+#ifndef ANNOTATE_HAPPENS_AFTER
+#define ANNOTATE_HAPPENS_AFTER(obj) ((void)0)
+#endif
+#ifndef ANNOTATE_RWLOCK_ACQUIRED
+#define ANNOTATE_RWLOCK_ACQUIRED(lock, is_w) ((void)0)
+#endif
+#ifndef ANNOTATE_RWLOCK_RELEASED
+#define ANNOTATE_RWLOCK_RELEASED(lock, is_w) ((void)0)
+#endif
+
+#define _STARPU_VALGRIND_HG_SPIN_LOCK_PRE(lock) \
+	DO_CREQ_v_WW(_VG_USERREQ__HG_PTHREAD_SPIN_LOCK_PRE, \
+			struct _starpu_spinlock *, lock, long, 0)
+#define _STARPU_VALGRIND_HG_SPIN_LOCK_POST(lock) \
+	DO_CREQ_v_W(_VG_USERREQ__HG_PTHREAD_SPIN_LOCK_POST, \
+			struct _starpu_spinlock *, lock)
+#define _STARPU_VALGRIND_HG_SPIN_UNLOCK_PRE(lock) \
+	DO_CREQ_v_W(_VG_USERREQ__HG_PTHREAD_SPIN_INIT_OR_UNLOCK_PRE, \
+			struct _starpu_spinlock *, lock)
+#define _STARPU_VALGRIND_HG_SPIN_UNLOCK_POST(lock) \
+	DO_CREQ_v_W(_VG_USERREQ__HG_PTHREAD_SPIN_INIT_OR_UNLOCK_POST, \
+			struct _starpu_spinlock *, lock)
+
 #ifdef STARPU_VERBOSE
 #ifdef STARPU_VERBOSE
 #  define _STARPU_DEBUG(fmt, args ...) do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, "[starpu][%s] " fmt ,__func__ ,##args); fflush(stderr); }} while(0)
 #  define _STARPU_DEBUG(fmt, args ...) do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, "[starpu][%s] " fmt ,__func__ ,##args); fflush(stderr); }} while(0)
 #else
 #else

+ 4 - 1
src/core/dependencies/implicit_data_deps.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -307,6 +307,9 @@ void _starpu_detect_implicit_data_deps(struct starpu_task *task)
 	STARPU_ASSERT(task->cl);
 	STARPU_ASSERT(task->cl);
         _STARPU_LOG_IN();
         _STARPU_LOG_IN();
 
 
+	if (!task->sequential_consistency)
+		return;
+
 	/* We don't want to enforce a sequential consistency for tasks that are
 	/* We don't want to enforce a sequential consistency for tasks that are
 	 * not visible to the application. */
 	 * not visible to the application. */
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);

+ 12 - 5
src/core/dependencies/tags.c

@@ -178,10 +178,8 @@ void _starpu_tag_clear(void)
 	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
 	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
 }
 }
 
 
-static struct _starpu_tag *gettag_struct(starpu_tag_t id)
+static struct _starpu_tag *_gettag_struct(starpu_tag_t id)
 {
 {
-	_STARPU_PTHREAD_RWLOCK_WRLOCK(&tag_global_rwlock);
-
 	/* search if the tag is already declared or not */
 	/* search if the tag is already declared or not */
 	struct _starpu_tag_table *entry;
 	struct _starpu_tag_table *entry;
 	struct _starpu_tag *tag;
 	struct _starpu_tag *tag;
@@ -212,8 +210,15 @@ static struct _starpu_tag *gettag_struct(starpu_tag_t id)
 #endif
 #endif
 	}
 	}
 
 
-	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
+	return tag;
+}
 
 
+static struct _starpu_tag *gettag_struct(starpu_tag_t id)
+{
+	struct _starpu_tag *tag;
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&tag_global_rwlock);
+	tag = _gettag_struct(id);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
 	return tag;
 	return tag;
 }
 }
 
 
@@ -432,10 +437,11 @@ int starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id)
 		return -EDEADLK;
 		return -EDEADLK;
 	}
 	}
 
 
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&tag_global_rwlock);
 	/* only wait the tags that are not done yet */
 	/* only wait the tags that are not done yet */
 	for (i = 0, current = 0; i < ntags; i++)
 	for (i = 0, current = 0; i < ntags; i++)
 	{
 	{
-		struct _starpu_tag *tag = gettag_struct(id[i]);
+		struct _starpu_tag *tag = _gettag_struct(id[i]);
 
 
 		_starpu_spin_lock(&tag->lock);
 		_starpu_spin_lock(&tag->lock);
 
 
@@ -450,6 +456,7 @@ int starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id)
 			current++;
 			current++;
 		}
 		}
 	}
 	}
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
 
 
 	if (current == 0)
 	if (current == 0)
 	{
 	{

+ 12 - 0
src/core/perfmodel/perfmodel_history.c

@@ -942,6 +942,7 @@ int starpu_perfmodel_list(FILE *output)
 
 
 /* This function is intended to be used by external tools that should read the
 /* This function is intended to be used by external tools that should read the
  * performance model files */
  * performance model files */
+/* TODO: write an clear function, to free symbol and history */
 int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *model)
 int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *model)
 {
 {
 	model->symbol = strdup(symbol);
 	model->symbol = strdup(symbol);
@@ -1064,6 +1065,10 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 		HASH_FIND_UINT32_T(history, &key, entry);
 		HASH_FIND_UINT32_T(history, &key, entry);
 		_STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);
 		_STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);
 
 
+		/* We do not care about racing access to the mean, we only want a
+		 * good-enough estimation, thus simulate taking the rdlock */
+		ANNOTATE_RWLOCK_ACQUIRED(&model->model_rwlock, 0);
+
 		if (entry && entry->history_entry && entry->history_entry->nsample >= _STARPU_CALIBRATION_MINIMUM)
 		if (entry && entry->history_entry && entry->history_entry->nsample >= _STARPU_CALIBRATION_MINIMUM)
 			exp = entry->history_entry->mean;
 			exp = entry->history_entry->mean;
 		else if (!model->benchmarking)
 		else if (!model->benchmarking)
@@ -1075,6 +1080,7 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 			_starpu_set_calibrate_flag(1);
 			_starpu_set_calibrate_flag(1);
 			model->benchmarking = 1;
 			model->benchmarking = 1;
 		}
 		}
+		ANNOTATE_RWLOCK_RELEASED(&model->model_rwlock, 0);
 	}
 	}
 
 
 	return exp;
 	return exp;
@@ -1097,6 +1103,10 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, e
 	entry = (elt == NULL) ? NULL : elt->history_entry;
 	entry = (elt == NULL) ? NULL : elt->history_entry;
 	_STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);
 	_STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);
 
 
+	/* We do not care about racing access to the mean, we only want a
+	 * good-enough estimation, thus simulate taking the rdlock */
+	ANNOTATE_RWLOCK_ACQUIRED(&model->model_rwlock, 0);
+
 	exp = entry?entry->mean:NAN;
 	exp = entry?entry->mean:NAN;
 
 
 	if (entry && entry->nsample < _STARPU_CALIBRATION_MINIMUM)
 	if (entry && entry->nsample < _STARPU_CALIBRATION_MINIMUM)
@@ -1115,6 +1125,8 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, e
 		model->benchmarking = 1;
 		model->benchmarking = 1;
 	}
 	}
 
 
+	ANNOTATE_RWLOCK_RELEASED(&model->model_rwlock, 0);
+
 	return exp;
 	return exp;
 }
 }
 
 

+ 1 - 1
src/core/sched_ctx.c

@@ -490,7 +490,7 @@ void starpu_sched_ctx_delete(unsigned sched_ctx_id)
 	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
 	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
 	unsigned nworkers = config->topology.nworkers;
 	unsigned nworkers = config->topology.nworkers;
 
 
-	if(nworkers_ctx > 0 && inheritor_sched_ctx_id != STARPU_NMAX_SCHED_CTXS && 
+	if(nworkers_ctx > 0 && inheritor_sched_ctx && inheritor_sched_ctx->id != STARPU_NMAX_SCHED_CTXS && 
 	   !(nworkers_ctx == nworkers && nworkers_ctx == inheritor_sched_ctx->workers->nworkers))
 	   !(nworkers_ctx == nworkers && nworkers_ctx == inheritor_sched_ctx->workers->nworkers))
 	{
 	{
 		starpu_sched_ctx_add_workers(workerids, nworkers_ctx, inheritor_sched_ctx_id);
 		starpu_sched_ctx_add_workers(workerids, nworkers_ctx, inheritor_sched_ctx_id);

+ 8 - 0
src/core/task.c

@@ -57,6 +57,8 @@ void starpu_task_init(struct starpu_task *task)
 	 * everywhere */
 	 * everywhere */
 	memset(task, 0, sizeof(struct starpu_task));
 	memset(task, 0, sizeof(struct starpu_task));
 
 
+	task->sequential_consistency = 1;
+
 	/* Now we can initialise fields which recquire custom value */
 	/* Now we can initialise fields which recquire custom value */
 #if STARPU_DEFAULT_PRIO != 0
 #if STARPU_DEFAULT_PRIO != 0
 	task->priority = STARPU_DEFAULT_PRIO;
 	task->priority = STARPU_DEFAULT_PRIO;
@@ -707,7 +709,11 @@ void _starpu_decrement_nsubmitted_tasks(void)
 	if (--nsubmitted == 0)
 	if (--nsubmitted == 0)
 	{
 	{
 		if (!config->submitting)
 		if (!config->submitting)
+		{
+			ANNOTATE_HAPPENS_AFTER(&config->running);
 			config->running = 0;
 			config->running = 0;
+			ANNOTATE_HAPPENS_BEFORE(&config->running);
+		}
 		_STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
 		_STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
 	}
 	}
 
 
@@ -727,7 +733,9 @@ starpu_drivers_request_termination(void)
 	config->submitting = 0;
 	config->submitting = 0;
 	if (nsubmitted == 0)
 	if (nsubmitted == 0)
 	{
 	{
+		ANNOTATE_HAPPENS_AFTER(&config->running);
 		config->running = 0;
 		config->running = 0;
+		ANNOTATE_HAPPENS_BEFORE(&config->running);
 		_STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
 		_STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
 	}
 	}
 
 

+ 7 - 1
src/core/workers.c

@@ -893,9 +893,13 @@ out:
 
 
 unsigned _starpu_machine_is_running(void)
 unsigned _starpu_machine_is_running(void)
 {
 {
+	unsigned ret;
 	/* running is just protected by a memory barrier */
 	/* running is just protected by a memory barrier */
 	STARPU_RMB();
 	STARPU_RMB();
-	return config.running;
+	ANNOTATE_HAPPENS_AFTER(&config.running);
+	ret = config.running;
+	ANNOTATE_HAPPENS_BEFORE(&config.running);
+	return ret;
 }
 }
 
 
 unsigned _starpu_worker_can_block(unsigned memnode STARPU_ATTRIBUTE_UNUSED)
 unsigned _starpu_worker_can_block(unsigned memnode STARPU_ATTRIBUTE_UNUSED)
@@ -923,8 +927,10 @@ unsigned _starpu_worker_can_block(unsigned memnode STARPU_ATTRIBUTE_UNUSED)
 static void _starpu_kill_all_workers(struct _starpu_machine_config *pconfig)
 static void _starpu_kill_all_workers(struct _starpu_machine_config *pconfig)
 {
 {
 	/* set the flag which will tell workers to stop */
 	/* set the flag which will tell workers to stop */
+	ANNOTATE_HAPPENS_AFTER(&config.running);
 	pconfig->running = 0;
 	pconfig->running = 0;
 	/* running is just protected by a memory barrier */
 	/* running is just protected by a memory barrier */
+	ANNOTATE_HAPPENS_BEFORE(&config.running);
 	STARPU_WMB();
 	STARPU_WMB();
 	starpu_wake_all_blocked_workers();
 	starpu_wake_all_blocked_workers();
 }
 }

+ 11 - 0
src/datawizard/data_request.c

@@ -17,6 +17,7 @@
 
 
 #include <starpu.h>
 #include <starpu.h>
 #include <common/config.h>
 #include <common/config.h>
+#include <common/utils.h>
 #include <datawizard/datawizard.h>
 #include <datawizard/datawizard.h>
 
 
 /* requests that have not been treated at all */
 /* requests that have not been treated at all */
@@ -391,8 +392,18 @@ void _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)
 	struct _starpu_data_request *r;
 	struct _starpu_data_request *r;
 	struct _starpu_data_request_list *new_data_requests;
 	struct _starpu_data_request_list *new_data_requests;
 
 
+	/* Note: we here tell valgrind that list_empty (reading a pointer) is
+	 * as safe as if we had the lock held */
+	VALGRIND_HG_MUTEX_LOCK_PRE(&data_requests_list_mutex[src_node], 0);
+	VALGRIND_HG_MUTEX_LOCK_POST(&data_requests_list_mutex[src_node]);
 	if (_starpu_data_request_list_empty(data_requests[src_node]))
 	if (_starpu_data_request_list_empty(data_requests[src_node]))
+	{
+		VALGRIND_HG_MUTEX_UNLOCK_PRE(&data_requests_list_mutex[src_node]);
+		VALGRIND_HG_MUTEX_UNLOCK_POST(&data_requests_list_mutex[src_node]);
 		return;
 		return;
+	}
+	VALGRIND_HG_MUTEX_UNLOCK_PRE(&data_requests_list_mutex[src_node]);
+	VALGRIND_HG_MUTEX_UNLOCK_POST(&data_requests_list_mutex[src_node]);
 
 
 	/* take all the entries from the request list */
 	/* take all the entries from the request list */
         _STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
         _STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);

+ 3 - 1
src/datawizard/datawizard.c

@@ -26,12 +26,14 @@
 
 
 void _starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc)
 void _starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc)
 {
 {
-#ifdef STARPU_SIMGRID
 #if STARPU_DEVEL
 #if STARPU_DEVEL
 #warning FIXME
 #warning FIXME
 #endif
 #endif
+#ifdef STARPU_SIMGRID
 	MSG_process_sleep(0.000010);
 	MSG_process_sleep(0.000010);
 #endif
 #endif
+	STARPU_UYIELD();
+
 	/* in case some other driver requested data */
 	/* in case some other driver requested data */
 	_starpu_handle_pending_node_data_requests(memory_node);
 	_starpu_handle_pending_node_data_requests(memory_node);
 	_starpu_handle_node_data_requests(memory_node, may_alloc);
 	_starpu_handle_node_data_requests(memory_node, may_alloc);

+ 13 - 11
src/datawizard/filters.c

@@ -345,13 +345,14 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 	/* still valid ? */
 	/* still valid ? */
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
 	{
+		struct _starpu_data_replicate *local;
 		/* until an issue is found the data is assumed to be valid */
 		/* until an issue is found the data is assumed to be valid */
 		unsigned isvalid = 1;
 		unsigned isvalid = 1;
 
 
 		for (child = 0; child < root_handle->nchildren; child++)
 		for (child = 0; child < root_handle->nchildren; child++)
 		{
 		{
 			starpu_data_handle_t child_handle = starpu_data_get_child(root_handle, child);
 			starpu_data_handle_t child_handle = starpu_data_get_child(root_handle, child);
-			struct _starpu_data_replicate *local = &child_handle->per_node[node];
+			local = &child_handle->per_node[node];
 
 
 			if (local->state == STARPU_INVALID)
 			if (local->state == STARPU_INVALID)
 			{
 			{
@@ -359,24 +360,21 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 				isvalid = 0;
 				isvalid = 0;
 			}
 			}
 
 
-			if (local->allocated && local->automatically_allocated)
-			{
+			if (local->mc && local->allocated && local->automatically_allocated)
 				/* free the child data copy in a lazy fashion */
 				/* free the child data copy in a lazy fashion */
-#ifdef STARPU_DEVEL
-#warning FIXME!! this needs access to the child interface, which was freed above!
-#endif
-				_starpu_request_mem_chunk_removal(child_handle, node, sizes[child]);
-			}
+				_starpu_request_mem_chunk_removal(child_handle, local, node, sizes[child]);
 		}
 		}
 
 
-		if (!root_handle->per_node[node].allocated)
+		local = &root_handle->per_node[node];
+
+		if (!local->allocated)
 			/* Even if we have all the bits, if we don't have the
 			/* Even if we have all the bits, if we don't have the
 			 * whole data, it's not valid */
 			 * whole data, it's not valid */
 			isvalid = 0;
 			isvalid = 0;
 
 
-		if (!isvalid && root_handle->per_node[node].allocated && root_handle->per_node[node].automatically_allocated)
+		if (!isvalid && local->mc && local->allocated && local->automatically_allocated)
 			/* free the data copy in a lazy fashion */
 			/* free the data copy in a lazy fashion */
-			_starpu_request_mem_chunk_removal(root_handle, node, _starpu_data_get_size(root_handle));
+			_starpu_request_mem_chunk_removal(root_handle, local, node, _starpu_data_get_size(root_handle));
 
 
 		/* if there was no invalid copy, the node still has a valid copy */
 		/* if there was no invalid copy, the node still has a valid copy */
 		still_valid[node] = isvalid;
 		still_valid[node] = isvalid;
@@ -400,6 +398,10 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 		starpu_data_handle_t child_handle = starpu_data_get_child(root_handle, child);
 		starpu_data_handle_t child_handle = starpu_data_get_child(root_handle, child);
 		_starpu_spin_unlock(&child_handle->header_lock);
 		_starpu_spin_unlock(&child_handle->header_lock);
 		_starpu_spin_destroy(&child_handle->header_lock);
 		_starpu_spin_destroy(&child_handle->header_lock);
+
+		_STARPU_PTHREAD_MUTEX_DESTROY(&child_handle->busy_mutex);
+		_STARPU_PTHREAD_COND_DESTROY(&child_handle->busy_cond);
+		_STARPU_PTHREAD_MUTEX_DESTROY(&child_handle->sequential_consistency_mutex);
 	}
 	}
 
 
 	/* there is no child anymore */
 	/* there is no child anymore */

+ 1 - 1
src/datawizard/interfaces/block_interface.c

@@ -315,7 +315,7 @@ static void free_block_buffer_on_node(void *data_interface, unsigned node)
 	uint32_t nz = block_interface->nz;
 	uint32_t nz = block_interface->nz;
 	size_t elemsize = block_interface->elemsize;
 	size_t elemsize = block_interface->elemsize;
 
 
-	starpu_free_on_node(node, block_interface->ptr, nx*ny*nz*elemsize);
+	starpu_free_on_node(node, block_interface->dev_handle, nx*ny*nz*elemsize);
 }
 }
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA

+ 51 - 10
src/datawizard/interfaces/data_interface.c

@@ -480,6 +480,8 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 					_STARPU_PTHREAD_COND_WAIT(&arg.cond, &arg.mutex);
 					_STARPU_PTHREAD_COND_WAIT(&arg.cond, &arg.mutex);
 				_STARPU_PTHREAD_MUTEX_UNLOCK(&arg.mutex);
 				_STARPU_PTHREAD_MUTEX_UNLOCK(&arg.mutex);
 			}
 			}
+			_STARPU_PTHREAD_MUTEX_DESTROY(&arg.mutex);
+			_STARPU_PTHREAD_COND_DESTROY(&arg.cond);
 			_starpu_release_data_on_node(handle, 0, &handle->per_node[home_node]);
 			_starpu_release_data_on_node(handle, 0, &handle->per_node[home_node]);
 		}
 		}
 
 
@@ -546,23 +548,49 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 
 
 	/* Wait for all requests to finish (notably WT requests) */
 	/* Wait for all requests to finish (notably WT requests) */
 	_STARPU_PTHREAD_MUTEX_LOCK(&handle->busy_mutex);
 	_STARPU_PTHREAD_MUTEX_LOCK(&handle->busy_mutex);
-	while (handle->busy_count)
+	while (1) {
+		int busy;
+		/* Note: we here tell valgrind that reading busy_count is as
+		 * safe as if we had the lock held */
+		_STARPU_VALGRIND_HG_SPIN_LOCK_PRE(&handle->header_lock);
+		_STARPU_VALGRIND_HG_SPIN_LOCK_POST(&handle->header_lock);
+		busy = handle->busy_count;
+		_STARPU_VALGRIND_HG_SPIN_UNLOCK_PRE(&handle->header_lock);
+		_STARPU_VALGRIND_HG_SPIN_UNLOCK_POST(&handle->header_lock);
+		if (!busy)
+			break;
+		/* This is woken by _starpu_data_check_not_busy, always called
+		 * after decrementing busy_count */
 		_STARPU_PTHREAD_COND_WAIT(&handle->busy_cond, &handle->busy_mutex);
 		_STARPU_PTHREAD_COND_WAIT(&handle->busy_cond, &handle->busy_mutex);
+	}
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->busy_mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->busy_mutex);
 
 
 	/* Wait for finished requests to release the handle */
 	/* Wait for finished requests to release the handle */
 	_starpu_spin_lock(&handle->header_lock);
 	_starpu_spin_lock(&handle->header_lock);
 
 
+	size_t size = _starpu_data_get_size(handle);
+
+	_starpu_data_free_interfaces(handle);
+
 	/* Destroy the data now */
 	/* Destroy the data now */
 	unsigned node;
 	unsigned node;
-	size_t size = _starpu_data_get_size(handle);
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
 	{
+		struct _starpu_data_replicate *local = &handle->per_node[node];
 		/* free the data copy in a lazy fashion */
 		/* free the data copy in a lazy fashion */
-		_starpu_request_mem_chunk_removal(handle, node, size);
+		if (local->allocated && local->automatically_allocated)
+			_starpu_request_mem_chunk_removal(handle, local, node, size);
+	}
+	unsigned worker;
+	unsigned nworkers = starpu_worker_get_count();
+	for (worker = 0; worker < nworkers; worker++)
+	{
+		struct _starpu_data_replicate *local = &handle->per_worker[worker];
+		/* free the data copy in a lazy fashion */
+		if (local->allocated && local->automatically_allocated)
+			_starpu_request_mem_chunk_removal(handle, local, starpu_worker_get_memory_node(worker), size);
 	}
 	}
 
 
-	_starpu_data_free_interfaces(handle);
 	_starpu_memory_stats_free(handle);
 	_starpu_memory_stats_free(handle);
 	_starpu_data_requester_list_delete(handle->req_list);
 	_starpu_data_requester_list_delete(handle->req_list);
 	_starpu_data_requester_list_delete(handle->reduction_req_list);
 	_starpu_data_requester_list_delete(handle->reduction_req_list);
@@ -570,6 +598,10 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 	_starpu_spin_unlock(&handle->header_lock);
 	_starpu_spin_unlock(&handle->header_lock);
 	_starpu_spin_destroy(&handle->header_lock);
 	_starpu_spin_destroy(&handle->header_lock);
 
 
+	_STARPU_PTHREAD_MUTEX_DESTROY(&handle->busy_mutex);
+	_STARPU_PTHREAD_COND_DESTROY(&handle->busy_cond);
+	_STARPU_PTHREAD_MUTEX_DESTROY(&handle->sequential_consistency_mutex);
+
 	free(handle);
 	free(handle);
 }
 }
 
 
@@ -604,13 +636,22 @@ static void _starpu_data_invalidate(void *data)
 	{
 	{
 		struct _starpu_data_replicate *local = &handle->per_node[node];
 		struct _starpu_data_replicate *local = &handle->per_node[node];
 
 
-		if (local->allocated && local->automatically_allocated)
-		{
+		if (local->mc && local->allocated && local->automatically_allocated)
 			/* free the data copy in a lazy fashion */
 			/* free the data copy in a lazy fashion */
-			_starpu_request_mem_chunk_removal(handle, node, size);
-			local->allocated = 0;
-			local->automatically_allocated = 0;
-		}
+			_starpu_request_mem_chunk_removal(handle, local, node, size);
+
+		local->state = STARPU_INVALID;
+	}
+
+	unsigned worker;
+	unsigned nworkers = starpu_worker_get_count();
+	for (worker = 0; worker < nworkers; worker++)
+	{
+		struct _starpu_data_replicate *local = &handle->per_worker[worker];
+
+		if (local->mc && local->allocated && local->automatically_allocated)
+			/* free the data copy in a lazy fashion */
+			_starpu_request_mem_chunk_removal(handle, local, starpu_worker_get_memory_node(worker), size);
 
 
 		local->state = STARPU_INVALID;
 		local->state = STARPU_INVALID;
 	}
 	}

+ 1 - 1
src/datawizard/interfaces/matrix_interface.c

@@ -291,7 +291,7 @@ static void free_matrix_buffer_on_node(void *data_interface, unsigned node)
 	uint32_t ny = matrix_interface->ny;
 	uint32_t ny = matrix_interface->ny;
 	size_t elemsize = matrix_interface->elemsize;
 	size_t elemsize = matrix_interface->elemsize;
 
 
-	starpu_free_on_node(node, matrix_interface->ptr, nx*ny*elemsize);
+	starpu_free_on_node(node, matrix_interface->dev_handle, nx*ny*elemsize);
 }
 }
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA

+ 1 - 1
src/datawizard/interfaces/vector_interface.c

@@ -212,7 +212,7 @@ static void free_vector_buffer_on_node(void *data_interface, unsigned node)
 	uint32_t nx = vector_interface->nx;
 	uint32_t nx = vector_interface->nx;
 	size_t elemsize = vector_interface->elemsize;
 	size_t elemsize = vector_interface->elemsize;
 
 
-	starpu_free_on_node(node, vector_interface->ptr, nx*elemsize);
+	starpu_free_on_node(node, vector_interface->dev_handle, nx*elemsize);
 }
 }
 
 
 static int copy_any_to_any(void *src_interface, unsigned src_node,
 static int copy_any_to_any(void *src_interface, unsigned src_node,

+ 10 - 8
src/datawizard/malloc.c

@@ -251,11 +251,6 @@ static struct starpu_codelet free_pinned_cl =
 
 
 int starpu_free_flags(void *A, size_t dim, int flags)
 int starpu_free_flags(void *A, size_t dim, int flags)
 {
 {
-	if (flags & STARPU_MALLOC_COUNT)
-	{
-		_starpu_memory_manager_deallocate_size(dim, 0);
-	}
-
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
 	if (flags & STARPU_MALLOC_PINNED)
 	if (flags & STARPU_MALLOC_PINNED)
 	{
 	{
@@ -272,7 +267,7 @@ int starpu_free_flags(void *A, size_t dim, int flags)
 				cudaError_t err = cudaFreeHost(A);
 				cudaError_t err = cudaFreeHost(A);
 				if (STARPU_UNLIKELY(err))
 				if (STARPU_UNLIKELY(err))
 					STARPU_CUDA_REPORT_ERROR(err);
 					STARPU_CUDA_REPORT_ERROR(err);
-				return 0;
+				goto out;
 #ifndef HAVE_CUDA_MEMCPY_PEER
 #ifndef HAVE_CUDA_MEMCPY_PEER
 			}
 			}
 			else
 			else
@@ -293,7 +288,7 @@ int starpu_free_flags(void *A, size_t dim, int flags)
 
 
 				push_res = _starpu_task_submit_internally(task);
 				push_res = _starpu_task_submit_internally(task);
 				STARPU_ASSERT(push_res != -ENODEV);
 				STARPU_ASSERT(push_res != -ENODEV);
-				return 0;
+				goto out;
 			}
 			}
 #endif /* HAVE_CUDA_MEMCPY_PEER */
 #endif /* HAVE_CUDA_MEMCPY_PEER */
 #endif /* STARPU_USE_CUDA */
 #endif /* STARPU_USE_CUDA */
@@ -317,13 +312,20 @@ int starpu_free_flags(void *A, size_t dim, int flags)
 //
 //
 //		push_res = starpu_task_submit(task);
 //		push_res = starpu_task_submit(task);
 //		STARPU_ASSERT(push_res != -ENODEV);
 //		STARPU_ASSERT(push_res != -ENODEV);
-//		return 0;
+//		goto out;
 //	}
 //	}
 //#endif
 //#endif
 	}
 	}
 #endif /* STARPU_SIMGRID */
 #endif /* STARPU_SIMGRID */
 
 
 	free(A);
 	free(A);
+
+out:
+	if (flags & STARPU_MALLOC_COUNT)
+	{
+		_starpu_memory_manager_deallocate_size(dim, 0);
+	}
+
 	return 0;
 	return 0;
 }
 }
 
 

+ 31 - 43
src/datawizard/memalloc.c

@@ -62,6 +62,8 @@ void _starpu_deinit_mem_chunk_lists(void)
 		_starpu_mem_chunk_list_delete(mc_list[i]);
 		_starpu_mem_chunk_list_delete(mc_list[i]);
 		_starpu_mem_chunk_list_delete(memchunk_cache[i]);
 		_starpu_mem_chunk_list_delete(memchunk_cache[i]);
 		_starpu_mem_chunk_lru_list_delete(starpu_lru_list[i]);
 		_starpu_mem_chunk_lru_list_delete(starpu_lru_list[i]);
+		_starpu_spin_destroy(&lru_rwlock[i]);
+		_STARPU_PTHREAD_RWLOCK_DESTROY(&mc_rwlock[i]);
 	}
 	}
 }
 }
 
 
@@ -694,59 +696,45 @@ static void register_mem_chunk(struct _starpu_data_replicate *replicate, unsigne
  * unregister or unpartition). It puts all the memchunks that refer to the
  * unregister or unpartition). It puts all the memchunks that refer to the
  * specified handle into the cache.
  * specified handle into the cache.
  */
  */
-void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, unsigned node, size_t size)
+void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned node, size_t size)
 {
 {
-	_starpu_spin_checklocked(&handle->header_lock);
+	struct _starpu_mem_chunk *mc = replicate->mc;
 
 
-	_STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
+	STARPU_ASSERT(mc->data == handle);
 
 
-	/* TODO: expensive, handle should have its own list of chunks? */
-	/* iterate over the list of memory chunks and remove the entry */
-	struct _starpu_mem_chunk *mc, *next_mc;
-	for (mc = _starpu_mem_chunk_list_begin(mc_list[node]);
-	     mc != _starpu_mem_chunk_list_end(mc_list[node]);
-	     mc = next_mc)
-	{
-		next_mc = _starpu_mem_chunk_list_next(mc);
+	/* Record the allocated size, so that later in memory
+	 * reclaiming we can estimate how much memory we free
+	 * by freeing this.  */
+	mc->size = size;
 
 
-		if (mc->data == handle)
-		{
-			/* we found the data */
+	/* This memchunk doesn't have to do with the data any more. */
+	replicate->mc = NULL;
+	replicate->allocated = 0;
+	replicate->automatically_allocated = 0;
 
 
-			/* Record the allocated size, so that later in memory
-			 * reclaiming we can estimate how much memory we free
-			 * by freeing this.  */
-			mc->size = size;
-			/* This memchunk doesn't have to do with the data any more. */
-			mc->data = NULL;
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
 
 
-			/* remove it from the main list */
-			_starpu_mem_chunk_list_erase(mc_list[node], mc);
+	mc->data = NULL;
+	/* remove it from the main list */
+	_starpu_mem_chunk_list_erase(mc_list[node], mc);
 
 
-			/* We would never flush the node 0 cache, unless
-			 * malloc() returns NULL, which is very unlikely... */
-			/* This is particularly important when
-			 * STARPU_USE_ALLOCATION_CACHE is not enabled, as we
-			 * wouldn't even re-use these allocations! */
-			if (starpu_node_get_kind(node) == STARPU_CPU_RAM)
-			{
-				free_memory_on_node(mc, node);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
 
 
-				free(mc->chunk_interface);
-				_starpu_mem_chunk_delete(mc);
-			}
-			else
-				/* put it in the list of buffers to be removed */
-				_starpu_mem_chunk_list_push_front(memchunk_cache[node], mc);
+	/* We would never flush the node 0 cache, unless
+	 * malloc() returns NULL, which is very unlikely... */
+	/* This is particularly important when
+	 * STARPU_USE_ALLOCATION_CACHE is not enabled, as we
+	 * wouldn't even re-use these allocations! */
+	if (starpu_node_get_kind(node) == STARPU_CPU_RAM)
+	{
+		free_memory_on_node(mc, node);
 
 
-			/* Note that we do not stop here because there can be
-			 * multiple replicates associated to the same handle on
-			 * the same memory node.  */
-		}
+		free(mc->chunk_interface);
+		_starpu_mem_chunk_delete(mc);
 	}
 	}
-
-	/* there was no corresponding buffer ... */
-	_STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+	else
+		/* put it in the list of buffers to be removed */
+		_starpu_mem_chunk_list_push_front(memchunk_cache[node], mc);
 }
 }
 
 
 /*
 /*

+ 1 - 1
src/datawizard/memalloc.h

@@ -62,7 +62,7 @@ LIST_TYPE(_starpu_mem_chunk_lru,
 
 
 void _starpu_init_mem_chunk_lists(void);
 void _starpu_init_mem_chunk_lists(void);
 void _starpu_deinit_mem_chunk_lists(void);
 void _starpu_deinit_mem_chunk_lists(void);
-void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, unsigned node, size_t size);
+void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned node, size_t size);
 int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned is_prefetch);
 int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned is_prefetch);
 size_t _starpu_free_all_automatically_allocated_buffers(unsigned node);
 size_t _starpu_free_all_automatically_allocated_buffers(unsigned node);
 void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node);
 void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node);

+ 1 - 0
src/datawizard/memory_nodes.c

@@ -55,6 +55,7 @@ void _starpu_memory_nodes_deinit(void)
 	_starpu_deinit_data_request_lists();
 	_starpu_deinit_data_request_lists();
 	_starpu_deinit_mem_chunk_lists();
 	_starpu_deinit_mem_chunk_lists();
 
 
+	_STARPU_PTHREAD_RWLOCK_DESTROY(&descr.conditions_rwlock);
 	_STARPU_PTHREAD_KEY_DELETE(memory_node_key);
 	_STARPU_PTHREAD_KEY_DELETE(memory_node_key);
 }
 }
 
 

+ 10 - 4
src/datawizard/user_interactions.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -239,11 +239,12 @@ int starpu_data_acquire_on_node(starpu_data_handle_t handle, unsigned node, enum
 		.handle = handle,
 		.handle = handle,
 		.mode = mode,
 		.mode = mode,
 		.node = node,
 		.node = node,
-		.cond = _STARPU_PTHREAD_COND_INITIALIZER,
-		.lock = _STARPU_PTHREAD_MUTEX_INITIALIZER,
 		.finished = 0
 		.finished = 0
 	};
 	};
 
 
+	_STARPU_PTHREAD_COND_INIT(&wrapper.cond, NULL);
+	_STARPU_PTHREAD_MUTEX_INIT(&wrapper.lock, NULL);
+
 //	_STARPU_DEBUG("TAKE sequential_consistency_mutex starpu_data_acquire\n");
 //	_STARPU_DEBUG("TAKE sequential_consistency_mutex starpu_data_acquire\n");
 	_STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 	_STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 	int sequential_consistency = handle->sequential_consistency;
 	int sequential_consistency = handle->sequential_consistency;
@@ -297,8 +298,9 @@ int starpu_data_acquire_on_node(starpu_data_handle_t handle, unsigned node, enum
 		while (!wrapper.finished)
 		while (!wrapper.finished)
 			_STARPU_PTHREAD_COND_WAIT(&wrapper.cond, &wrapper.lock);
 			_STARPU_PTHREAD_COND_WAIT(&wrapper.cond, &wrapper.lock);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&wrapper.lock);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&wrapper.lock);
-		_STARPU_PTHREAD_MUTEX_DESTROY(&wrapper.lock);
 	}
 	}
+	_STARPU_PTHREAD_COND_DESTROY(&wrapper.cond);
+	_STARPU_PTHREAD_MUTEX_DESTROY(&wrapper.lock);
 
 
 	/* At that moment, the caller holds a reference to the piece of data.
 	/* At that moment, the caller holds a reference to the piece of data.
 	 * We enqueue the "post" sync task in the list associated to the handle
 	 * We enqueue the "post" sync task in the list associated to the handle
@@ -381,6 +383,8 @@ int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigne
 		/* we can immediately proceed */
 		/* we can immediately proceed */
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
 
 
+		_STARPU_PTHREAD_COND_DESTROY(&wrapper->cond);
+		_STARPU_PTHREAD_MUTEX_DESTROY(&wrapper->lock);
 		free(wrapper);
 		free(wrapper);
 
 
 		_starpu_fetch_data_on_node(handle, replicate, mode, async, async, NULL, NULL);
 		_starpu_fetch_data_on_node(handle, replicate, mode, async, async, NULL, NULL);
@@ -410,6 +414,8 @@ int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigne
 		while (!wrapper->finished)
 		while (!wrapper->finished)
 			_STARPU_PTHREAD_COND_WAIT(&wrapper->cond, &wrapper->lock);
 			_STARPU_PTHREAD_COND_WAIT(&wrapper->cond, &wrapper->lock);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&wrapper->lock);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&wrapper->lock);
+		_STARPU_PTHREAD_COND_DESTROY(&wrapper->cond);
+		_STARPU_PTHREAD_MUTEX_DESTROY(&wrapper->lock);
 		free(wrapper);
 		free(wrapper);
 	}
 	}
 
 

+ 4 - 2
src/debug/traces/starpu_fxt.c

@@ -86,7 +86,7 @@ static unsigned get_colour_symbol_blue(char *name)
 }
 }
 
 
 static double last_codelet_start[STARPU_NMAXWORKERS];
 static double last_codelet_start[STARPU_NMAXWORKERS];
-static char last_codelet_symbol[128][STARPU_NMAXWORKERS];
+static char last_codelet_symbol[STARPU_NMAXWORKERS][128];
 
 
 /* If more than a period of time has elapsed, we flush the profiling info,
 /* If more than a period of time has elapsed, we flush the profiling info,
  * otherwise they are accumulated everytime there is a new relevant event. */
  * otherwise they are accumulated everytime there is a new relevant event. */
@@ -144,6 +144,8 @@ static void register_worker_id(unsigned long tid, int workerid)
 
 
 	HASH_FIND(hh, worker_ids, &tid, sizeof(tid), entry);
 	HASH_FIND(hh, worker_ids, &tid, sizeof(tid), entry);
 
 
+	STARPU_ASSERT_MSG(workerid < STARPU_NMAXWORKERS, "Too many workers in this trace, please increase the maximum number of CPUs and GPUs to the same value as was used for execution");
+
 	/* only register a thread once */
 	/* only register a thread once */
 	STARPU_ASSERT(entry == NULL);
 	STARPU_ASSERT(entry == NULL);
 
 
@@ -506,7 +508,7 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 	unsigned long has_name = ev->param[3];
 	unsigned long has_name = ev->param[3];
 	char *name = has_name?(char *)&ev->param[4]:"unknown";
 	char *name = has_name?(char *)&ev->param[4]:"unknown";
 
 
-	snprintf(last_codelet_symbol[worker], 128, "%s", name);
+	snprintf(last_codelet_symbol[worker], sizeof(last_codelet_symbol[worker]), "%s", name);
 
 
 	double start_codelet_time = get_event_time_stamp(ev, options);
 	double start_codelet_time = get_event_time_stamp(ev, options);
 	last_codelet_start[worker] = start_codelet_time;
 	last_codelet_start[worker] = start_codelet_time;

+ 1 - 1
src/drivers/cpu/driver_cpu.c

@@ -192,7 +192,7 @@ _starpu_get_worker_from_driver(struct starpu_driver *d)
 static size_t _starpu_cpu_get_global_mem_size(int devid, struct _starpu_machine_config *config)
 static size_t _starpu_cpu_get_global_mem_size(int devid, struct _starpu_machine_config *config)
 {
 {
 	size_t global_mem;
 	size_t global_mem;
-	int limit;
+	ssize_t limit;
 
 
 	limit = starpu_get_env_number("STARPU_LIMIT_CPU_MEM");
 	limit = starpu_get_env_number("STARPU_LIMIT_CPU_MEM");
 #ifdef STARPU_DEVEL
 #ifdef STARPU_DEVEL

+ 3 - 3
src/drivers/cuda/driver_cuda.c

@@ -73,7 +73,7 @@ _starpu_cuda_discover_devices (struct _starpu_machine_config *config)
  */
  */
 static void _starpu_cuda_limit_gpu_mem_if_needed(unsigned devid)
 static void _starpu_cuda_limit_gpu_mem_if_needed(unsigned devid)
 {
 {
-	int limit;
+	ssize_t limit;
 	size_t STARPU_ATTRIBUTE_UNUSED totalGlobalMem = 0;
 	size_t STARPU_ATTRIBUTE_UNUSED totalGlobalMem = 0;
 	size_t STARPU_ATTRIBUTE_UNUSED to_waste = 0;
 	size_t STARPU_ATTRIBUTE_UNUSED to_waste = 0;
 	char name[30];
 	char name[30];
@@ -101,8 +101,8 @@ static void _starpu_cuda_limit_gpu_mem_if_needed(unsigned devid)
 	props[devid].totalGlobalMem -= to_waste;
 	props[devid].totalGlobalMem -= to_waste;
 #endif /* STARPU_USE_CUDA */
 #endif /* STARPU_USE_CUDA */
 
 
-	_STARPU_DEBUG("CUDA device %u: Wasting %ld MB / Limit %d MB / Total %ld MB / Remains %ld MB\n",
-			devid, (long) to_waste/(1024*1024), limit, (long) totalGlobalMem/(1024*1024),
+	_STARPU_DEBUG("CUDA device %u: Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n",
+			devid, (long) to_waste/(1024*1024), (long) limit, (long) totalGlobalMem/(1024*1024),
 			(long) (totalGlobalMem - to_waste)/(1024*1024));
 			(long) (totalGlobalMem - to_waste)/(1024*1024));
 }
 }
 
 

+ 3 - 2
src/drivers/driver_common/driver_common.c

@@ -174,11 +174,12 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 
 
 		if (_starpu_worker_can_block(memnode))
 		if (_starpu_worker_can_block(memnode))
 			_STARPU_PTHREAD_COND_WAIT(&args->sched_cond, &args->sched_mutex);
 			_STARPU_PTHREAD_COND_WAIT(&args->sched_cond, &args->sched_mutex);
-#ifdef STARPU_SIMGRID
 		else
 		else
 		{
 		{
 			if (_starpu_machine_is_running())
 			if (_starpu_machine_is_running())
 			{
 			{
+				STARPU_UYIELD();
+#ifdef STARPU_SIMGRID
 				static int warned;
 				static int warned;
 				if (!warned)
 				if (!warned)
 				{
 				{
@@ -186,9 +187,9 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 					_STARPU_DISP("Has to make simgrid spin for progression hooks\n");
 					_STARPU_DISP("Has to make simgrid spin for progression hooks\n");
 				}
 				}
 				MSG_process_sleep(0.000010);
 				MSG_process_sleep(0.000010);
+#endif
 			}
 			}
 		}
 		}
-#endif
 
 
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);
 
 

+ 12 - 4
src/drivers/opencl/driver_opencl.c

@@ -61,7 +61,7 @@ _starpu_opencl_discover_devices(struct _starpu_machine_config *config)
 
 
 static void _starpu_opencl_limit_gpu_mem_if_needed(unsigned devid)
 static void _starpu_opencl_limit_gpu_mem_if_needed(unsigned devid)
 {
 {
-	int limit;
+	ssize_t limit;
 	size_t STARPU_ATTRIBUTE_UNUSED totalGlobalMem = 0;
 	size_t STARPU_ATTRIBUTE_UNUSED totalGlobalMem = 0;
 	size_t STARPU_ATTRIBUTE_UNUSED to_waste = 0;
 	size_t STARPU_ATTRIBUTE_UNUSED to_waste = 0;
 	char name[30];
 	char name[30];
@@ -90,9 +90,9 @@ static void _starpu_opencl_limit_gpu_mem_if_needed(unsigned devid)
 	to_waste = totalGlobalMem - global_mem[devid];
 	to_waste = totalGlobalMem - global_mem[devid];
 #endif
 #endif
 
 
-	_STARPU_DEBUG("OpenCL device %d: Wasting %ld MB / Limit %d MB / Total %ld MB / Remains %ld MB\n",
-                      devid, (size_t)to_waste/(1024*1024), limit, (size_t)totalGlobalMem/(1024*1024),
-                      (size_t)(totalGlobalMem - to_waste)/(1024*1024));
+	_STARPU_DEBUG("OpenCL device %d: Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n",
+			devid, (long)to_waste/(1024*1024), (long) limit, (long)totalGlobalMem/(1024*1024),
+			(long)(totalGlobalMem - to_waste)/(1024*1024));
 
 
 }
 }
 
 
@@ -701,11 +701,19 @@ int _starpu_opencl_driver_deinit(struct starpu_driver *d)
 	unsigned memnode = args->memory_node;
 	unsigned memnode = args->memory_node;
 
 
 	_starpu_handle_all_pending_node_data_requests(memnode);
 	_starpu_handle_all_pending_node_data_requests(memnode);
+
+	/* In case there remains some memory that was automatically
+	 * allocated by StarPU, we release it now. Note that data
+	 * coherency is not maintained anymore at that point ! */
+	_starpu_free_all_automatically_allocated_buffers(memnode);
+
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
 	unsigned devid   = args->devid;
 	unsigned devid   = args->devid;
         _starpu_opencl_deinit_context(devid);
         _starpu_opencl_deinit_context(devid);
 #endif
 #endif
 
 
+	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_OPENCL_KEY);
+
 	return 0;
 	return 0;
 }
 }
 
 

+ 25 - 15
src/profiling/profiling.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -68,8 +68,10 @@ int _starpu_profiling =
 
 
 int starpu_profiling_status_set(int status)
 int starpu_profiling_status_set(int status)
 {
 {
+	ANNOTATE_HAPPENS_AFTER(&_starpu_profiling);
 	int prev_value = _starpu_profiling;
 	int prev_value = _starpu_profiling;
 	_starpu_profiling = status;
 	_starpu_profiling = status;
+	ANNOTATE_HAPPENS_BEFORE(&_starpu_profiling);
 
 
 	_STARPU_TRACE_SET_PROFILING(status);
 	_STARPU_TRACE_SET_PROFILING(status);
 
 
@@ -94,12 +96,6 @@ int starpu_profiling_status_set(int status)
 	return prev_value;
 	return prev_value;
 }
 }
 
 
-#undef starpu_profiling_status_get
-int starpu_profiling_status_get(void)
-{
-	return _starpu_profiling;
-}
-
 void _starpu_profiling_init(void)
 void _starpu_profiling_init(void)
 {
 {
 	int worker;
 	int worker;
@@ -110,7 +106,11 @@ void _starpu_profiling_init(void)
 		_starpu_worker_reset_profiling_info(worker);
 		_starpu_worker_reset_profiling_info(worker);
 	}
 	}
 	if ((env = getenv("STARPU_PROFILING")) && atoi(env))
 	if ((env = getenv("STARPU_PROFILING")) && atoi(env))
+	{
+		ANNOTATE_HAPPENS_AFTER(&_starpu_profiling);
 		_starpu_profiling = STARPU_PROFILING_ENABLE;
 		_starpu_profiling = STARPU_PROFILING_ENABLE;
+		ANNOTATE_HAPPENS_BEFORE(&_starpu_profiling);
+	}
 }
 }
 
 
 void _starpu_profiling_terminate(void)
 void _starpu_profiling_terminate(void)
@@ -127,7 +127,7 @@ struct starpu_task_profiling_info *_starpu_allocate_profiling_info_if_needed(str
 	struct starpu_task_profiling_info *info = NULL;
 	struct starpu_task_profiling_info *info = NULL;
 
 
 	/* If we are benchmarking, we need room for the power consumption */
 	/* If we are benchmarking, we need room for the power consumption */
-	if (_starpu_profiling || (task->cl && task->cl->power_model && (task->cl->power_model->benchmarking || _starpu_get_calibrate_flag())))
+	if (starpu_profiling_status_get() || (task->cl && task->cl->power_model && (task->cl->power_model->benchmarking || _starpu_get_calibrate_flag())))
 	{
 	{
 		info = (struct starpu_task_profiling_info *) calloc(1, sizeof(struct starpu_task_profiling_info));
 		info = (struct starpu_task_profiling_info *) calloc(1, sizeof(struct starpu_task_profiling_info));
 		STARPU_ASSERT(info);
 		STARPU_ASSERT(info);
@@ -191,7 +191,7 @@ void _starpu_worker_reset_profiling_info(int workerid)
 
 
 void _starpu_worker_restart_sleeping(int workerid)
 void _starpu_worker_restart_sleeping(int workerid)
 {
 {
-	if (_starpu_profiling)
+	if (starpu_profiling_status_get())
 	{
 	{
 		struct timespec sleep_start_time;
 		struct timespec sleep_start_time;
 		_starpu_clock_gettime(&sleep_start_time);
 		_starpu_clock_gettime(&sleep_start_time);
@@ -205,7 +205,7 @@ void _starpu_worker_restart_sleeping(int workerid)
 
 
 void _starpu_worker_stop_sleeping(int workerid)
 void _starpu_worker_stop_sleeping(int workerid)
 {
 {
-	if (_starpu_profiling)
+	if (starpu_profiling_status_get())
 	{
 	{
 		struct timespec *sleeping_start, sleep_end_time;
 		struct timespec *sleeping_start, sleep_end_time;
 
 
@@ -240,7 +240,7 @@ void _starpu_worker_stop_sleeping(int workerid)
 
 
 void _starpu_worker_register_executing_start_date(int workerid, struct timespec *executing_start)
 void _starpu_worker_register_executing_start_date(int workerid, struct timespec *executing_start)
 {
 {
-	if (_starpu_profiling)
+	if (starpu_profiling_status_get())
 	{
 	{
 		_STARPU_PTHREAD_MUTEX_LOCK(&worker_info_mutex[workerid]);
 		_STARPU_PTHREAD_MUTEX_LOCK(&worker_info_mutex[workerid]);
 		worker_registered_executing_start[workerid] = 1;
 		worker_registered_executing_start[workerid] = 1;
@@ -252,7 +252,7 @@ void _starpu_worker_register_executing_start_date(int workerid, struct timespec
 
 
 void _starpu_worker_update_profiling_info_executing(int workerid, struct timespec *executing_time, int executed_tasks, uint64_t used_cycles, uint64_t stall_cycles, double power_consumed)
 void _starpu_worker_update_profiling_info_executing(int workerid, struct timespec *executing_time, int executed_tasks, uint64_t used_cycles, uint64_t stall_cycles, double power_consumed)
 {
 {
-	if (_starpu_profiling)
+	if (starpu_profiling_status_get())
 	{
 	{
 		_STARPU_PTHREAD_MUTEX_LOCK(&worker_info_mutex[workerid]);
 		_STARPU_PTHREAD_MUTEX_LOCK(&worker_info_mutex[workerid]);
 
 
@@ -272,7 +272,7 @@ void _starpu_worker_update_profiling_info_executing(int workerid, struct timespe
 
 
 int starpu_worker_get_profiling_info(int workerid, struct starpu_worker_profiling_info *info)
 int starpu_worker_get_profiling_info(int workerid, struct starpu_worker_profiling_info *info)
 {
 {
-	if (!_starpu_profiling)
+	if (!starpu_profiling_status_get())
 	{
 	{
 		/* Not thread safe, shouldn't be too much a problem */
 		/* Not thread safe, shouldn't be too much a problem */
 		info->executed_tasks = worker_info[workerid].executed_tasks;
 		info->executed_tasks = worker_info[workerid].executed_tasks;
@@ -319,7 +319,7 @@ int starpu_worker_get_profiling_info(int workerid, struct starpu_worker_profilin
 /* When did the task reach the scheduler  ? */
 /* When did the task reach the scheduler  ? */
 void _starpu_profiling_set_task_push_start_time(struct starpu_task *task)
 void _starpu_profiling_set_task_push_start_time(struct starpu_task *task)
 {
 {
-	if (!_starpu_profiling)
+	if (!starpu_profiling_status_get())
 		return;
 		return;
 
 
 	struct starpu_task_profiling_info *profiling_info;
 	struct starpu_task_profiling_info *profiling_info;
@@ -331,7 +331,7 @@ void _starpu_profiling_set_task_push_start_time(struct starpu_task *task)
 
 
 void _starpu_profiling_set_task_push_end_time(struct starpu_task *task)
 void _starpu_profiling_set_task_push_end_time(struct starpu_task *task)
 {
 {
-	if (!_starpu_profiling)
+	if (!starpu_profiling_status_get())
 		return;
 		return;
 
 
 	struct starpu_task_profiling_info *profiling_info;
 	struct starpu_task_profiling_info *profiling_info;
@@ -429,3 +429,13 @@ void _starpu_bus_update_profiling_info(int src_node, int dst_node, size_t size)
 	bus_profiling_info[src_node][dst_node].transfer_count++;
 	bus_profiling_info[src_node][dst_node].transfer_count++;
 //	fprintf(stderr, "PROFILE %d -> %d : %d (cnt %d)\n", src_node, dst_node, size, bus_profiling_info[src_node][dst_node].transfer_count);
 //	fprintf(stderr, "PROFILE %d -> %d : %d (cnt %d)\n", src_node, dst_node, size, bus_profiling_info[src_node][dst_node].transfer_count);
 }
 }
+
+#undef starpu_profiling_status_get
+int starpu_profiling_status_get(void)
+{
+	int ret;
+	ANNOTATE_HAPPENS_AFTER(&_starpu_profiling);
+	ret = _starpu_profiling;
+	ANNOTATE_HAPPENS_BEFORE(&_starpu_profiling);
+	return ret;
+}

+ 1 - 0
tests/Makefile.am

@@ -103,6 +103,7 @@ noinst_PROGRAMS =				\
 	main/deprecated_buffer			\
 	main/deprecated_buffer			\
 	main/driver_api/init_run_deinit         \
 	main/driver_api/init_run_deinit         \
 	main/driver_api/run_driver              \
 	main/driver_api/run_driver              \
+	main/deploop                            \
 	main/restart				\
 	main/restart				\
 	main/execute_on_a_specific_worker	\
 	main/execute_on_a_specific_worker	\
 	main/insert_task			\
 	main/insert_task			\

+ 1 - 1
tests/loader.c

@@ -120,7 +120,7 @@ static void test_cleaner(int sig)
 	fprintf(stderr, "[error] test %s has been blocked for %d seconds. Mark it as failed\n", test_name, timeout);
 	fprintf(stderr, "[error] test %s has been blocked for %d seconds. Mark it as failed\n", test_name, timeout);
 	child_gid = getpgid(child_pid);
 	child_gid = getpgid(child_pid);
 	launch_gdb(test_name);
 	launch_gdb(test_name);
-	kill(-child_gid, SIGKILL);
+	kill(-child_gid, SIGQUIT);
 	exit(EXIT_FAILURE);
 	exit(EXIT_FAILURE);
 }
 }
 
 

+ 92 - 0
tests/main/deploop.c

@@ -0,0 +1,92 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * Create task A and B such that
+ * - B depends on A by tag dependency.
+ * - A would depend on B by data dependency, but we disable that.
+ */
+
+#include <pthread.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <starpu.h>
+#include "../helper.h"
+
+static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attribute__ ((unused)))
+{
+	FPRINTF(stderr,"executing task %p\n", starpu_task_get_current());
+}
+
+static struct starpu_codelet dummy_codelet = 
+{
+	.cpu_funcs = {dummy_func, NULL},
+	.cuda_funcs = {dummy_func, NULL},
+	.opencl_funcs = {dummy_func, NULL},
+	.model = NULL,
+	.nbuffers = 1,
+	.modes = { STARPU_RW }
+};
+
+int main(int argc, char **argv)
+{
+	int ret;
+	starpu_data_handle_t handle;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_void_data_register(&handle);
+
+	struct starpu_task *taskA, *taskB;
+
+	/* Make B depend on A */
+	starpu_tag_declare_deps(1, 1, (starpu_tag_t) 0);
+
+	taskA = starpu_task_create();
+	taskA->cl = &dummy_codelet;
+	taskA->tag_id = 0;
+	taskA->use_tag = 1;
+	taskA->handles[0] = handle;
+	taskA->sequential_consistency = 0;
+	FPRINTF(stderr,"A is %p\n", taskA);
+
+	taskB = starpu_task_create();
+	taskB->cl = &dummy_codelet;
+	taskB->tag_id = 1;
+	taskB->use_tag = 1;
+	taskB->handles[0] = handle;
+	FPRINTF(stderr,"B is %p\n", taskB);
+
+	ret = starpu_task_submit(taskB);
+	if (ret == -ENODEV)
+		return STARPU_TEST_SKIPPED;
+	ret = starpu_task_submit(taskA);
+	if (ret == -ENODEV)
+		return STARPU_TEST_SKIPPED;
+
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+
+	starpu_data_unregister(handle);
+
+	starpu_shutdown();
+
+	return EXIT_SUCCESS;
+}

+ 1 - 1
tests/microbenchs/tasks_overhead.c

@@ -155,7 +155,7 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_tag_wait");
 	gettimeofday(&end_exec, NULL);
 
-	for (i = 1; i < ntasks; i++)
+	for (i = 0; i < ntasks; i++)
 		starpu_task_clean(&tasks[i]);
 
 	for (buffer = 0; buffer < nbuffers; buffer++)

+ 19 - 30
tools/valgrind/starpu.suppr

@@ -1,18 +1,4 @@
 {
-   config.running is not racy from starpu_shutdown
-   Helgrind:Race
-   fun:starpu_shutdown
-   ...
-}
-
-{
-   config.running is not racy from _starpu_machine_is_running
-   Helgrind:Race
-   fun:_starpu_machine_is_running
-   ...
-}
-
-{
    don't care about cache hit stats
    Helgrind:Race
    fun:_starpu_msi_cache_hit
@@ -48,37 +34,31 @@
 }
 
 {
-   We do not care about the race on the entry->mean variable, we only want a good-enough estimation.
-   Helgrind:Race
-   fun: _starpu_history_based_job_expected_perf
-   ...
-}
-
-{
    We do not care about races on profiling statistics
    Helgrind:Race
-   fun: starpu_profiling_status_get
+   fun:_starpu_worker_get_status
+   fun:_starpu_worker_reset_profiling_info_with_lock
    ...
 }
 
 {
    This is racy, but since we'll always put the same values, this is not a problem.
    Helgrind:Race
-   fun: _starpu_codelet_check_deprecated_fields
+   fun:_starpu_codelet_check_deprecated_fields
    ...
 }
 
 {
    This is racy, but we don't care, it's only a statistic
    Helgrind:Race
-   fun: starpu_task_nsubmitted
+   fun:starpu_task_nsubmitted
    ...
 }
 
 {
    This is racy, but we don't care, it's only a statistic
    Helgrind:Race
-   fun: starpu_task_nready
+   fun:starpu_task_nready
    ...
 }
 
@@ -92,18 +72,27 @@
 }
 
 {
-   This is racy, but we don't care, if the function was called a bit earlier we would have had a different value
+   This is racy, but keep it away for now, otherwise it clutters the buildbot log
    Helgrind:Race
-   fun: _starpu_fifo_empty
-   fun: pop_task_eager_policy
+   fun:_starpu_fifo_empty
+   fun:pop_task_eager_policy
    ...
 }
 
 {
    This is the counterpart of the suppression above
    Helgrind:Race
-   fun: _starpu_fifo_push_task
-   fun: push_task_eager_policy
+   fun:_starpu_fifo_push_task
+   fun:push_task_eager_policy
    ...
 }
 
+
+{
+   This is the counterpart of the suppression above
+   Helgrind:Race
+   fun:_starpu_fifo_push_sorted_task
+   fun:_starpu_fifo_push_task
+   fun:push_task_eager_policy
+   ...
+}