Marc Sergent 13 лет назад
Родитель
Commit
a007c4d009
46 измененных файлов с 617 добавлено и 259 удалено
  1. 3 0
      ChangeLog
  2. 8 0
      configure.ac
  3. 7 1
      doc/chapters/basic-api.texi
  4. 5 1
      doc/chapters/configuration.texi
  5. 6 5
      examples/tag_example/tag_restartable.c
  6. 9 2
      include/starpu_profiling.h
  7. 5 0
      include/starpu_task.h
  8. 9 1
      include/starpu_util.h
  9. 6 6
      mpi/examples/Makefile.am
  10. 5 3
      mpi/examples/stencil/stencil5.c
  11. 18 18
      mpi/src/starpu_mpi.c
  12. 25 35
      mpi/src/starpu_mpi_collective.c
  13. 3 0
      mpi/tests/mpi_probe.c
  14. 8 8
      mpi/tests/user_defined_datatype.c
  15. 108 30
      sched_ctx_hypervisor/src/hypervisor_policies/lp2_policy.c
  16. 3 0
      src/common/starpu_spinlock.c
  17. 49 1
      src/common/utils.h
  18. 4 1
      src/core/dependencies/implicit_data_deps.c
  19. 12 5
      src/core/dependencies/tags.c
  20. 12 0
      src/core/perfmodel/perfmodel_history.c
  21. 1 1
      src/core/sched_ctx.c
  22. 8 0
      src/core/task.c
  23. 7 1
      src/core/workers.c
  24. 11 0
      src/datawizard/data_request.c
  25. 3 1
      src/datawizard/datawizard.c
  26. 13 11
      src/datawizard/filters.c
  27. 1 1
      src/datawizard/interfaces/block_interface.c
  28. 51 10
      src/datawizard/interfaces/data_interface.c
  29. 1 1
      src/datawizard/interfaces/matrix_interface.c
  30. 1 1
      src/datawizard/interfaces/vector_interface.c
  31. 10 8
      src/datawizard/malloc.c
  32. 31 43
      src/datawizard/memalloc.c
  33. 1 1
      src/datawizard/memalloc.h
  34. 1 0
      src/datawizard/memory_nodes.c
  35. 10 4
      src/datawizard/user_interactions.c
  36. 4 2
      src/debug/traces/starpu_fxt.c
  37. 1 1
      src/drivers/cpu/driver_cpu.c
  38. 3 3
      src/drivers/cuda/driver_cuda.c
  39. 3 2
      src/drivers/driver_common/driver_common.c
  40. 12 4
      src/drivers/opencl/driver_opencl.c
  41. 25 15
      src/profiling/profiling.c
  42. 1 0
      tests/Makefile.am
  43. 1 1
      tests/loader.c
  44. 92 0
      tests/main/deploop.c
  45. 1 1
      tests/microbenchs/tasks_overhead.c
  46. 19 30
      tools/valgrind/starpu.suppr

+ 3 - 0
ChangeLog

@@ -116,6 +116,9 @@ Small features:
   * File STARPU-REVISION --- containing the SVN revision number from which
     StarPU was compiled --- is installed in the share/doc/starpu directory
   * starpu_perfmodel_plot can now directly draw GFlops curves.
+  * New configure option --enable-mpi-progression-hook to enable the
+    activity polling method for StarPU-MPI.
+  * Permit to disable sequential consistency for a given task.
 
 Changes:
   * Fix the block filter functions.

+ 8 - 0
configure.ac

@@ -221,6 +221,7 @@ AM_CONDITIONAL([STARPU_LONG_CHECK], [test "x$enable_long_check" = "xyes"])
 AC_CHECK_HEADERS([malloc.h], [AC_DEFINE([STARPU_HAVE_MALLOC_H], [1], [Define to 1 if you have the <malloc.h> header file.])])
 
 AC_CHECK_HEADERS([valgrind/valgrind.h], [AC_DEFINE([STARPU_HAVE_VALGRIND_H], [1], [Define to 1 if you have the <valgrind/valgrind.h> header file.])])
+AC_CHECK_HEADERS([valgrind/helgrind.h], [AC_DEFINE([STARPU_HAVE_HELGRIND_H], [1], [Define to 1 if you have the <valgrind/helgrind.h> header file.])])
 
 # This defines HAVE_SYNC_VAL_COMPARE_AND_SWAP
 STARPU_CHECK_SYNC_VAL_COMPARE_AND_SWAP
@@ -1234,6 +1235,13 @@ if test x$use_mpi = xyes; then
 	AC_DEFINE(STARPU_USE_MPI,[],[whether the StarPU MPI library is available])
 fi
 
+AC_ARG_ENABLE(mpi-progression-hook, [AS_HELP_STRING([--enable-mpi-progression-hook],
+				   [Enable StarPU MPI activity polling method])],
+				   enable_mpi_progression_hook=$enableval, enable_mpi_progression_hook=no)
+if  test x$enable_mpi_progression_hook = xyes; then
+	AC_DEFINE(STARPU_MPI_ACTIVITY, [1], [enable StarPU MPI activity polling method])
+fi
+
 ###############################################################################
 #                                                                             #
 #                               StarPU-Top                                    #

+ 7 - 1
doc/chapters/basic-api.texi

@@ -1805,9 +1805,15 @@ contained in the @code{tag_id} field. Tag allow the application to synchronize
 with the task and to express task dependencies easily.
 
 @item @code{starpu_tag_t tag_id}
-This fields contains the tag associated to the task if the @code{use_tag} field
+This field contains the tag associated to the task if the @code{use_tag} field
 was set, it is ignored otherwise.
 
+@item @code{unsigned sequential_consistency}
+If this flag is set (which is the default), sequential consistency is enforced
+for the data parameters of this task for which sequential consistency is
+enabled. Clearing this flag permits to disable sequential consistency for this
+task, even if data have it enabled.
+
 @item @code{unsigned synchronous}
 If this flag is set, the @code{starpu_task_submit} function is blocking and
 returns only when the task has been executed (or if no worker is able to

+ 5 - 1
doc/chapters/configuration.texi

@@ -209,10 +209,14 @@ enabled when the GCC compiler provides a plug-in support.
 @end defvr
 
 @defvr {Configure option} --with-mpicc=@var{path}
-Use the @command{mpicc} compiler at @var{path}, for starpumpi
+Use the @command{mpicc} compiler at @var{path}, for StarPU-MPI.
 (@pxref{StarPU MPI support}).
 @end defvr
 
+@defvr {Configure option} --enable-mpi-progression-hook
+Enable the activity polling method for StarPU-MPI.
+@end defvr
+
 @node Advanced configuration
 @subsection Advanced configuration
 

+ 6 - 5
examples/tag_example/tag_restartable.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -106,7 +106,7 @@ static int start_task_grid(unsigned iter)
 	return 0;
 }
 
-void cpu_codelet(void *descr[], void *_args __attribute__((unused)))
+void cpu_codelet(void *descr[] __attribute__((unused)), void *_args __attribute__((unused)))
 {
 /*	int i = (uintptr_t) _args;
 	printf("doing %x\n", i);
@@ -117,7 +117,7 @@ void cpu_codelet(void *descr[], void *_args __attribute__((unused)))
 
 int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 {
-	unsigned i;
+	unsigned i, j;
 	int ret;
 
 	ret = starpu_init(NULL);
@@ -161,8 +161,9 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 	FPRINTF(stderr, "TEST DONE ...\n");
 
 enodev:
-	for (i = 0; i < Nrolls; i++)
-	{
+	for (i = 0; i < Nrolls; i++) {
+		for (j = 0; j < ni; j++)
+			starpu_task_destroy(tasks[i][j]);
 		free(tasks[i]);
 	}
 

+ 9 - 2
include/starpu_profiling.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -101,8 +101,15 @@ int starpu_profiling_status_set(int status);
  * error. */
 int starpu_profiling_status_get(void);
 #ifdef BUILDING_STARPU
+#include <common/utils.h>
 extern int _starpu_profiling;
-#define starpu_profiling_status_get() _starpu_profiling
+#define starpu_profiling_status_get() ({ \
+	int __ret; \
+	ANNOTATE_HAPPENS_AFTER(&_starpu_profiling); \
+	__ret = _starpu_profiling; \
+	ANNOTATE_HAPPENS_BEFORE(&_starpu_profiling); \
+	__ret; \
+})
 #endif
 
 /* Get the profiling info associated to a worker, and reset the profiling

+ 5 - 0
include/starpu_task.h

@@ -129,9 +129,14 @@ struct starpu_task
 	void (*callback_func)(void *);
 	void *callback_arg;
 
+	/* Whether tag_id should be considered */
 	unsigned use_tag;
+	/* Tag associated with this task */
 	starpu_tag_t tag_id;
 
+	/* Whether we should enforce sequential consistency for this task */
+	unsigned sequential_consistency;
+
 	/* options for the task execution */
 	unsigned synchronous; /* if set, a call to push is blocking */
 	int priority; /* STARPU_MAX_PRIO = most important; STARPU_MIN_PRIO = least important */

+ 9 - 1
include/starpu_util.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -186,6 +186,14 @@ STARPU_ATOMIC_SOMETHING(or, old | value)
 #define STARPU_WMB() STARPU_SYNCHRONIZE()
 #endif
 
+/* This is needed in some places to make valgrind yield to another thread to be
+ * able to progress.  */
+#if defined(__i386__) || defined(__x86_64__)
+#define STARPU_UYIELD() __asm__ __volatile("rep; nop")
+#else
+#define STARPU_UYIELD() ((void)0)
+#endif
+
 #ifdef __cplusplus
 }
 #endif

+ 6 - 6
mpi/examples/Makefile.am

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2009-2013  Université de Bordeaux 1
 # Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -88,7 +88,7 @@ examplebin_PROGRAMS +=				\
 	stencil/stencil5
 
 stencil_stencil5_LDADD =		\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la -lm
 
 starpu_mpi_EXAMPLES	+=	\
 	stencil/stencil5
@@ -106,7 +106,7 @@ examplebin_PROGRAMS += 			\
 mpi_lu_plu_example_float_LDADD =	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	$(STARPU_LIBNUMA_LDFLAGS)				\
-	$(STARPU_BLAS_LDFLAGS)
+	$(STARPU_BLAS_LDFLAGS) -lm
 
 mpi_lu_plu_example_float_SOURCES =	\
 	mpi_lu/plu_example_float.c	\
@@ -118,7 +118,7 @@ mpi_lu_plu_example_float_SOURCES =	\
 mpi_lu_plu_example_double_LDADD =	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	$(STARPU_LIBNUMA_LDFLAGS)				\
-	$(STARPU_BLAS_LDFLAGS)
+	$(STARPU_BLAS_LDFLAGS) -lm
 
 mpi_lu_plu_example_double_SOURCES =	\
 	mpi_lu/plu_example_double.c	\
@@ -148,7 +148,7 @@ matrix_decomposition_mpi_cholesky_SOURCES	=		\
 
 matrix_decomposition_mpi_cholesky_LDADD =			\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
-	$(STARPU_BLAS_LDFLAGS)
+	$(STARPU_BLAS_LDFLAGS) -lm
 
 matrix_decomposition_mpi_cholesky_distributed_SOURCES =	\
 	matrix_decomposition/mpi_cholesky_distributed.c	\
@@ -161,7 +161,7 @@ matrix_decomposition_mpi_cholesky_distributed_SOURCES =	\
 
 matrix_decomposition_mpi_cholesky_distributed_LDADD =	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
-	$(STARPU_BLAS_LDFLAGS)
+	$(STARPU_BLAS_LDFLAGS) -lm
 
 starpu_mpi_EXAMPLES +=				\
 	matrix_decomposition/mpi_cholesky			\

+ 5 - 3
mpi/examples/stencil/stencil5.c

@@ -37,12 +37,14 @@ struct starpu_codelet stencil5_cl =
 };
 
 #ifdef STARPU_QUICK_CHECK
-#  define NITER_DEF	10
+#  define NITER_DEF	5
+#  define X         	3
+#  define Y         	3
 #else
 #  define NITER_DEF	500
+#  define X         	20
+#  define Y         	20
 #endif
-#define X         20
-#define Y         20
 
 int display = 0;
 int niter = NITER_DEF;

+ 18 - 18
mpi/src/starpu_mpi.c

@@ -23,11 +23,7 @@
 #include <starpu_profiling.h>
 #include <starpu_mpi_stats.h>
 #include <starpu_mpi_insert_task.h>
-
-#ifdef STARPU_DEVEL
-#  warning TODO find a better way to select the polling method (perhaps during the configuration)
-#endif
-//#define USE_STARPU_ACTIVITY	1
+#include <common/config.h>
 
 static void _starpu_mpi_submit_new_mpi_request(void *arg);
 static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req);
@@ -643,6 +639,7 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 		MPI_Status status;
 		memset(&status, 0, sizeof(MPI_Status));
 		req->ret = MPI_Recv(req->ptr, req->count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &status);
+		STARPU_ASSERT(req->ret == MPI_SUCCESS);
 	}
 
 	if (req->request_type == RECV_REQ || req->request_type == SEND_REQ || req->request_type == PROBE_REQ)
@@ -699,7 +696,7 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 	_STARPU_MPI_LOG_OUT();
 }
 
-#ifdef USE_STARPU_ACTIVITY
+#ifdef STARPU_MPI_ACTIVITY
 static unsigned _starpu_mpi_progression_hook_func(void *arg __attribute__((unused)))
 {
 	unsigned may_block = 1;
@@ -714,7 +711,7 @@ static unsigned _starpu_mpi_progression_hook_func(void *arg __attribute__((unuse
 
 	return may_block;
 }
-#endif
+#endif /* STARPU_MPI_ACTIVITY */
 
 static void _starpu_mpi_test_detached_requests(void)
 {
@@ -885,9 +882,9 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		/* shall we block ? */
 		unsigned block = _starpu_mpi_req_list_empty(new_requests);
 
-#ifndef USE_STARPU_ACTIVITY
+#ifndef STARPU_MPI_ACTIVITY
 		block = block && _starpu_mpi_req_list_empty(detached_requests);
-#endif
+#endif /* STARPU_MPI_ACTIVITY */
 
 		if (block)
 		{
@@ -946,20 +943,22 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 /*                                                      */
 /********************************************************/
 
-#ifdef USE_STARPU_ACTIVITY
+#ifdef STARPU_MPI_ACTIVITY
 static int hookid = - 1;
-#endif
+#endif /* STARPU_MPI_ACTIVITY */
 
 static void _starpu_mpi_add_sync_point_in_fxt(void)
 {
 #ifdef STARPU_USE_FXT
 	int rank;
 	int worldsize;
+	int ret;
+
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
 
-	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
-	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+	ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
 
 	/* We generate a "unique" key so that we can make sure that different
 	 * FxT traces come from the same MPI run. */
@@ -973,7 +972,8 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 		random_number = rand();
 	}
 
-	MPI_Bcast(&random_number, 1, MPI_INT, 0, MPI_COMM_WORLD);
+	ret = MPI_Bcast(&random_number, 1, MPI_INT, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
 
 	TRACE_MPI_BARRIER(rank, worldsize, random_number);
 
@@ -1006,10 +1006,10 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi)
 		_STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
-#ifdef USE_STARPU_ACTIVITY
+#ifdef STARPU_MPI_ACTIVITY
 	hookid = starpu_progression_hook_register(progression_hook_func, NULL);
 	STARPU_ASSERT(hookid >= 0);
-#endif
+#endif /* STARPU_MPI_ACTIVITY */
 
 	_starpu_mpi_add_sync_point_in_fxt();
 	_starpu_mpi_comm_amounts_init(MPI_COMM_WORLD);
@@ -1058,9 +1058,9 @@ int starpu_mpi_shutdown(void)
 
 	pthread_join(progress_thread, &value);
 
-#ifdef USE_STARPU_ACTIVITY
+#ifdef STARPU_MPI_ACTIVITY
 	starpu_progression_hook_deregister(hookid);
-#endif
+#endif /* STARPU_MPI_ACTIVITY */
 
 	TRACE_MPI_STOP(rank, world_size);
 

+ 25 - 35
mpi/src/starpu_mpi_collective.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -42,26 +42,23 @@ int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, i
 {
 	int rank;
 	int x;
-	struct _callback_arg *callback_arg;
-	void (*callback_func)(void *);
+	struct _callback_arg *callback_arg = NULL;
+	void (*callback_func)(void *) = NULL;
+	void (*callback)(void *);
 
 	MPI_Comm_rank(comm, &rank);
 
-	callback_func = _callback_collective;
-	callback_arg = malloc(sizeof(struct _callback_arg));
-	callback_arg->count = 0;
-	callback_arg->nb = 0;
-	callback_arg->callback = (rank == root) ? scallback : rcallback;
-	callback_arg->arg = (rank == root) ? sarg : rarg;
-	if (callback_arg->callback == NULL)
+	callback = (rank == root) ? scallback : rcallback;
+	if (callback)
 	{
-		free(callback_arg);
-		callback_arg = NULL;
-		callback_func = NULL;
-	}
+		callback_func = _callback_collective;
+		callback_arg = malloc(sizeof(struct _callback_arg));
+		callback_arg->count = 0;
+		callback_arg->nb = 0;
+		callback_arg->callback = (rank == root) ? scallback : rcallback;
+		callback_arg->arg = (rank == root) ? sarg : rarg;
+		if (callback_arg->callback == NULL)
 
-	if (callback_arg)
-	{
 		for(x = 0; x < count ; x++)
 		{
 			if (data_handles[x])
@@ -107,29 +104,23 @@ int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, in
 {
 	int rank;
 	int x;
-	struct _callback_arg *callback_arg;
-	void (*callback_func)(void *);
+	struct _callback_arg *callback_arg = NULL;
+	void (*callback_func)(void *) = NULL;
+	void (*callback)(void *);
 
 	MPI_Comm_rank(comm, &rank);
 
-#ifdef STARPU_DEVEL
-#warning TODO: callback_arg needs to be free-ed
-#endif
-	callback_func = _callback_collective;
-	callback_arg = malloc(sizeof(struct _callback_arg));
-	callback_arg->count = 0;
-	callback_arg->nb = 0;
-	callback_arg->callback = (rank == root) ? scallback : rcallback;
-	callback_arg->arg = (rank == root) ? sarg : rarg;
-	if (callback_arg->callback == NULL)
+	callback = (rank == root) ? scallback : rcallback;
+	if (callback)
 	{
-		free(callback_arg);
-		callback_arg = NULL;
-		callback_func = NULL;
-	}
+		callback_func = _callback_collective;
+
+		callback_arg = malloc(sizeof(struct _callback_arg));
+		callback_arg->count = 0;
+		callback_arg->nb = 0;
+		callback_arg->callback = callback;
+		callback_arg->arg = (rank == root) ? sarg : rarg;
 
-	if (callback_arg)
-	{
 		for(x = 0; x < count ; x++)
 		{
 			if (data_handles[x])
@@ -170,4 +161,3 @@ int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, in
 	}
 	return 0;
 }
-

+ 3 - 0
mpi/tests/mpi_probe.c

@@ -45,6 +45,8 @@ void callback(void *arg __attribute__((unused)))
 
 int main(int argc, char **argv)
 {
+	return 77;
+	/*
 	int ret, rank, size;
 
 	MPI_Init(NULL, NULL);
@@ -99,4 +101,5 @@ int main(int argc, char **argv)
 	MPI_Finalize();
 
 	return 0;
+	*/
 }

+ 8 - 8
mpi/tests/user_defined_datatype.c

@@ -24,36 +24,36 @@
 #  define ELEMENTS 1000
 #endif
 
-typedef void (*test_func)(starpu_data_handle_t *, int, int);
+typedef void (*test_func)(starpu_data_handle_t *, int, int, int);
 
-void test_handle_irecv_isend_detached(starpu_data_handle_t *handles, int nb_handles, int rank)
+void test_handle_irecv_isend_detached(starpu_data_handle_t *handles, int nb_handles, int rank, int tag)
 {
 	int i;
 
 	for(i=0 ; i<nb_handles ; i++)
 	{
 		starpu_data_set_rank(handles[i], 1);
-		starpu_data_set_tag(handles[i], i+100);
+		starpu_data_set_tag(handles[i], i+tag);
 	}
 
 	for(i=0 ; i<nb_handles ; i++)
 		starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, handles[i], 0, NULL, NULL);
 }
 
-void test_handle_recv_send(starpu_data_handle_t *handles, int nb_handles, int rank)
+void test_handle_recv_send(starpu_data_handle_t *handles, int nb_handles, int rank, int tag)
 {
 	int i;
 
 	if (rank == 1)
 	{
 		for(i=0 ; i<nb_handles ; i++)
-			starpu_mpi_send(handles[i], 0, i+100, MPI_COMM_WORLD);
+			starpu_mpi_send(handles[i], 0, i+tag, MPI_COMM_WORLD);
 	}
 	else if (rank == 0)
 	{
 		MPI_Status statuses[nb_handles];
 		for(i=0 ; i<nb_handles ; i++)
-			starpu_mpi_recv(handles[i], 1, i+100, MPI_COMM_WORLD, &statuses[i]);
+			starpu_mpi_recv(handles[i], 1, i+tag, MPI_COMM_WORLD, &statuses[i]);
 	}
 }
 
@@ -126,8 +126,8 @@ int main(int argc, char **argv)
 				starpu_variable_data_register(&handle_vars[i], 0, (uintptr_t)&foo[i], sizeof(double));
 			}
 
-			f(handle_vars, ELEMENTS, rank);
-			f(handle_complex, ELEMENTS, rank);
+			f(handle_vars, ELEMENTS, rank, ELEMENTS);
+			f(handle_complex, ELEMENTS, rank, 4*ELEMENTS);
 
 			for(i=0 ; i<ELEMENTS ; i++)
 			{

+ 108 - 30
sched_ctx_hypervisor/src/hypervisor_policies/lp2_policy.c

@@ -21,8 +21,10 @@
 static struct bound_task_pool *task_pools = NULL;
 
 static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
-static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double tmax, double w_in_s[ns][nw], int *in_sched_ctxs, int *workers, unsigned interger);
-static unsigned _compute_task_distribution_over_ctxs(int ns, int nw, int nt, double w_in_s[ns][nw], double tasks[nw][nt], int *sched_ctxs, int *workers)
+static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double tmax, double w_in_s[ns][nw], int *in_sched_ctxs, int *workers, unsigned interger,
+			   struct bound_task_pool *tmp_task_pools, unsigned size_ctxs);
+static unsigned _compute_task_distribution_over_ctxs(int ns, int nw, int nt, double w_in_s[ns][nw], double tasks[nw][nt], 
+						     int *sched_ctxs, int *workers, struct bound_task_pool *tmp_task_pools, unsigned size_ctxs)
 {
 	double draft_tasks[nw][nt];
 	double draft_w_in_s[ns][nw];
@@ -45,7 +47,7 @@ static unsigned _compute_task_distribution_over_ctxs(int ns, int nw, int nt, dou
 	/* smallest possible tmax, difficult to obtain as we
 	   compute the nr of flops and not the tasks */
 	double possible_tmax = _lp_get_tmax(nw, workers);
-	double smallest_tmax = possible_tmax / 2;
+	double smallest_tmax = possible_tmax / 3;
 	double tmax = possible_tmax * ns;
 	double res = 1.0;
 	unsigned has_sol = 0;
@@ -53,6 +55,7 @@ static unsigned _compute_task_distribution_over_ctxs(int ns, int nw, int nt, dou
 	double old_tmax = 0.0;
 	unsigned found_sol = 0;
 
+//	printf("tmin = %lf tmax = %lf \n", tmin, tmax);
 	struct timeval start_time;
 	struct timeval end_time;
 	int nd = 0;
@@ -65,7 +68,7 @@ static unsigned _compute_task_distribution_over_ctxs(int ns, int nw, int nt, dou
 		/* find solution and save the values in draft tables
 		   only if there is a solution for the system we save them
 		   in the proper table */
-		res = _glp_resolve(ns, nw, nt, draft_tasks, tmax, draft_w_in_s, sched_ctxs, workers, 1);
+		res = _glp_resolve(ns, nw, nt, draft_tasks, tmax, draft_w_in_s, sched_ctxs, workers, 1, tmp_task_pools, size_ctxs);
 		if(res != 0.0)
 		{
 			for(w = 0; w < nw; w++)
@@ -129,7 +132,7 @@ static void _size_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nwor
 
 	double w_in_s[ns][nw];
 	double tasks[nw][nt];
-	unsigned found_sol = _compute_task_distribution_over_ctxs(ns, nw, nt, w_in_s, tasks, sched_ctxs, workers);
+	unsigned found_sol = _compute_task_distribution_over_ctxs(ns, nw, nt, w_in_s, tasks, sched_ctxs, workers, task_pools, 1);
 	pthread_mutex_unlock(&mutex);
 	/* if we did find at least one solution redistribute the resources */
 	if(found_sol)
@@ -194,7 +197,6 @@ static void lp2_handle_submitted_job(struct starpu_task *task, uint32_t footprin
 static void _remove_task_from_pool(struct starpu_task *task, uint32_t footprint)
 {
 	/* count the tasks of the same type */
-	pthread_mutex_lock(&mutex);
 	struct bound_task_pool *tp = NULL;
 
 	for (tp = task_pools; tp; tp = tp->next)
@@ -209,20 +211,36 @@ static void _remove_task_from_pool(struct starpu_task *task, uint32_t footprint)
 			tp->n--;
 		else
 		{
-			struct bound_task_pool *prev_tp = NULL;
-			for (prev_tp = task_pools; prev_tp; prev_tp = prev_tp->next)
+			if(tp == task_pools)
 			{
-				if (prev_tp->next == tp)
-					prev_tp->next = tp->next;
+				struct bound_task_pool *next_tp = NULL;
+				if(task_pools->next)
+					next_tp = task_pools->next;
+
+				free(tp);
+				tp = NULL;
+				
+				if(next_tp)
+					task_pools = next_tp;
+				
+			}
+			else
+			{
+				struct bound_task_pool *prev_tp = NULL;
+				for (prev_tp = task_pools; prev_tp; prev_tp = prev_tp->next)
+				{
+					if (prev_tp->next == tp)
+						prev_tp->next = tp->next;
+				}
+				
+				free(tp);
+				tp = NULL;
 			}
-
-			free(tp);
 		}
 	}
-	pthread_mutex_unlock(&mutex);
 }
 
-static void _get_tasks_times(int nw, int nt, double times[nw][nt], int *workers)
+static void _get_tasks_times(int nw, int nt, double times[nw][nt], int *workers, unsigned size_ctxs)
 {
         struct bound_task_pool *tp;
         int w, t;
@@ -230,14 +248,33 @@ static void _get_tasks_times(int nw, int nt, double times[nw][nt], int *workers)
         {
                 for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
                 {
-                        enum starpu_perf_archtype arch = workers == NULL ? starpu_worker_get_perf_archtype(w) :
-				starpu_worker_get_perf_archtype(workers[w]);
+			int worker = workers == NULL ? w : workers[w];
+                        enum starpu_perf_archtype arch = starpu_worker_get_perf_archtype(worker);
                         double length = starpu_history_based_expected_perf(tp->cl->model, arch, tp->footprint);
 
                         if (isnan(length))
                                 times[w][t] = NAN;
-                       else
+			else
+			{
                                 times[w][t] = length / 1000.;
+
+				double transfer_time = 0.0;
+				enum starpu_archtype arch = starpu_worker_get_type(worker);
+				if(arch == STARPU_CUDA_WORKER)
+				{
+					unsigned worker_in_ctx = starpu_sched_ctx_contains_worker(worker, tp->sched_ctx_id);
+					if(!worker_in_ctx && !size_ctxs)
+					{
+						double transfer_velocity = starpu_get_bandwidth_RAM_CUDA(worker);
+						transfer_time +=  (tp->footprint / transfer_velocity) / 1000. ;
+					}
+					double latency = starpu_get_latency_RAM_CUDA(worker);
+					transfer_time += latency/1000.;
+
+				}
+//				printf("%d/%d %s x %d time = %lf transfer_time = %lf\n", w, tp->sched_ctx_id, tp->cl->model->symbol, tp->n, times[w][t], transfer_time);
+				times[w][t] += transfer_time;
+			}
                 }
         }
 }
@@ -247,9 +284,10 @@ static void _get_tasks_times(int nw, int nt, double times[nw][nt], int *workers)
  */
 #ifdef STARPU_HAVE_GLPK_H
 #include <glpk.h>
-static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double tmax, double w_in_s[ns][nw], int *in_sched_ctxs, int *workers, unsigned integer)
+static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double tmax, double w_in_s[ns][nw], int *in_sched_ctxs, int *workers, unsigned integer,
+			   struct bound_task_pool *tmp_task_pools, unsigned size_ctxs)
 {
-	if(task_pools == NULL)
+	if(tmp_task_pools == NULL)
 		return 0.0;
 	struct bound_task_pool * tp;
 	int t, w, s;
@@ -270,7 +308,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 		int ia[ne], ja[ne];
 		double ar[ne];
 
-		_get_tasks_times(nw, nt, times, workers);
+		_get_tasks_times(nw, nt, times, workers, size_ctxs);
 
 		/* Variables: number of tasks i assigned to worker j, and tmax */
 		glp_add_cols(lp, nw*nt+ns*nw);
@@ -280,7 +318,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 				glp_set_obj_coef(lp, nw*nt+s*nw+w+1, 1.);
 
 		for (w = 0; w < nw; w++)
-			for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+			for (t = 0; t < nt; t++)
 			{
 				char name[32];
 				snprintf(name, sizeof(name), "w%dt%dn", w, t);
@@ -313,7 +351,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 		int curr_row_idx = 0;
 		/* Total worker execution time */
 		glp_add_rows(lp, nw*ns);
-		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+		for (t = 0; t < nt; t++)
 		{
 			int someone = 0;
 			for (w = 0; w < nw; w++)
@@ -336,7 +374,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 				starpu_worker_get_name(w, name, sizeof(name));
 				snprintf(title, sizeof(title), "worker %s", name);
 				glp_set_row_name(lp, curr_row_idx+s*nw+w+1, title);
-				for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+				for (t = 0, tp = tmp_task_pools; tp; t++, tp = tp->next)
 				{
 					if((int)tp->sched_ctx_id == sched_ctxs[s])
 					{
@@ -362,7 +400,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 
 		/* Total task completion */
 		glp_add_rows(lp, nt);
-		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+		for (t = 0, tp = tmp_task_pools; tp; t++, tp = tp->next)
 		{
 			char name[32], title[64];
 			starpu_worker_get_name(w, name, sizeof(name));
@@ -411,6 +449,12 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 	glp_init_smcp(&parm);
 	parm.msg_lev = GLP_MSG_OFF;
 	int ret = glp_simplex(lp, &parm);
+
+/* 	char str[50]; */
+/* 	sprintf(str, "outpu_lp_%g", tmax); */
+
+/* 	glp_print_sol(lp, str); */
+
 	if (ret)
 	{
 		printf("error in simplex\n");
@@ -449,7 +493,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 
 	double res = glp_get_obj_val(lp);
 	for (w = 0; w < nw; w++)
-		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+		for (t = 0; t < nt; t++)
 /* 			if (integer) */
 /* 				tasks[w][t] = (double)glp_mip_col_val(lp, colnum(w, t)); */
 /*                         else */
@@ -471,10 +515,18 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 	return res;
 }
 
+static struct bound_task_pool* _clone_linked_list(struct bound_task_pool *tp)
+{
+	if(tp == NULL) return NULL;
+
+	struct bound_task_pool *tmp_tp = (struct bound_task_pool*)malloc(sizeof(struct bound_task_pool));
+	memcpy(tmp_tp, tp, sizeof(struct bound_task_pool));
+	tmp_tp->next = _clone_linked_list(tp->next);
+	return tmp_tp;
+}
 
 static void lp2_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
 {
-	_remove_task_from_pool(task, footprint);
 	struct sched_ctx_hypervisor_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
 
 	int ret = pthread_mutex_trylock(&act_hypervisor_mutex);
@@ -491,24 +543,50 @@ static void lp2_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_
 			int ns = sched_ctx_hypervisor_get_nsched_ctxs();
 			int nw = starpu_worker_get_count(); /* Number of different workers */
 			int nt = 0; /* Number of different kinds of tasks */
-			pthread_mutex_lock(&mutex);
-			struct bound_task_pool * tp;
+
+//			pthread_mutex_lock(&mutex);
+
+			/* we don't take the mutex bc a correct value of the number of tasks is
+			   not required but we do a copy in order to be sure
+			   that the linear progr won't segfault if the list of 
+			   submitted task will change during the exec */
+
+			struct bound_task_pool *tp = NULL;
+			struct bound_task_pool *tmp_task_pools = _clone_linked_list(task_pools);
+
 			for (tp = task_pools; tp; tp = tp->next)
 				nt++;
 
+
 			double w_in_s[ns][nw];
 			double tasks_per_worker[nw][nt];
 
-			unsigned found_sol = _compute_task_distribution_over_ctxs(ns, nw, nt, w_in_s, tasks_per_worker, NULL, NULL);
-			pthread_mutex_unlock(&mutex);
+			unsigned found_sol = _compute_task_distribution_over_ctxs(ns, nw, nt, w_in_s, tasks_per_worker, NULL, NULL, tmp_task_pools, 0);
+//			pthread_mutex_unlock(&mutex);
+
 			/* if we did find at least one solution redistribute the resources */
 			if(found_sol)
 				_lp_place_resources_in_ctx(ns, nw, w_in_s, NULL, NULL, 0);
 
+			struct bound_task_pool *next = NULL;
+			struct bound_task_pool *tmp_tp = tmp_task_pools;
+			while(tmp_task_pools)
+			{
+				next = tmp_tp->next;
+				free(tmp_tp);
+				tmp_tp = next;
+				tmp_task_pools = next;
+			}
+			
 
 		}
 		pthread_mutex_unlock(&act_hypervisor_mutex);
 	}
+	/* too expensive to take this mutex and correct value of the number of tasks is not compulsory */
+//	pthread_mutex_lock(&mutex);
+	_remove_task_from_pool(task, footprint);
+//	pthread_mutex_unlock(&mutex);
+
 }
 
 

+ 3 - 0
src/common/starpu_spinlock.c

@@ -82,6 +82,7 @@ int _starpu_spin_lock(struct _starpu_spinlock *lock)
 		/* Give hand to another thread, hopefully the one which has the
 		 * spinlock and probably just has also a short-lived mutex. */
 		MSG_process_sleep(0.000001);
+		STARPU_UYIELD();
 	}
 #elif defined(STARPU_SPINLOCK_CHECK)
 	int ret = pthread_mutex_lock(&lock->errcheck_lock);
@@ -96,6 +97,8 @@ int _starpu_spin_lock(struct _starpu_spinlock *lock)
 	do
 	{
 		prev = STARPU_TEST_AND_SET(&lock->taken, 1);
+		if (prev)
+			STARPU_UYIELD();
 	}
 	while (prev);
 	return 0;

+ 49 - 1
src/common/utils.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -31,6 +31,54 @@
 #include <msg/msg.h>
 #endif
 
+#ifdef STARPU_HAVE_HELGRIND_H
+#include <valgrind/helgrind.h>
+#endif
+
+#ifndef VALGRIND_HG_MUTEX_LOCK_PRE
+#define VALGRIND_HG_MUTEX_LOCK_PRE(mutex, istrylock) ((void)0)
+#endif
+#ifndef VALGRIND_HG_MUTEX_LOCK_POST
+#define VALGRIND_HG_MUTEX_LOCK_POST(mutex) ((void)0)
+#endif
+#ifndef VALGRIND_HG_MUTEX_UNLOCK_PRE
+#define VALGRIND_HG_MUTEX_UNLOCK_PRE(mutex) ((void)0)
+#endif
+#ifndef VALGRIND_HG_MUTEX_UNLOCK_POST
+#define VALGRIND_HG_MUTEX_UNLOCK_POST(mutex) ((void)0)
+#endif
+#ifndef DO_CREQ_v_WW
+#define DO_CREQ_v_WW(_creqF, _ty1F, _arg1F, _ty2F, _arg2F) ((void)0)
+#endif
+#ifndef DO_CREQ_v_W
+#define DO_CREQ_v_W(_creqF, _ty1F, _arg1F) ((void)0)
+#endif
+#ifndef ANNOTATE_HAPPENS_BEFORE
+#define ANNOTATE_HAPPENS_BEFORE(obj) ((void)0)
+#endif
+#ifndef ANNOTATE_HAPPENS_AFTER
+#define ANNOTATE_HAPPENS_AFTER(obj) ((void)0)
+#endif
+#ifndef ANNOTATE_RWLOCK_ACQUIRED
+#define ANNOTATE_RWLOCK_ACQUIRED(lock, is_w) ((void)0)
+#endif
+#ifndef ANNOTATE_RWLOCK_RELEASED
+#define ANNOTATE_RWLOCK_RELEASED(lock, is_w) ((void)0)
+#endif
+
+#define _STARPU_VALGRIND_HG_SPIN_LOCK_PRE(lock) \
+	DO_CREQ_v_WW(_VG_USERREQ__HG_PTHREAD_SPIN_LOCK_PRE, \
+			struct _starpu_spinlock *, lock, long, 0)
+#define _STARPU_VALGRIND_HG_SPIN_LOCK_POST(lock) \
+	DO_CREQ_v_W(_VG_USERREQ__HG_PTHREAD_SPIN_LOCK_POST, \
+			struct _starpu_spinlock *, lock)
+#define _STARPU_VALGRIND_HG_SPIN_UNLOCK_PRE(lock) \
+	DO_CREQ_v_W(_VG_USERREQ__HG_PTHREAD_SPIN_INIT_OR_UNLOCK_PRE, \
+			struct _starpu_spinlock *, lock)
+#define _STARPU_VALGRIND_HG_SPIN_UNLOCK_POST(lock) \
+	DO_CREQ_v_W(_VG_USERREQ__HG_PTHREAD_SPIN_INIT_OR_UNLOCK_POST, \
+			struct _starpu_spinlock *, lock)
+
 #ifdef STARPU_VERBOSE
 #  define _STARPU_DEBUG(fmt, args ...) do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, "[starpu][%s] " fmt ,__func__ ,##args); fflush(stderr); }} while(0)
 #else

+ 4 - 1
src/core/dependencies/implicit_data_deps.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -307,6 +307,9 @@ void _starpu_detect_implicit_data_deps(struct starpu_task *task)
 	STARPU_ASSERT(task->cl);
         _STARPU_LOG_IN();
 
+	if (!task->sequential_consistency)
+		return;
+
 	/* We don't want to enforce a sequential consistency for tasks that are
 	 * not visible to the application. */
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);

+ 12 - 5
src/core/dependencies/tags.c

@@ -178,10 +178,8 @@ void _starpu_tag_clear(void)
 	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
 }
 
-static struct _starpu_tag *gettag_struct(starpu_tag_t id)
+static struct _starpu_tag *_gettag_struct(starpu_tag_t id)
 {
-	_STARPU_PTHREAD_RWLOCK_WRLOCK(&tag_global_rwlock);
-
 	/* search if the tag is already declared or not */
 	struct _starpu_tag_table *entry;
 	struct _starpu_tag *tag;
@@ -212,8 +210,15 @@ static struct _starpu_tag *gettag_struct(starpu_tag_t id)
 #endif
 	}
 
-	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
+	return tag;
+}
 
+static struct _starpu_tag *gettag_struct(starpu_tag_t id)
+{
+	struct _starpu_tag *tag;
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&tag_global_rwlock);
+	tag = _gettag_struct(id);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
 	return tag;
 }
 
@@ -432,10 +437,11 @@ int starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id)
 		return -EDEADLK;
 	}
 
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&tag_global_rwlock);
 	/* only wait the tags that are not done yet */
 	for (i = 0, current = 0; i < ntags; i++)
 	{
-		struct _starpu_tag *tag = gettag_struct(id[i]);
+		struct _starpu_tag *tag = _gettag_struct(id[i]);
 
 		_starpu_spin_lock(&tag->lock);
 
@@ -450,6 +456,7 @@ int starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id)
 			current++;
 		}
 	}
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
 
 	if (current == 0)
 	{

+ 12 - 0
src/core/perfmodel/perfmodel_history.c

@@ -942,6 +942,7 @@ int starpu_perfmodel_list(FILE *output)
 
 /* This function is intended to be used by external tools that should read the
  * performance model files */
+/* TODO: write a cleanup function to free the symbol and the history */
 int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *model)
 {
 	model->symbol = strdup(symbol);
@@ -1064,6 +1065,10 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 		HASH_FIND_UINT32_T(history, &key, entry);
 		_STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);
 
+		/* We do not care about racing access to the mean, we only want a
+		 * good-enough estimation, thus simulate taking the rdlock */
+		ANNOTATE_RWLOCK_ACQUIRED(&model->model_rwlock, 0);
+
 		if (entry && entry->history_entry && entry->history_entry->nsample >= _STARPU_CALIBRATION_MINIMUM)
 			exp = entry->history_entry->mean;
 		else if (!model->benchmarking)
@@ -1075,6 +1080,7 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 			_starpu_set_calibrate_flag(1);
 			model->benchmarking = 1;
 		}
+		ANNOTATE_RWLOCK_RELEASED(&model->model_rwlock, 0);
 	}
 
 	return exp;
@@ -1097,6 +1103,10 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, e
 	entry = (elt == NULL) ? NULL : elt->history_entry;
 	_STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);
 
+	/* We do not care about racing access to the mean, we only want a
+	 * good-enough estimation, thus simulate taking the rdlock */
+	ANNOTATE_RWLOCK_ACQUIRED(&model->model_rwlock, 0);
+
 	exp = entry?entry->mean:NAN;
 
 	if (entry && entry->nsample < _STARPU_CALIBRATION_MINIMUM)
@@ -1115,6 +1125,8 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, e
 		model->benchmarking = 1;
 	}
 
+	ANNOTATE_RWLOCK_RELEASED(&model->model_rwlock, 0);
+
 	return exp;
 }
 

+ 1 - 1
src/core/sched_ctx.c

@@ -490,7 +490,7 @@ void starpu_sched_ctx_delete(unsigned sched_ctx_id)
 	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
 	unsigned nworkers = config->topology.nworkers;
 
-	if(nworkers_ctx > 0 && inheritor_sched_ctx_id != STARPU_NMAX_SCHED_CTXS && 
+	if(nworkers_ctx > 0 && inheritor_sched_ctx && inheritor_sched_ctx->id != STARPU_NMAX_SCHED_CTXS && 
 	   !(nworkers_ctx == nworkers && nworkers_ctx == inheritor_sched_ctx->workers->nworkers))
 	{
 		starpu_sched_ctx_add_workers(workerids, nworkers_ctx, inheritor_sched_ctx_id);

+ 8 - 0
src/core/task.c

@@ -57,6 +57,8 @@ void starpu_task_init(struct starpu_task *task)
 	 * everywhere */
 	memset(task, 0, sizeof(struct starpu_task));
 
+	task->sequential_consistency = 1;
+
 	/* Now we can initialise fields which recquire custom value */
 #if STARPU_DEFAULT_PRIO != 0
 	task->priority = STARPU_DEFAULT_PRIO;
@@ -707,7 +709,11 @@ void _starpu_decrement_nsubmitted_tasks(void)
 	if (--nsubmitted == 0)
 	{
 		if (!config->submitting)
+		{
+			ANNOTATE_HAPPENS_AFTER(&config->running);
 			config->running = 0;
+			ANNOTATE_HAPPENS_BEFORE(&config->running);
+		}
 		_STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
 	}
 
@@ -727,7 +733,9 @@ starpu_drivers_request_termination(void)
 	config->submitting = 0;
 	if (nsubmitted == 0)
 	{
+		ANNOTATE_HAPPENS_AFTER(&config->running);
 		config->running = 0;
+		ANNOTATE_HAPPENS_BEFORE(&config->running);
 		_STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
 	}
 

+ 7 - 1
src/core/workers.c

@@ -893,9 +893,13 @@ out:
 
 unsigned _starpu_machine_is_running(void)
 {
+	unsigned ret;
 	/* running is just protected by a memory barrier */
 	STARPU_RMB();
-	return config.running;
+	ANNOTATE_HAPPENS_AFTER(&config.running);
+	ret = config.running;
+	ANNOTATE_HAPPENS_BEFORE(&config.running);
+	return ret;
 }
 
 unsigned _starpu_worker_can_block(unsigned memnode STARPU_ATTRIBUTE_UNUSED)
@@ -923,8 +927,10 @@ unsigned _starpu_worker_can_block(unsigned memnode STARPU_ATTRIBUTE_UNUSED)
 static void _starpu_kill_all_workers(struct _starpu_machine_config *pconfig)
 {
 	/* set the flag which will tell workers to stop */
+	ANNOTATE_HAPPENS_AFTER(&config.running);
 	pconfig->running = 0;
 	/* running is just protected by a memory barrier */
+	ANNOTATE_HAPPENS_BEFORE(&config.running);
 	STARPU_WMB();
 	starpu_wake_all_blocked_workers();
 }

+ 11 - 0
src/datawizard/data_request.c

@@ -17,6 +17,7 @@
 
 #include <starpu.h>
 #include <common/config.h>
+#include <common/utils.h>
 #include <datawizard/datawizard.h>
 
 /* requests that have not been treated at all */
@@ -391,8 +392,18 @@ void _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)
 	struct _starpu_data_request *r;
 	struct _starpu_data_request_list *new_data_requests;
 
+	/* Note: we here tell valgrind that list_empty (reading a pointer) is
+	 * as safe as if we had the lock held */
+	VALGRIND_HG_MUTEX_LOCK_PRE(&data_requests_list_mutex[src_node], 0);
+	VALGRIND_HG_MUTEX_LOCK_POST(&data_requests_list_mutex[src_node]);
 	if (_starpu_data_request_list_empty(data_requests[src_node]))
+	{
+		VALGRIND_HG_MUTEX_UNLOCK_PRE(&data_requests_list_mutex[src_node]);
+		VALGRIND_HG_MUTEX_UNLOCK_POST(&data_requests_list_mutex[src_node]);
 		return;
+	}
+	VALGRIND_HG_MUTEX_UNLOCK_PRE(&data_requests_list_mutex[src_node]);
+	VALGRIND_HG_MUTEX_UNLOCK_POST(&data_requests_list_mutex[src_node]);
 
 	/* take all the entries from the request list */
         _STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);

+ 3 - 1
src/datawizard/datawizard.c

@@ -26,12 +26,14 @@
 
 void _starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc)
 {
-#ifdef STARPU_SIMGRID
 #if STARPU_DEVEL
 #warning FIXME
 #endif
+#ifdef STARPU_SIMGRID
 	MSG_process_sleep(0.000010);
 #endif
+	STARPU_UYIELD();
+
 	/* in case some other driver requested data */
 	_starpu_handle_pending_node_data_requests(memory_node);
 	_starpu_handle_node_data_requests(memory_node, may_alloc);

+ 13 - 11
src/datawizard/filters.c

@@ -345,13 +345,14 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 	/* still valid ? */
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
+		struct _starpu_data_replicate *local;
 		/* until an issue is found the data is assumed to be valid */
 		unsigned isvalid = 1;
 
 		for (child = 0; child < root_handle->nchildren; child++)
 		{
 			starpu_data_handle_t child_handle = starpu_data_get_child(root_handle, child);
-			struct _starpu_data_replicate *local = &child_handle->per_node[node];
+			local = &child_handle->per_node[node];
 
 			if (local->state == STARPU_INVALID)
 			{
@@ -359,24 +360,21 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 				isvalid = 0;
 			}
 
-			if (local->allocated && local->automatically_allocated)
-			{
+			if (local->mc && local->allocated && local->automatically_allocated)
 				/* free the child data copy in a lazy fashion */
-#ifdef STARPU_DEVEL
-#warning FIXME!! this needs access to the child interface, which was freed above!
-#endif
-				_starpu_request_mem_chunk_removal(child_handle, node, sizes[child]);
-			}
+				_starpu_request_mem_chunk_removal(child_handle, local, node, sizes[child]);
 		}
 
-		if (!root_handle->per_node[node].allocated)
+		local = &root_handle->per_node[node];
+
+		if (!local->allocated)
 			/* Even if we have all the bits, if we don't have the
 			 * whole data, it's not valid */
 			isvalid = 0;
 
-		if (!isvalid && root_handle->per_node[node].allocated && root_handle->per_node[node].automatically_allocated)
+		if (!isvalid && local->mc && local->allocated && local->automatically_allocated)
 			/* free the data copy in a lazy fashion */
-			_starpu_request_mem_chunk_removal(root_handle, node, _starpu_data_get_size(root_handle));
+			_starpu_request_mem_chunk_removal(root_handle, local, node, _starpu_data_get_size(root_handle));
 
 		/* if there was no invalid copy, the node still has a valid copy */
 		still_valid[node] = isvalid;
@@ -400,6 +398,10 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 		starpu_data_handle_t child_handle = starpu_data_get_child(root_handle, child);
 		_starpu_spin_unlock(&child_handle->header_lock);
 		_starpu_spin_destroy(&child_handle->header_lock);
+
+		_STARPU_PTHREAD_MUTEX_DESTROY(&child_handle->busy_mutex);
+		_STARPU_PTHREAD_COND_DESTROY(&child_handle->busy_cond);
+		_STARPU_PTHREAD_MUTEX_DESTROY(&child_handle->sequential_consistency_mutex);
 	}
 
 	/* there is no child anymore */

+ 1 - 1
src/datawizard/interfaces/block_interface.c

@@ -315,7 +315,7 @@ static void free_block_buffer_on_node(void *data_interface, unsigned node)
 	uint32_t nz = block_interface->nz;
 	size_t elemsize = block_interface->elemsize;
 
-	starpu_free_on_node(node, block_interface->ptr, nx*ny*nz*elemsize);
+	starpu_free_on_node(node, block_interface->dev_handle, nx*ny*nz*elemsize);
 }
 
 #ifdef STARPU_USE_CUDA

+ 51 - 10
src/datawizard/interfaces/data_interface.c

@@ -480,6 +480,8 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 					_STARPU_PTHREAD_COND_WAIT(&arg.cond, &arg.mutex);
 				_STARPU_PTHREAD_MUTEX_UNLOCK(&arg.mutex);
 			}
+			_STARPU_PTHREAD_MUTEX_DESTROY(&arg.mutex);
+			_STARPU_PTHREAD_COND_DESTROY(&arg.cond);
 			_starpu_release_data_on_node(handle, 0, &handle->per_node[home_node]);
 		}
 
@@ -546,23 +548,49 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 
 	/* Wait for all requests to finish (notably WT requests) */
 	_STARPU_PTHREAD_MUTEX_LOCK(&handle->busy_mutex);
-	while (handle->busy_count)
+	while (1) {
+		int busy;
+		/* Note: we here tell valgrind that reading busy_count is as
+		 * safe as if we had the lock held */
+		_STARPU_VALGRIND_HG_SPIN_LOCK_PRE(&handle->header_lock);
+		_STARPU_VALGRIND_HG_SPIN_LOCK_POST(&handle->header_lock);
+		busy = handle->busy_count;
+		_STARPU_VALGRIND_HG_SPIN_UNLOCK_PRE(&handle->header_lock);
+		_STARPU_VALGRIND_HG_SPIN_UNLOCK_POST(&handle->header_lock);
+		if (!busy)
+			break;
+		/* This is woken by _starpu_data_check_not_busy, always called
+		 * after decrementing busy_count */
 		_STARPU_PTHREAD_COND_WAIT(&handle->busy_cond, &handle->busy_mutex);
+	}
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->busy_mutex);
 
 	/* Wait for finished requests to release the handle */
 	_starpu_spin_lock(&handle->header_lock);
 
+	size_t size = _starpu_data_get_size(handle);
+
+	_starpu_data_free_interfaces(handle);
+
 	/* Destroy the data now */
 	unsigned node;
-	size_t size = _starpu_data_get_size(handle);
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
+		struct _starpu_data_replicate *local = &handle->per_node[node];
 		/* free the data copy in a lazy fashion */
-		_starpu_request_mem_chunk_removal(handle, node, size);
+		if (local->allocated && local->automatically_allocated)
+			_starpu_request_mem_chunk_removal(handle, local, node, size);
+	}
+	unsigned worker;
+	unsigned nworkers = starpu_worker_get_count();
+	for (worker = 0; worker < nworkers; worker++)
+	{
+		struct _starpu_data_replicate *local = &handle->per_worker[worker];
+		/* free the data copy in a lazy fashion */
+		if (local->allocated && local->automatically_allocated)
+			_starpu_request_mem_chunk_removal(handle, local, starpu_worker_get_memory_node(worker), size);
 	}
 
-	_starpu_data_free_interfaces(handle);
 	_starpu_memory_stats_free(handle);
 	_starpu_data_requester_list_delete(handle->req_list);
 	_starpu_data_requester_list_delete(handle->reduction_req_list);
@@ -570,6 +598,10 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 	_starpu_spin_unlock(&handle->header_lock);
 	_starpu_spin_destroy(&handle->header_lock);
 
+	_STARPU_PTHREAD_MUTEX_DESTROY(&handle->busy_mutex);
+	_STARPU_PTHREAD_COND_DESTROY(&handle->busy_cond);
+	_STARPU_PTHREAD_MUTEX_DESTROY(&handle->sequential_consistency_mutex);
+
 	free(handle);
 }
 
@@ -604,13 +636,22 @@ static void _starpu_data_invalidate(void *data)
 	{
 		struct _starpu_data_replicate *local = &handle->per_node[node];
 
-		if (local->allocated && local->automatically_allocated)
-		{
+		if (local->mc && local->allocated && local->automatically_allocated)
 			/* free the data copy in a lazy fashion */
-			_starpu_request_mem_chunk_removal(handle, node, size);
-			local->allocated = 0;
-			local->automatically_allocated = 0;
-		}
+			_starpu_request_mem_chunk_removal(handle, local, node, size);
+
+		local->state = STARPU_INVALID;
+	}
+
+	unsigned worker;
+	unsigned nworkers = starpu_worker_get_count();
+	for (worker = 0; worker < nworkers; worker++)
+	{
+		struct _starpu_data_replicate *local = &handle->per_worker[worker];
+
+		if (local->mc && local->allocated && local->automatically_allocated)
+			/* free the data copy in a lazy fashion */
+			_starpu_request_mem_chunk_removal(handle, local, starpu_worker_get_memory_node(worker), size);
 
 		local->state = STARPU_INVALID;
 	}

+ 1 - 1
src/datawizard/interfaces/matrix_interface.c

@@ -291,7 +291,7 @@ static void free_matrix_buffer_on_node(void *data_interface, unsigned node)
 	uint32_t ny = matrix_interface->ny;
 	size_t elemsize = matrix_interface->elemsize;
 
-	starpu_free_on_node(node, matrix_interface->ptr, nx*ny*elemsize);
+	starpu_free_on_node(node, matrix_interface->dev_handle, nx*ny*elemsize);
 }
 
 #ifdef STARPU_USE_CUDA

+ 1 - 1
src/datawizard/interfaces/vector_interface.c

@@ -212,7 +212,7 @@ static void free_vector_buffer_on_node(void *data_interface, unsigned node)
 	uint32_t nx = vector_interface->nx;
 	size_t elemsize = vector_interface->elemsize;
 
-	starpu_free_on_node(node, vector_interface->ptr, nx*elemsize);
+	starpu_free_on_node(node, vector_interface->dev_handle, nx*elemsize);
 }
 
 static int copy_any_to_any(void *src_interface, unsigned src_node,

+ 10 - 8
src/datawizard/malloc.c

@@ -251,11 +251,6 @@ static struct starpu_codelet free_pinned_cl =
 
 int starpu_free_flags(void *A, size_t dim, int flags)
 {
-	if (flags & STARPU_MALLOC_COUNT)
-	{
-		_starpu_memory_manager_deallocate_size(dim, 0);
-	}
-
 #ifndef STARPU_SIMGRID
 	if (flags & STARPU_MALLOC_PINNED)
 	{
@@ -272,7 +267,7 @@ int starpu_free_flags(void *A, size_t dim, int flags)
 				cudaError_t err = cudaFreeHost(A);
 				if (STARPU_UNLIKELY(err))
 					STARPU_CUDA_REPORT_ERROR(err);
-				return 0;
+				goto out;
 #ifndef HAVE_CUDA_MEMCPY_PEER
 			}
 			else
@@ -293,7 +288,7 @@ int starpu_free_flags(void *A, size_t dim, int flags)
 
 				push_res = _starpu_task_submit_internally(task);
 				STARPU_ASSERT(push_res != -ENODEV);
-				return 0;
+				goto out;
 			}
 #endif /* HAVE_CUDA_MEMCPY_PEER */
 #endif /* STARPU_USE_CUDA */
@@ -317,13 +312,20 @@ int starpu_free_flags(void *A, size_t dim, int flags)
 //
 //		push_res = starpu_task_submit(task);
 //		STARPU_ASSERT(push_res != -ENODEV);
-//		return 0;
+//		goto out;
 //	}
 //#endif
 	}
 #endif /* STARPU_SIMGRID */
 
 	free(A);
+
+out:
+	if (flags & STARPU_MALLOC_COUNT)
+	{
+		_starpu_memory_manager_deallocate_size(dim, 0);
+	}
+
 	return 0;
 }
 

+ 31 - 43
src/datawizard/memalloc.c

@@ -62,6 +62,8 @@ void _starpu_deinit_mem_chunk_lists(void)
 		_starpu_mem_chunk_list_delete(mc_list[i]);
 		_starpu_mem_chunk_list_delete(memchunk_cache[i]);
 		_starpu_mem_chunk_lru_list_delete(starpu_lru_list[i]);
+		_starpu_spin_destroy(&lru_rwlock[i]);
+		_STARPU_PTHREAD_RWLOCK_DESTROY(&mc_rwlock[i]);
 	}
 }
 
@@ -694,59 +696,45 @@ static void register_mem_chunk(struct _starpu_data_replicate *replicate, unsigne
  * unregister or unpartition). It puts all the memchunks that refer to the
  * specified handle into the cache.
  */
-void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, unsigned node, size_t size)
+void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned node, size_t size)
 {
-	_starpu_spin_checklocked(&handle->header_lock);
+	struct _starpu_mem_chunk *mc = replicate->mc;
 
-	_STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
+	STARPU_ASSERT(mc->data == handle);
 
-	/* TODO: expensive, handle should have its own list of chunks? */
-	/* iterate over the list of memory chunks and remove the entry */
-	struct _starpu_mem_chunk *mc, *next_mc;
-	for (mc = _starpu_mem_chunk_list_begin(mc_list[node]);
-	     mc != _starpu_mem_chunk_list_end(mc_list[node]);
-	     mc = next_mc)
-	{
-		next_mc = _starpu_mem_chunk_list_next(mc);
+	/* Record the allocated size, so that later in memory
+	 * reclaiming we can estimate how much memory we free
+	 * by freeing this.  */
+	mc->size = size;
 
-		if (mc->data == handle)
-		{
-			/* we found the data */
+	/* This memchunk doesn't have to do with the data any more. */
+	replicate->mc = NULL;
+	replicate->allocated = 0;
+	replicate->automatically_allocated = 0;
 
-			/* Record the allocated size, so that later in memory
-			 * reclaiming we can estimate how much memory we free
-			 * by freeing this.  */
-			mc->size = size;
-			/* This memchunk doesn't have to do with the data any more. */
-			mc->data = NULL;
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
 
-			/* remove it from the main list */
-			_starpu_mem_chunk_list_erase(mc_list[node], mc);
+	mc->data = NULL;
+	/* remove it from the main list */
+	_starpu_mem_chunk_list_erase(mc_list[node], mc);
 
-			/* We would never flush the node 0 cache, unless
-			 * malloc() returns NULL, which is very unlikely... */
-			/* This is particularly important when
-			 * STARPU_USE_ALLOCATION_CACHE is not enabled, as we
-			 * wouldn't even re-use these allocations! */
-			if (starpu_node_get_kind(node) == STARPU_CPU_RAM)
-			{
-				free_memory_on_node(mc, node);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
 
-				free(mc->chunk_interface);
-				_starpu_mem_chunk_delete(mc);
-			}
-			else
-				/* put it in the list of buffers to be removed */
-				_starpu_mem_chunk_list_push_front(memchunk_cache[node], mc);
+	/* We would never flush the node 0 cache, unless
+	 * malloc() returns NULL, which is very unlikely... */
+	/* This is particularly important when
+	 * STARPU_USE_ALLOCATION_CACHE is not enabled, as we
+	 * wouldn't even re-use these allocations! */
+	if (starpu_node_get_kind(node) == STARPU_CPU_RAM)
+	{
+		free_memory_on_node(mc, node);
 
-			/* Note that we do not stop here because there can be
-			 * multiple replicates associated to the same handle on
-			 * the same memory node.  */
-		}
+		free(mc->chunk_interface);
+		_starpu_mem_chunk_delete(mc);
 	}
-
-	/* there was no corresponding buffer ... */
-	_STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+	else
+		/* put it in the list of buffers to be removed */
+		_starpu_mem_chunk_list_push_front(memchunk_cache[node], mc);
 }
 
 /*

+ 1 - 1
src/datawizard/memalloc.h

@@ -62,7 +62,7 @@ LIST_TYPE(_starpu_mem_chunk_lru,
 
 void _starpu_init_mem_chunk_lists(void);
 void _starpu_deinit_mem_chunk_lists(void);
-void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, unsigned node, size_t size);
+void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned node, size_t size);
 int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned is_prefetch);
 size_t _starpu_free_all_automatically_allocated_buffers(unsigned node);
 void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node);

+ 1 - 0
src/datawizard/memory_nodes.c

@@ -55,6 +55,7 @@ void _starpu_memory_nodes_deinit(void)
 	_starpu_deinit_data_request_lists();
 	_starpu_deinit_mem_chunk_lists();
 
+	_STARPU_PTHREAD_RWLOCK_DESTROY(&descr.conditions_rwlock);
 	_STARPU_PTHREAD_KEY_DELETE(memory_node_key);
 }
 

+ 10 - 4
src/datawizard/user_interactions.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -239,11 +239,12 @@ int starpu_data_acquire_on_node(starpu_data_handle_t handle, unsigned node, enum
 		.handle = handle,
 		.mode = mode,
 		.node = node,
-		.cond = _STARPU_PTHREAD_COND_INITIALIZER,
-		.lock = _STARPU_PTHREAD_MUTEX_INITIALIZER,
 		.finished = 0
 	};
 
+	_STARPU_PTHREAD_COND_INIT(&wrapper.cond, NULL);
+	_STARPU_PTHREAD_MUTEX_INIT(&wrapper.lock, NULL);
+
 //	_STARPU_DEBUG("TAKE sequential_consistency_mutex starpu_data_acquire\n");
 	_STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 	int sequential_consistency = handle->sequential_consistency;
@@ -297,8 +298,9 @@ int starpu_data_acquire_on_node(starpu_data_handle_t handle, unsigned node, enum
 		while (!wrapper.finished)
 			_STARPU_PTHREAD_COND_WAIT(&wrapper.cond, &wrapper.lock);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&wrapper.lock);
-		_STARPU_PTHREAD_MUTEX_DESTROY(&wrapper.lock);
 	}
+	_STARPU_PTHREAD_COND_DESTROY(&wrapper.cond);
+	_STARPU_PTHREAD_MUTEX_DESTROY(&wrapper.lock);
 
 	/* At that moment, the caller holds a reference to the piece of data.
 	 * We enqueue the "post" sync task in the list associated to the handle
@@ -381,6 +383,8 @@ int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigne
 		/* we can immediately proceed */
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
 
+		_STARPU_PTHREAD_COND_DESTROY(&wrapper->cond);
+		_STARPU_PTHREAD_MUTEX_DESTROY(&wrapper->lock);
 		free(wrapper);
 
 		_starpu_fetch_data_on_node(handle, replicate, mode, async, async, NULL, NULL);
@@ -410,6 +414,8 @@ int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigne
 		while (!wrapper->finished)
 			_STARPU_PTHREAD_COND_WAIT(&wrapper->cond, &wrapper->lock);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&wrapper->lock);
+		_STARPU_PTHREAD_COND_DESTROY(&wrapper->cond);
+		_STARPU_PTHREAD_MUTEX_DESTROY(&wrapper->lock);
 		free(wrapper);
 	}
 

+ 4 - 2
src/debug/traces/starpu_fxt.c

@@ -86,7 +86,7 @@ static unsigned get_colour_symbol_blue(char *name)
 }
 
 static double last_codelet_start[STARPU_NMAXWORKERS];
-static char last_codelet_symbol[128][STARPU_NMAXWORKERS];
+static char last_codelet_symbol[STARPU_NMAXWORKERS][128];
 
 /* If more than a period of time has elapsed, we flush the profiling info,
  * otherwise they are accumulated everytime there is a new relevant event. */
@@ -144,6 +144,8 @@ static void register_worker_id(unsigned long tid, int workerid)
 
 	HASH_FIND(hh, worker_ids, &tid, sizeof(tid), entry);
 
+	STARPU_ASSERT_MSG(workerid < STARPU_NMAXWORKERS, "Too many workers in this trace, please increase the maximum number of CPUs and GPUs to the same value as was used for execution");
+
 	/* only register a thread once */
 	STARPU_ASSERT(entry == NULL);
 
@@ -506,7 +508,7 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 	unsigned long has_name = ev->param[3];
 	char *name = has_name?(char *)&ev->param[4]:"unknown";
 
-	snprintf(last_codelet_symbol[worker], 128, "%s", name);
+	snprintf(last_codelet_symbol[worker], sizeof(last_codelet_symbol[worker]), "%s", name);
 
 	double start_codelet_time = get_event_time_stamp(ev, options);
 	last_codelet_start[worker] = start_codelet_time;

+ 1 - 1
src/drivers/cpu/driver_cpu.c

@@ -192,7 +192,7 @@ _starpu_get_worker_from_driver(struct starpu_driver *d)
 static size_t _starpu_cpu_get_global_mem_size(int devid, struct _starpu_machine_config *config)
 {
 	size_t global_mem;
-	int limit;
+	ssize_t limit;
 
 	limit = starpu_get_env_number("STARPU_LIMIT_CPU_MEM");
 #ifdef STARPU_DEVEL

+ 3 - 3
src/drivers/cuda/driver_cuda.c

@@ -73,7 +73,7 @@ _starpu_cuda_discover_devices (struct _starpu_machine_config *config)
  */
 static void _starpu_cuda_limit_gpu_mem_if_needed(unsigned devid)
 {
-	int limit;
+	ssize_t limit;
 	size_t STARPU_ATTRIBUTE_UNUSED totalGlobalMem = 0;
 	size_t STARPU_ATTRIBUTE_UNUSED to_waste = 0;
 	char name[30];
@@ -101,8 +101,8 @@ static void _starpu_cuda_limit_gpu_mem_if_needed(unsigned devid)
 	props[devid].totalGlobalMem -= to_waste;
 #endif /* STARPU_USE_CUDA */
 
-	_STARPU_DEBUG("CUDA device %u: Wasting %ld MB / Limit %d MB / Total %ld MB / Remains %ld MB\n",
-			devid, (long) to_waste/(1024*1024), limit, (long) totalGlobalMem/(1024*1024),
+	_STARPU_DEBUG("CUDA device %u: Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n",
+			devid, (long) to_waste/(1024*1024), (long) limit, (long) totalGlobalMem/(1024*1024),
 			(long) (totalGlobalMem - to_waste)/(1024*1024));
 }
 

+ 3 - 2
src/drivers/driver_common/driver_common.c

@@ -174,11 +174,12 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 
 		if (_starpu_worker_can_block(memnode))
 			_STARPU_PTHREAD_COND_WAIT(&args->sched_cond, &args->sched_mutex);
-#ifdef STARPU_SIMGRID
 		else
 		{
 			if (_starpu_machine_is_running())
 			{
+				STARPU_UYIELD();
+#ifdef STARPU_SIMGRID
 				static int warned;
 				if (!warned)
 				{
@@ -186,9 +187,9 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 					_STARPU_DISP("Has to make simgrid spin for progression hooks\n");
 				}
 				MSG_process_sleep(0.000010);
+#endif
 			}
 		}
-#endif
 
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);
 

+ 12 - 4
src/drivers/opencl/driver_opencl.c

@@ -61,7 +61,7 @@ _starpu_opencl_discover_devices(struct _starpu_machine_config *config)
 
 static void _starpu_opencl_limit_gpu_mem_if_needed(unsigned devid)
 {
-	int limit;
+	ssize_t limit;
 	size_t STARPU_ATTRIBUTE_UNUSED totalGlobalMem = 0;
 	size_t STARPU_ATTRIBUTE_UNUSED to_waste = 0;
 	char name[30];
@@ -90,9 +90,9 @@ static void _starpu_opencl_limit_gpu_mem_if_needed(unsigned devid)
 	to_waste = totalGlobalMem - global_mem[devid];
 #endif
 
-	_STARPU_DEBUG("OpenCL device %d: Wasting %ld MB / Limit %d MB / Total %ld MB / Remains %ld MB\n",
-                      devid, (size_t)to_waste/(1024*1024), limit, (size_t)totalGlobalMem/(1024*1024),
-                      (size_t)(totalGlobalMem - to_waste)/(1024*1024));
+	_STARPU_DEBUG("OpenCL device %d: Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n",
+			devid, (long)to_waste/(1024*1024), (long) limit, (long)totalGlobalMem/(1024*1024),
+			(long)(totalGlobalMem - to_waste)/(1024*1024));
 
 }
 
@@ -701,11 +701,19 @@ int _starpu_opencl_driver_deinit(struct starpu_driver *d)
 	unsigned memnode = args->memory_node;
 
 	_starpu_handle_all_pending_node_data_requests(memnode);
+
+	/* In case there remains some memory that was automatically
+	 * allocated by StarPU, we release it now. Note that data
+	 * coherency is not maintained anymore at that point ! */
+	_starpu_free_all_automatically_allocated_buffers(memnode);
+
 #ifndef STARPU_SIMGRID
 	unsigned devid   = args->devid;
         _starpu_opencl_deinit_context(devid);
 #endif
 
+	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_OPENCL_KEY);
+
 	return 0;
 }
 

+ 25 - 15
src/profiling/profiling.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -68,8 +68,10 @@ int _starpu_profiling =
 
 int starpu_profiling_status_set(int status)
 {
+	ANNOTATE_HAPPENS_AFTER(&_starpu_profiling);
 	int prev_value = _starpu_profiling;
 	_starpu_profiling = status;
+	ANNOTATE_HAPPENS_BEFORE(&_starpu_profiling);
 
 	_STARPU_TRACE_SET_PROFILING(status);
 
@@ -94,12 +96,6 @@ int starpu_profiling_status_set(int status)
 	return prev_value;
 }
 
-#undef starpu_profiling_status_get
-int starpu_profiling_status_get(void)
-{
-	return _starpu_profiling;
-}
-
 void _starpu_profiling_init(void)
 {
 	int worker;
@@ -110,7 +106,11 @@ void _starpu_profiling_init(void)
 		_starpu_worker_reset_profiling_info(worker);
 	}
 	if ((env = getenv("STARPU_PROFILING")) && atoi(env))
+	{
+		ANNOTATE_HAPPENS_AFTER(&_starpu_profiling);
 		_starpu_profiling = STARPU_PROFILING_ENABLE;
+		ANNOTATE_HAPPENS_BEFORE(&_starpu_profiling);
+	}
 }
 
 void _starpu_profiling_terminate(void)
@@ -127,7 +127,7 @@ struct starpu_task_profiling_info *_starpu_allocate_profiling_info_if_needed(str
 	struct starpu_task_profiling_info *info = NULL;
 
 	/* If we are benchmarking, we need room for the power consumption */
-	if (_starpu_profiling || (task->cl && task->cl->power_model && (task->cl->power_model->benchmarking || _starpu_get_calibrate_flag())))
+	if (starpu_profiling_status_get() || (task->cl && task->cl->power_model && (task->cl->power_model->benchmarking || _starpu_get_calibrate_flag())))
 	{
 		info = (struct starpu_task_profiling_info *) calloc(1, sizeof(struct starpu_task_profiling_info));
 		STARPU_ASSERT(info);
@@ -191,7 +191,7 @@ void _starpu_worker_reset_profiling_info(int workerid)
 
 void _starpu_worker_restart_sleeping(int workerid)
 {
-	if (_starpu_profiling)
+	if (starpu_profiling_status_get())
 	{
 		struct timespec sleep_start_time;
 		_starpu_clock_gettime(&sleep_start_time);
@@ -205,7 +205,7 @@ void _starpu_worker_restart_sleeping(int workerid)
 
 void _starpu_worker_stop_sleeping(int workerid)
 {
-	if (_starpu_profiling)
+	if (starpu_profiling_status_get())
 	{
 		struct timespec *sleeping_start, sleep_end_time;
 
@@ -240,7 +240,7 @@ void _starpu_worker_stop_sleeping(int workerid)
 
 void _starpu_worker_register_executing_start_date(int workerid, struct timespec *executing_start)
 {
-	if (_starpu_profiling)
+	if (starpu_profiling_status_get())
 	{
 		_STARPU_PTHREAD_MUTEX_LOCK(&worker_info_mutex[workerid]);
 		worker_registered_executing_start[workerid] = 1;
@@ -252,7 +252,7 @@ void _starpu_worker_register_executing_start_date(int workerid, struct timespec
 
 void _starpu_worker_update_profiling_info_executing(int workerid, struct timespec *executing_time, int executed_tasks, uint64_t used_cycles, uint64_t stall_cycles, double power_consumed)
 {
-	if (_starpu_profiling)
+	if (starpu_profiling_status_get())
 	{
 		_STARPU_PTHREAD_MUTEX_LOCK(&worker_info_mutex[workerid]);
 
@@ -272,7 +272,7 @@ void _starpu_worker_update_profiling_info_executing(int workerid, struct timespe
 
 int starpu_worker_get_profiling_info(int workerid, struct starpu_worker_profiling_info *info)
 {
-	if (!_starpu_profiling)
+	if (!starpu_profiling_status_get())
 	{
 		/* Not thread safe, shouldn't be too much a problem */
 		info->executed_tasks = worker_info[workerid].executed_tasks;
@@ -319,7 +319,7 @@ int starpu_worker_get_profiling_info(int workerid, struct starpu_worker_profilin
 /* When did the task reach the scheduler  ? */
 void _starpu_profiling_set_task_push_start_time(struct starpu_task *task)
 {
-	if (!_starpu_profiling)
+	if (!starpu_profiling_status_get())
 		return;
 
 	struct starpu_task_profiling_info *profiling_info;
@@ -331,7 +331,7 @@ void _starpu_profiling_set_task_push_start_time(struct starpu_task *task)
 
 void _starpu_profiling_set_task_push_end_time(struct starpu_task *task)
 {
-	if (!_starpu_profiling)
+	if (!starpu_profiling_status_get())
 		return;
 
 	struct starpu_task_profiling_info *profiling_info;
@@ -429,3 +429,13 @@ void _starpu_bus_update_profiling_info(int src_node, int dst_node, size_t size)
 	bus_profiling_info[src_node][dst_node].transfer_count++;
 //	fprintf(stderr, "PROFILE %d -> %d : %d (cnt %d)\n", src_node, dst_node, size, bus_profiling_info[src_node][dst_node].transfer_count);
 }
+
+#undef starpu_profiling_status_get
+int starpu_profiling_status_get(void)
+{
+	int ret;
+	ANNOTATE_HAPPENS_AFTER(&_starpu_profiling);
+	ret = _starpu_profiling;
+	ANNOTATE_HAPPENS_BEFORE(&_starpu_profiling);
+	return ret;
+}

+ 1 - 0
tests/Makefile.am

@@ -103,6 +103,7 @@ noinst_PROGRAMS =				\
 	main/deprecated_buffer			\
 	main/driver_api/init_run_deinit         \
 	main/driver_api/run_driver              \
+	main/deploop                            \
 	main/restart				\
 	main/execute_on_a_specific_worker	\
 	main/insert_task			\

+ 1 - 1
tests/loader.c

@@ -120,7 +120,7 @@ static void test_cleaner(int sig)
 	fprintf(stderr, "[error] test %s has been blocked for %d seconds. Mark it as failed\n", test_name, timeout);
 	child_gid = getpgid(child_pid);
 	launch_gdb(test_name);
-	kill(-child_gid, SIGKILL);
+	kill(-child_gid, SIGQUIT);
 	exit(EXIT_FAILURE);
 }
 

+ 92 - 0
tests/main/deploop.c

@@ -0,0 +1,92 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * Create task A and B such that
+ * - B depends on A by tag dependency.
+ * - A would depend on B by data dependency, but we disable that.
+ */
+
+#include <pthread.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <starpu.h>
+#include "../helper.h"
+
+static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attribute__ ((unused)))
+{
+	FPRINTF(stderr,"executing task %p\n", starpu_task_get_current());
+}
+
+static struct starpu_codelet dummy_codelet = 
+{
+	.cpu_funcs = {dummy_func, NULL},
+	.cuda_funcs = {dummy_func, NULL},
+	.opencl_funcs = {dummy_func, NULL},
+	.model = NULL,
+	.nbuffers = 1,
+	.modes = { STARPU_RW }
+};
+
+int main(int argc, char **argv)
+{
+	int ret;
+	starpu_data_handle_t handle;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_void_data_register(&handle);
+
+	struct starpu_task *taskA, *taskB;
+
+	/* Make B depend on A */
+	starpu_tag_declare_deps(1, 1, (starpu_tag_t) 0);
+
+	taskA = starpu_task_create();
+	taskA->cl = &dummy_codelet;
+	taskA->tag_id = 0;
+	taskA->use_tag = 1;
+	taskA->handles[0] = handle;
+	taskA->sequential_consistency = 0;
+	FPRINTF(stderr,"A is %p\n", taskA);
+
+	taskB = starpu_task_create();
+	taskB->cl = &dummy_codelet;
+	taskB->tag_id = 1;
+	taskB->use_tag = 1;
+	taskB->handles[0] = handle;
+	FPRINTF(stderr,"B is %p\n", taskB);
+
+	ret = starpu_task_submit(taskB);
+	if (ret == -ENODEV)
+		return STARPU_TEST_SKIPPED;
+	ret = starpu_task_submit(taskA);
+	if (ret == -ENODEV)
+		return STARPU_TEST_SKIPPED;
+
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+
+	starpu_data_unregister(handle);
+
+	starpu_shutdown();
+
+	return EXIT_SUCCESS;
+}

+ 1 - 1
tests/microbenchs/tasks_overhead.c

@@ -155,7 +155,7 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_tag_wait");
 	gettimeofday(&end_exec, NULL);
 
-	for (i = 1; i < ntasks; i++)
+	for (i = 0; i < ntasks; i++)
 		starpu_task_clean(&tasks[i]);
 
 	for (buffer = 0; buffer < nbuffers; buffer++)

+ 19 - 30
tools/valgrind/starpu.suppr

@@ -1,18 +1,4 @@
 {
-   config.running is not racy from starpu_shutdown
-   Helgrind:Race
-   fun:starpu_shutdown
-   ...
-}
-
-{
-   config.running is not racy from _starpu_machine_is_running
-   Helgrind:Race
-   fun:_starpu_machine_is_running
-   ...
-}
-
-{
    don't care about cache hit stats
    Helgrind:Race
    fun:_starpu_msi_cache_hit
@@ -48,37 +34,31 @@
 }
 
 {
-   We do not care about the race on the entry->mean variable, we only want a good-enough estimation.
-   Helgrind:Race
-   fun: _starpu_history_based_job_expected_perf
-   ...
-}
-
-{
    We do not care about races on profiling statistics
    Helgrind:Race
-   fun: starpu_profiling_status_get
+   fun:_starpu_worker_get_status
+   fun:_starpu_worker_reset_profiling_info_with_lock
    ...
 }
 
 {
    This is racy, but since we'll always put the same values, this is not a problem.
    Helgrind:Race
-   fun: _starpu_codelet_check_deprecated_fields
+   fun:_starpu_codelet_check_deprecated_fields
    ...
 }
 
 {
    This is racy, but we don't care, it's only a statistic
    Helgrind:Race
-   fun: starpu_task_nsubmitted
+   fun:starpu_task_nsubmitted
    ...
 }
 
 {
    This is racy, but we don't care, it's only a statistic
    Helgrind:Race
-   fun: starpu_task_nready
+   fun:starpu_task_nready
    ...
 }
 
@@ -92,18 +72,27 @@
 }
 
 {
-   This is racy, but we don't care, if the function was called a bit earlier we would have had a different value
+   This is racy, but keep it away for now, otherwise it clutters the buildbot log
    Helgrind:Race
-   fun: _starpu_fifo_empty
-   fun: pop_task_eager_policy
+   fun:_starpu_fifo_empty
+   fun:pop_task_eager_policy
    ...
 }
 
 {
    This is the counterpart of the suppression above
    Helgrind:Race
-   fun: _starpu_fifo_push_task
-   fun: push_task_eager_policy
+   fun:_starpu_fifo_push_task
+   fun:push_task_eager_policy
    ...
 }
 
+
+{
+   Same as the push suppression above, but through the sorted-push path
+   Helgrind:Race
+   fun:_starpu_fifo_push_sorted_task
+   fun:_starpu_fifo_push_task
+   fun:push_task_eager_policy
+   ...
+}