Marc Sergent, 13 лет назад
Родитель
Commit
a007c4d009
46 измененных файлов с 617 добавлено и 259 удалено
  1. 3 0
      ChangeLog
  2. 8 0
      configure.ac
  3. 7 1
      doc/chapters/basic-api.texi
  4. 5 1
      doc/chapters/configuration.texi
  5. 6 5
      examples/tag_example/tag_restartable.c
  6. 9 2
      include/starpu_profiling.h
  7. 5 0
      include/starpu_task.h
  8. 9 1
      include/starpu_util.h
  9. 6 6
      mpi/examples/Makefile.am
  10. 5 3
      mpi/examples/stencil/stencil5.c
  11. 18 18
      mpi/src/starpu_mpi.c
  12. 25 35
      mpi/src/starpu_mpi_collective.c
  13. 3 0
      mpi/tests/mpi_probe.c
  14. 8 8
      mpi/tests/user_defined_datatype.c
  15. 108 30
      sched_ctx_hypervisor/src/hypervisor_policies/lp2_policy.c
  16. 3 0
      src/common/starpu_spinlock.c
  17. 49 1
      src/common/utils.h
  18. 4 1
      src/core/dependencies/implicit_data_deps.c
  19. 12 5
      src/core/dependencies/tags.c
  20. 12 0
      src/core/perfmodel/perfmodel_history.c
  21. 1 1
      src/core/sched_ctx.c
  22. 8 0
      src/core/task.c
  23. 7 1
      src/core/workers.c
  24. 11 0
      src/datawizard/data_request.c
  25. 3 1
      src/datawizard/datawizard.c
  26. 13 11
      src/datawizard/filters.c
  27. 1 1
      src/datawizard/interfaces/block_interface.c
  28. 51 10
      src/datawizard/interfaces/data_interface.c
  29. 1 1
      src/datawizard/interfaces/matrix_interface.c
  30. 1 1
      src/datawizard/interfaces/vector_interface.c
  31. 10 8
      src/datawizard/malloc.c
  32. 31 43
      src/datawizard/memalloc.c
  33. 1 1
      src/datawizard/memalloc.h
  34. 1 0
      src/datawizard/memory_nodes.c
  35. 10 4
      src/datawizard/user_interactions.c
  36. 4 2
      src/debug/traces/starpu_fxt.c
  37. 1 1
      src/drivers/cpu/driver_cpu.c
  38. 3 3
      src/drivers/cuda/driver_cuda.c
  39. 3 2
      src/drivers/driver_common/driver_common.c
  40. 12 4
      src/drivers/opencl/driver_opencl.c
  41. 25 15
      src/profiling/profiling.c
  42. 1 0
      tests/Makefile.am
  43. 1 1
      tests/loader.c
  44. 92 0
      tests/main/deploop.c
  45. 1 1
      tests/microbenchs/tasks_overhead.c
  46. 19 30
      tools/valgrind/starpu.suppr

+ 3 - 0
ChangeLog

@@ -116,6 +116,9 @@ Small features:
   * File STARPU-REVISION --- containing the SVN revision number from which
   * File STARPU-REVISION --- containing the SVN revision number from which
     StarPU was compiled --- is installed in the share/doc/starpu directory
     StarPU was compiled --- is installed in the share/doc/starpu directory
   * starpu_perfmodel_plot can now directly draw GFlops curves.
   * starpu_perfmodel_plot can now directly draw GFlops curves.
+  * New configure option --enable-mpi-progression-hook to enable the
+    activity polling method for StarPU-MPI.
+  * Permit to disable sequential consistency for a given task.
 
 
 Changes:
 Changes:
   * Fix the block filter functions.
   * Fix the block filter functions.

+ 8 - 0
configure.ac

@@ -221,6 +221,7 @@ AM_CONDITIONAL([STARPU_LONG_CHECK], [test "x$enable_long_check" = "xyes"])
 AC_CHECK_HEADERS([malloc.h], [AC_DEFINE([STARPU_HAVE_MALLOC_H], [1], [Define to 1 if you have the <malloc.h> header file.])])
 AC_CHECK_HEADERS([malloc.h], [AC_DEFINE([STARPU_HAVE_MALLOC_H], [1], [Define to 1 if you have the <malloc.h> header file.])])
 
 
 AC_CHECK_HEADERS([valgrind/valgrind.h], [AC_DEFINE([STARPU_HAVE_VALGRIND_H], [1], [Define to 1 if you have the <valgrind/valgrind.h> header file.])])
 AC_CHECK_HEADERS([valgrind/valgrind.h], [AC_DEFINE([STARPU_HAVE_VALGRIND_H], [1], [Define to 1 if you have the <valgrind/valgrind.h> header file.])])
+AC_CHECK_HEADERS([valgrind/helgrind.h], [AC_DEFINE([STARPU_HAVE_HELGRIND_H], [1], [Define to 1 if you have the <valgrind/helgrind.h> header file.])])
 
 
 # This defines HAVE_SYNC_VAL_COMPARE_AND_SWAP
 # This defines HAVE_SYNC_VAL_COMPARE_AND_SWAP
 STARPU_CHECK_SYNC_VAL_COMPARE_AND_SWAP
 STARPU_CHECK_SYNC_VAL_COMPARE_AND_SWAP
@@ -1234,6 +1235,13 @@ if test x$use_mpi = xyes; then
 	AC_DEFINE(STARPU_USE_MPI,[],[whether the StarPU MPI library is available])
 	AC_DEFINE(STARPU_USE_MPI,[],[whether the StarPU MPI library is available])
 fi
 fi
 
 
+AC_ARG_ENABLE(mpi-progression-hook, [AS_HELP_STRING([--enable-mpi-progression-hook],
+				   [Enable StarPU MPI activity polling method])],
+				   enable_mpi_progression_hook=$enableval, enable_mpi_progression_hook=no)
+if  test x$enable_mpi_progression_hook = xyes; then
+	AC_DEFINE(STARPU_MPI_ACTIVITY, [1], [enable StarPU MPI activity polling method])
+fi
+
 ###############################################################################
 ###############################################################################
 #                                                                             #
 #                                                                             #
 #                               StarPU-Top                                    #
 #                               StarPU-Top                                    #

+ 7 - 1
doc/chapters/basic-api.texi

@@ -1805,9 +1805,15 @@ contained in the @code{tag_id} field. Tag allow the application to synchronize
 with the task and to express task dependencies easily.
 with the task and to express task dependencies easily.
 
 
 @item @code{starpu_tag_t tag_id}
 @item @code{starpu_tag_t tag_id}
-This fields contains the tag associated to the task if the @code{use_tag} field
+This field contains the tag associated to the task if the @code{use_tag} field
 was set, it is ignored otherwise.
 was set, it is ignored otherwise.
 
 
+@item @code{unsigned sequential_consistency}
+If this flag is set (which is the default), sequential consistency is enforced
+for the data parameters of this task for which sequential consistency is
+enabled. Clearing this flag permits to disable sequential consistency for this
+task, even if data have it enabled.
+
 @item @code{unsigned synchronous}
 @item @code{unsigned synchronous}
 If this flag is set, the @code{starpu_task_submit} function is blocking and
 If this flag is set, the @code{starpu_task_submit} function is blocking and
 returns only when the task has been executed (or if no worker is able to
 returns only when the task has been executed (or if no worker is able to

+ 5 - 1
doc/chapters/configuration.texi

@@ -209,10 +209,14 @@ enabled when the GCC compiler provides a plug-in support.
 @end defvr
 @end defvr
 
 
 @defvr {Configure option} --with-mpicc=@var{path}
 @defvr {Configure option} --with-mpicc=@var{path}
-Use the @command{mpicc} compiler at @var{path}, for starpumpi
+Use the @command{mpicc} compiler at @var{path}, for StarPU-MPI.
 (@pxref{StarPU MPI support}).
 (@pxref{StarPU MPI support}).
 @end defvr
 @end defvr
 
 
+@defvr {Configure option} --enable-mpi-progression-hook
+Enable the activity polling method for StarPU-MPI.
+@end defvr
+
 @node Advanced configuration
 @node Advanced configuration
 @subsection Advanced configuration
 @subsection Advanced configuration
 
 

+ 6 - 5
examples/tag_example/tag_restartable.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -106,7 +106,7 @@ static int start_task_grid(unsigned iter)
 	return 0;
 	return 0;
 }
 }
 
 
-void cpu_codelet(void *descr[], void *_args __attribute__((unused)))
+void cpu_codelet(void *descr[] __attribute__((unused)), void *_args __attribute__((unused)))
 {
 {
 /*	int i = (uintptr_t) _args;
 /*	int i = (uintptr_t) _args;
 	printf("doing %x\n", i);
 	printf("doing %x\n", i);
@@ -117,7 +117,7 @@ void cpu_codelet(void *descr[], void *_args __attribute__((unused)))
 
 
 int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 {
 {
-	unsigned i;
+	unsigned i, j;
 	int ret;
 	int ret;
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
@@ -161,8 +161,9 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 	FPRINTF(stderr, "TEST DONE ...\n");
 	FPRINTF(stderr, "TEST DONE ...\n");
 
 
 enodev:
 enodev:
-	for (i = 0; i < Nrolls; i++)
-	{
+	for (i = 0; i < Nrolls; i++) {
+		for (j = 0; j < ni; j++)
+			starpu_task_destroy(tasks[i][j]);
 		free(tasks[i]);
 		free(tasks[i]);
 	}
 	}
 
 

+ 9 - 2
include/starpu_profiling.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -101,8 +101,15 @@ int starpu_profiling_status_set(int status);
  * error. */
  * error. */
 int starpu_profiling_status_get(void);
 int starpu_profiling_status_get(void);
 #ifdef BUILDING_STARPU
 #ifdef BUILDING_STARPU
+#include <common/utils.h>
 extern int _starpu_profiling;
 extern int _starpu_profiling;
-#define starpu_profiling_status_get() _starpu_profiling
+#define starpu_profiling_status_get() ({ \
+	int __ret; \
+	ANNOTATE_HAPPENS_AFTER(&_starpu_profiling); \
+	__ret = _starpu_profiling; \
+	ANNOTATE_HAPPENS_BEFORE(&_starpu_profiling); \
+	__ret; \
+})
 #endif
 #endif
 
 
 /* Get the profiling info associated to a worker, and reset the profiling
 /* Get the profiling info associated to a worker, and reset the profiling

+ 5 - 0
include/starpu_task.h

@@ -129,9 +129,14 @@ struct starpu_task
 	void (*callback_func)(void *);
 	void (*callback_func)(void *);
 	void *callback_arg;
 	void *callback_arg;
 
 
+	/* Whether tag_id should be considered */
 	unsigned use_tag;
 	unsigned use_tag;
+	/* Tag associated with this task */
 	starpu_tag_t tag_id;
 	starpu_tag_t tag_id;
 
 
+	/* Whether we should enforce sequential consistency for this task */
+	unsigned sequential_consistency;
+
 	/* options for the task execution */
 	/* options for the task execution */
 	unsigned synchronous; /* if set, a call to push is blocking */
 	unsigned synchronous; /* if set, a call to push is blocking */
 	int priority; /* STARPU_MAX_PRIO = most important; STARPU_MIN_PRIO = least important */
 	int priority; /* STARPU_MAX_PRIO = most important; STARPU_MIN_PRIO = least important */

+ 9 - 1
include/starpu_util.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -186,6 +186,14 @@ STARPU_ATOMIC_SOMETHING(or, old | value)
 #define STARPU_WMB() STARPU_SYNCHRONIZE()
 #define STARPU_WMB() STARPU_SYNCHRONIZE()
 #endif
 #endif
 
 
+/* This is needed in some places to make valgrind yield to another thread to be
+ * able to progress.  */
+#if defined(__i386__) || defined(__x86_64__)
+#define STARPU_UYIELD() __asm__ __volatile("rep; nop")
+#else
+#define STARPU_UYIELD() ((void)0)
+#endif
+
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }
 #endif
 #endif

+ 6 - 6
mpi/examples/Makefile.am

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 #
-# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2009-2013  Université de Bordeaux 1
 # Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 # Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
@@ -88,7 +88,7 @@ examplebin_PROGRAMS +=				\
 	stencil/stencil5
 	stencil/stencil5
 
 
 stencil_stencil5_LDADD =		\
 stencil_stencil5_LDADD =		\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la -lm
 
 
 starpu_mpi_EXAMPLES	+=	\
 starpu_mpi_EXAMPLES	+=	\
 	stencil/stencil5
 	stencil/stencil5
@@ -106,7 +106,7 @@ examplebin_PROGRAMS += 			\
 mpi_lu_plu_example_float_LDADD =	\
 mpi_lu_plu_example_float_LDADD =	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	$(STARPU_LIBNUMA_LDFLAGS)				\
 	$(STARPU_LIBNUMA_LDFLAGS)				\
-	$(STARPU_BLAS_LDFLAGS)
+	$(STARPU_BLAS_LDFLAGS) -lm
 
 
 mpi_lu_plu_example_float_SOURCES =	\
 mpi_lu_plu_example_float_SOURCES =	\
 	mpi_lu/plu_example_float.c	\
 	mpi_lu/plu_example_float.c	\
@@ -118,7 +118,7 @@ mpi_lu_plu_example_float_SOURCES =	\
 mpi_lu_plu_example_double_LDADD =	\
 mpi_lu_plu_example_double_LDADD =	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	$(STARPU_LIBNUMA_LDFLAGS)				\
 	$(STARPU_LIBNUMA_LDFLAGS)				\
-	$(STARPU_BLAS_LDFLAGS)
+	$(STARPU_BLAS_LDFLAGS) -lm
 
 
 mpi_lu_plu_example_double_SOURCES =	\
 mpi_lu_plu_example_double_SOURCES =	\
 	mpi_lu/plu_example_double.c	\
 	mpi_lu/plu_example_double.c	\
@@ -148,7 +148,7 @@ matrix_decomposition_mpi_cholesky_SOURCES	=		\
 
 
 matrix_decomposition_mpi_cholesky_LDADD =			\
 matrix_decomposition_mpi_cholesky_LDADD =			\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
-	$(STARPU_BLAS_LDFLAGS)
+	$(STARPU_BLAS_LDFLAGS) -lm
 
 
 matrix_decomposition_mpi_cholesky_distributed_SOURCES =	\
 matrix_decomposition_mpi_cholesky_distributed_SOURCES =	\
 	matrix_decomposition/mpi_cholesky_distributed.c	\
 	matrix_decomposition/mpi_cholesky_distributed.c	\
@@ -161,7 +161,7 @@ matrix_decomposition_mpi_cholesky_distributed_SOURCES =	\
 
 
 matrix_decomposition_mpi_cholesky_distributed_LDADD =	\
 matrix_decomposition_mpi_cholesky_distributed_LDADD =	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
-	$(STARPU_BLAS_LDFLAGS)
+	$(STARPU_BLAS_LDFLAGS) -lm
 
 
 starpu_mpi_EXAMPLES +=				\
 starpu_mpi_EXAMPLES +=				\
 	matrix_decomposition/mpi_cholesky			\
 	matrix_decomposition/mpi_cholesky			\

+ 5 - 3
mpi/examples/stencil/stencil5.c

@@ -37,12 +37,14 @@ struct starpu_codelet stencil5_cl =
 };
 };
 
 
 #ifdef STARPU_QUICK_CHECK
 #ifdef STARPU_QUICK_CHECK
-#  define NITER_DEF	10
+#  define NITER_DEF	5
+#  define X         	3
+#  define Y         	3
 #else
 #else
 #  define NITER_DEF	500
 #  define NITER_DEF	500
+#  define X         	20
+#  define Y         	20
 #endif
 #endif
-#define X         20
-#define Y         20
 
 
 int display = 0;
 int display = 0;
 int niter = NITER_DEF;
 int niter = NITER_DEF;

+ 18 - 18
mpi/src/starpu_mpi.c

@@ -23,11 +23,7 @@
 #include <starpu_profiling.h>
 #include <starpu_profiling.h>
 #include <starpu_mpi_stats.h>
 #include <starpu_mpi_stats.h>
 #include <starpu_mpi_insert_task.h>
 #include <starpu_mpi_insert_task.h>
-
-#ifdef STARPU_DEVEL
-#  warning TODO find a better way to select the polling method (perhaps during the configuration)
-#endif
-//#define USE_STARPU_ACTIVITY	1
+#include <common/config.h>
 
 
 static void _starpu_mpi_submit_new_mpi_request(void *arg);
 static void _starpu_mpi_submit_new_mpi_request(void *arg);
 static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req);
 static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req);
@@ -643,6 +639,7 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 		MPI_Status status;
 		MPI_Status status;
 		memset(&status, 0, sizeof(MPI_Status));
 		memset(&status, 0, sizeof(MPI_Status));
 		req->ret = MPI_Recv(req->ptr, req->count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &status);
 		req->ret = MPI_Recv(req->ptr, req->count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &status);
+		STARPU_ASSERT(req->ret == MPI_SUCCESS);
 	}
 	}
 
 
 	if (req->request_type == RECV_REQ || req->request_type == SEND_REQ || req->request_type == PROBE_REQ)
 	if (req->request_type == RECV_REQ || req->request_type == SEND_REQ || req->request_type == PROBE_REQ)
@@ -699,7 +696,7 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 	_STARPU_MPI_LOG_OUT();
 	_STARPU_MPI_LOG_OUT();
 }
 }
 
 
-#ifdef USE_STARPU_ACTIVITY
+#ifdef STARPU_MPI_ACTIVITY
 static unsigned _starpu_mpi_progression_hook_func(void *arg __attribute__((unused)))
 static unsigned _starpu_mpi_progression_hook_func(void *arg __attribute__((unused)))
 {
 {
 	unsigned may_block = 1;
 	unsigned may_block = 1;
@@ -714,7 +711,7 @@ static unsigned _starpu_mpi_progression_hook_func(void *arg __attribute__((unuse
 
 
 	return may_block;
 	return may_block;
 }
 }
-#endif
+#endif /* STARPU_MPI_ACTIVITY */
 
 
 static void _starpu_mpi_test_detached_requests(void)
 static void _starpu_mpi_test_detached_requests(void)
 {
 {
@@ -885,9 +882,9 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		/* shall we block ? */
 		/* shall we block ? */
 		unsigned block = _starpu_mpi_req_list_empty(new_requests);
 		unsigned block = _starpu_mpi_req_list_empty(new_requests);
 
 
-#ifndef USE_STARPU_ACTIVITY
+#ifndef STARPU_MPI_ACTIVITY
 		block = block && _starpu_mpi_req_list_empty(detached_requests);
 		block = block && _starpu_mpi_req_list_empty(detached_requests);
-#endif
+#endif /* STARPU_MPI_ACTIVITY */
 
 
 		if (block)
 		if (block)
 		{
 		{
@@ -946,20 +943,22 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 /*                                                      */
 /*                                                      */
 /********************************************************/
 /********************************************************/
 
 
-#ifdef USE_STARPU_ACTIVITY
+#ifdef STARPU_MPI_ACTIVITY
 static int hookid = - 1;
 static int hookid = - 1;
-#endif
+#endif /* STARPU_MPI_ACTIVITY */
 
 
 static void _starpu_mpi_add_sync_point_in_fxt(void)
 static void _starpu_mpi_add_sync_point_in_fxt(void)
 {
 {
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
 	int rank;
 	int rank;
 	int worldsize;
 	int worldsize;
+	int ret;
+
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
 	MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
 
 
-	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
-	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+	ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
 
 
 	/* We generate a "unique" key so that we can make sure that different
 	/* We generate a "unique" key so that we can make sure that different
 	 * FxT traces come from the same MPI run. */
 	 * FxT traces come from the same MPI run. */
@@ -973,7 +972,8 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 		random_number = rand();
 		random_number = rand();
 	}
 	}
 
 
-	MPI_Bcast(&random_number, 1, MPI_INT, 0, MPI_COMM_WORLD);
+	ret = MPI_Bcast(&random_number, 1, MPI_INT, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
 
 
 	TRACE_MPI_BARRIER(rank, worldsize, random_number);
 	TRACE_MPI_BARRIER(rank, worldsize, random_number);
 
 
@@ -1006,10 +1006,10 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi)
 		_STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
 		_STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
 
-#ifdef USE_STARPU_ACTIVITY
+#ifdef STARPU_MPI_ACTIVITY
 	hookid = starpu_progression_hook_register(progression_hook_func, NULL);
 	hookid = starpu_progression_hook_register(progression_hook_func, NULL);
 	STARPU_ASSERT(hookid >= 0);
 	STARPU_ASSERT(hookid >= 0);
-#endif
+#endif /* STARPU_MPI_ACTIVITY */
 
 
 	_starpu_mpi_add_sync_point_in_fxt();
 	_starpu_mpi_add_sync_point_in_fxt();
 	_starpu_mpi_comm_amounts_init(MPI_COMM_WORLD);
 	_starpu_mpi_comm_amounts_init(MPI_COMM_WORLD);
@@ -1058,9 +1058,9 @@ int starpu_mpi_shutdown(void)
 
 
 	pthread_join(progress_thread, &value);
 	pthread_join(progress_thread, &value);
 
 
-#ifdef USE_STARPU_ACTIVITY
+#ifdef STARPU_MPI_ACTIVITY
 	starpu_progression_hook_deregister(hookid);
 	starpu_progression_hook_deregister(hookid);
-#endif
+#endif /* STARPU_MPI_ACTIVITY */
 
 
 	TRACE_MPI_STOP(rank, world_size);
 	TRACE_MPI_STOP(rank, world_size);
 
 

+ 25 - 35
mpi/src/starpu_mpi_collective.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -42,26 +42,23 @@ int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, i
 {
 {
 	int rank;
 	int rank;
 	int x;
 	int x;
-	struct _callback_arg *callback_arg;
-	void (*callback_func)(void *);
+	struct _callback_arg *callback_arg = NULL;
+	void (*callback_func)(void *) = NULL;
+	void (*callback)(void *);
 
 
 	MPI_Comm_rank(comm, &rank);
 	MPI_Comm_rank(comm, &rank);
 
 
-	callback_func = _callback_collective;
-	callback_arg = malloc(sizeof(struct _callback_arg));
-	callback_arg->count = 0;
-	callback_arg->nb = 0;
-	callback_arg->callback = (rank == root) ? scallback : rcallback;
-	callback_arg->arg = (rank == root) ? sarg : rarg;
-	if (callback_arg->callback == NULL)
+	callback = (rank == root) ? scallback : rcallback;
+	if (callback)
 	{
 	{
-		free(callback_arg);
-		callback_arg = NULL;
-		callback_func = NULL;
-	}
+		callback_func = _callback_collective;
+		callback_arg = malloc(sizeof(struct _callback_arg));
+		callback_arg->count = 0;
+		callback_arg->nb = 0;
+		callback_arg->callback = (rank == root) ? scallback : rcallback;
+		callback_arg->arg = (rank == root) ? sarg : rarg;
+		if (callback_arg->callback == NULL)
 
 
-	if (callback_arg)
-	{
 		for(x = 0; x < count ; x++)
 		for(x = 0; x < count ; x++)
 		{
 		{
 			if (data_handles[x])
 			if (data_handles[x])
@@ -107,29 +104,23 @@ int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, in
 {
 {
 	int rank;
 	int rank;
 	int x;
 	int x;
-	struct _callback_arg *callback_arg;
-	void (*callback_func)(void *);
+	struct _callback_arg *callback_arg = NULL;
+	void (*callback_func)(void *) = NULL;
+	void (*callback)(void *);
 
 
 	MPI_Comm_rank(comm, &rank);
 	MPI_Comm_rank(comm, &rank);
 
 
-#ifdef STARPU_DEVEL
-#warning TODO: callback_arg needs to be free-ed
-#endif
-	callback_func = _callback_collective;
-	callback_arg = malloc(sizeof(struct _callback_arg));
-	callback_arg->count = 0;
-	callback_arg->nb = 0;
-	callback_arg->callback = (rank == root) ? scallback : rcallback;
-	callback_arg->arg = (rank == root) ? sarg : rarg;
-	if (callback_arg->callback == NULL)
+	callback = (rank == root) ? scallback : rcallback;
+	if (callback)
 	{
 	{
-		free(callback_arg);
-		callback_arg = NULL;
-		callback_func = NULL;
-	}
+		callback_func = _callback_collective;
+
+		callback_arg = malloc(sizeof(struct _callback_arg));
+		callback_arg->count = 0;
+		callback_arg->nb = 0;
+		callback_arg->callback = callback;
+		callback_arg->arg = (rank == root) ? sarg : rarg;
 
 
-	if (callback_arg)
-	{
 		for(x = 0; x < count ; x++)
 		for(x = 0; x < count ; x++)
 		{
 		{
 			if (data_handles[x])
 			if (data_handles[x])
@@ -170,4 +161,3 @@ int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, in
 	}
 	}
 	return 0;
 	return 0;
 }
 }
-

+ 3 - 0
mpi/tests/mpi_probe.c

@@ -45,6 +45,8 @@ void callback(void *arg __attribute__((unused)))
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
+	return 77;
+	/*
 	int ret, rank, size;
 	int ret, rank, size;
 
 
 	MPI_Init(NULL, NULL);
 	MPI_Init(NULL, NULL);
@@ -99,4 +101,5 @@ int main(int argc, char **argv)
 	MPI_Finalize();
 	MPI_Finalize();
 
 
 	return 0;
 	return 0;
+	*/
 }
 }

+ 8 - 8
mpi/tests/user_defined_datatype.c

@@ -24,36 +24,36 @@
 #  define ELEMENTS 1000
 #  define ELEMENTS 1000
 #endif
 #endif
 
 
-typedef void (*test_func)(starpu_data_handle_t *, int, int);
+typedef void (*test_func)(starpu_data_handle_t *, int, int, int);
 
 
-void test_handle_irecv_isend_detached(starpu_data_handle_t *handles, int nb_handles, int rank)
+void test_handle_irecv_isend_detached(starpu_data_handle_t *handles, int nb_handles, int rank, int tag)
 {
 {
 	int i;
 	int i;
 
 
 	for(i=0 ; i<nb_handles ; i++)
 	for(i=0 ; i<nb_handles ; i++)
 	{
 	{
 		starpu_data_set_rank(handles[i], 1);
 		starpu_data_set_rank(handles[i], 1);
-		starpu_data_set_tag(handles[i], i+100);
+		starpu_data_set_tag(handles[i], i+tag);
 	}
 	}
 
 
 	for(i=0 ; i<nb_handles ; i++)
 	for(i=0 ; i<nb_handles ; i++)
 		starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, handles[i], 0, NULL, NULL);
 		starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, handles[i], 0, NULL, NULL);
 }
 }
 
 
-void test_handle_recv_send(starpu_data_handle_t *handles, int nb_handles, int rank)
+void test_handle_recv_send(starpu_data_handle_t *handles, int nb_handles, int rank, int tag)
 {
 {
 	int i;
 	int i;
 
 
 	if (rank == 1)
 	if (rank == 1)
 	{
 	{
 		for(i=0 ; i<nb_handles ; i++)
 		for(i=0 ; i<nb_handles ; i++)
-			starpu_mpi_send(handles[i], 0, i+100, MPI_COMM_WORLD);
+			starpu_mpi_send(handles[i], 0, i+tag, MPI_COMM_WORLD);
 	}
 	}
 	else if (rank == 0)
 	else if (rank == 0)
 	{
 	{
 		MPI_Status statuses[nb_handles];
 		MPI_Status statuses[nb_handles];
 		for(i=0 ; i<nb_handles ; i++)
 		for(i=0 ; i<nb_handles ; i++)
-			starpu_mpi_recv(handles[i], 1, i+100, MPI_COMM_WORLD, &statuses[i]);
+			starpu_mpi_recv(handles[i], 1, i+tag, MPI_COMM_WORLD, &statuses[i]);
 	}
 	}
 }
 }
 
 
@@ -126,8 +126,8 @@ int main(int argc, char **argv)
 				starpu_variable_data_register(&handle_vars[i], 0, (uintptr_t)&foo[i], sizeof(double));
 				starpu_variable_data_register(&handle_vars[i], 0, (uintptr_t)&foo[i], sizeof(double));
 			}
 			}
 
 
-			f(handle_vars, ELEMENTS, rank);
-			f(handle_complex, ELEMENTS, rank);
+			f(handle_vars, ELEMENTS, rank, ELEMENTS);
+			f(handle_complex, ELEMENTS, rank, 4*ELEMENTS);
 
 
 			for(i=0 ; i<ELEMENTS ; i++)
 			for(i=0 ; i<ELEMENTS ; i++)
 			{
 			{

+ 108 - 30
sched_ctx_hypervisor/src/hypervisor_policies/lp2_policy.c

@@ -21,8 +21,10 @@
 static struct bound_task_pool *task_pools = NULL;
 static struct bound_task_pool *task_pools = NULL;
 
 
 static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
 static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
-static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double tmax, double w_in_s[ns][nw], int *in_sched_ctxs, int *workers, unsigned interger);
-static unsigned _compute_task_distribution_over_ctxs(int ns, int nw, int nt, double w_in_s[ns][nw], double tasks[nw][nt], int *sched_ctxs, int *workers)
+static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double tmax, double w_in_s[ns][nw], int *in_sched_ctxs, int *workers, unsigned interger,
+			   struct bound_task_pool *tmp_task_pools, unsigned size_ctxs);
+static unsigned _compute_task_distribution_over_ctxs(int ns, int nw, int nt, double w_in_s[ns][nw], double tasks[nw][nt], 
+						     int *sched_ctxs, int *workers, struct bound_task_pool *tmp_task_pools, unsigned size_ctxs)
 {
 {
 	double draft_tasks[nw][nt];
 	double draft_tasks[nw][nt];
 	double draft_w_in_s[ns][nw];
 	double draft_w_in_s[ns][nw];
@@ -45,7 +47,7 @@ static unsigned _compute_task_distribution_over_ctxs(int ns, int nw, int nt, dou
 	/* smallest possible tmax, difficult to obtain as we
 	/* smallest possible tmax, difficult to obtain as we
 	   compute the nr of flops and not the tasks */
 	   compute the nr of flops and not the tasks */
 	double possible_tmax = _lp_get_tmax(nw, workers);
 	double possible_tmax = _lp_get_tmax(nw, workers);
-	double smallest_tmax = possible_tmax / 2;
+	double smallest_tmax = possible_tmax / 3;
 	double tmax = possible_tmax * ns;
 	double tmax = possible_tmax * ns;
 	double res = 1.0;
 	double res = 1.0;
 	unsigned has_sol = 0;
 	unsigned has_sol = 0;
@@ -53,6 +55,7 @@ static unsigned _compute_task_distribution_over_ctxs(int ns, int nw, int nt, dou
 	double old_tmax = 0.0;
 	double old_tmax = 0.0;
 	unsigned found_sol = 0;
 	unsigned found_sol = 0;
 
 
+//	printf("tmin = %lf tmax = %lf \n", tmin, tmax);
 	struct timeval start_time;
 	struct timeval start_time;
 	struct timeval end_time;
 	struct timeval end_time;
 	int nd = 0;
 	int nd = 0;
@@ -65,7 +68,7 @@ static unsigned _compute_task_distribution_over_ctxs(int ns, int nw, int nt, dou
 		/* find solution and save the values in draft tables
 		/* find solution and save the values in draft tables
 		   only if there is a solution for the system we save them
 		   only if there is a solution for the system we save them
 		   in the proper table */
 		   in the proper table */
-		res = _glp_resolve(ns, nw, nt, draft_tasks, tmax, draft_w_in_s, sched_ctxs, workers, 1);
+		res = _glp_resolve(ns, nw, nt, draft_tasks, tmax, draft_w_in_s, sched_ctxs, workers, 1, tmp_task_pools, size_ctxs);
 		if(res != 0.0)
 		if(res != 0.0)
 		{
 		{
 			for(w = 0; w < nw; w++)
 			for(w = 0; w < nw; w++)
@@ -129,7 +132,7 @@ static void _size_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nwor
 
 
 	double w_in_s[ns][nw];
 	double w_in_s[ns][nw];
 	double tasks[nw][nt];
 	double tasks[nw][nt];
-	unsigned found_sol = _compute_task_distribution_over_ctxs(ns, nw, nt, w_in_s, tasks, sched_ctxs, workers);
+	unsigned found_sol = _compute_task_distribution_over_ctxs(ns, nw, nt, w_in_s, tasks, sched_ctxs, workers, task_pools, 1);
 	pthread_mutex_unlock(&mutex);
 	pthread_mutex_unlock(&mutex);
 	/* if we did find at least one solution redistribute the resources */
 	/* if we did find at least one solution redistribute the resources */
 	if(found_sol)
 	if(found_sol)
@@ -194,7 +197,6 @@ static void lp2_handle_submitted_job(struct starpu_task *task, uint32_t footprin
 static void _remove_task_from_pool(struct starpu_task *task, uint32_t footprint)
 static void _remove_task_from_pool(struct starpu_task *task, uint32_t footprint)
 {
 {
 	/* count the tasks of the same type */
 	/* count the tasks of the same type */
-	pthread_mutex_lock(&mutex);
 	struct bound_task_pool *tp = NULL;
 	struct bound_task_pool *tp = NULL;
 
 
 	for (tp = task_pools; tp; tp = tp->next)
 	for (tp = task_pools; tp; tp = tp->next)
@@ -209,20 +211,36 @@ static void _remove_task_from_pool(struct starpu_task *task, uint32_t footprint)
 			tp->n--;
 			tp->n--;
 		else
 		else
 		{
 		{
-			struct bound_task_pool *prev_tp = NULL;
-			for (prev_tp = task_pools; prev_tp; prev_tp = prev_tp->next)
+			if(tp == task_pools)
 			{
 			{
-				if (prev_tp->next == tp)
-					prev_tp->next = tp->next;
+				struct bound_task_pool *next_tp = NULL;
+				if(task_pools->next)
+					next_tp = task_pools->next;
+
+				free(tp);
+				tp = NULL;
+				
+				if(next_tp)
+					task_pools = next_tp;
+				
+			}
+			else
+			{
+				struct bound_task_pool *prev_tp = NULL;
+				for (prev_tp = task_pools; prev_tp; prev_tp = prev_tp->next)
+				{
+					if (prev_tp->next == tp)
+						prev_tp->next = tp->next;
+				}
+				
+				free(tp);
+				tp = NULL;
 			}
 			}
-
-			free(tp);
 		}
 		}
 	}
 	}
-	pthread_mutex_unlock(&mutex);
 }
 }
 
 
-static void _get_tasks_times(int nw, int nt, double times[nw][nt], int *workers)
+static void _get_tasks_times(int nw, int nt, double times[nw][nt], int *workers, unsigned size_ctxs)
 {
 {
         struct bound_task_pool *tp;
         struct bound_task_pool *tp;
         int w, t;
         int w, t;
@@ -230,14 +248,33 @@ static void _get_tasks_times(int nw, int nt, double times[nw][nt], int *workers)
         {
         {
                 for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
                 for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
                 {
                 {
-                        enum starpu_perf_archtype arch = workers == NULL ? starpu_worker_get_perf_archtype(w) :
-				starpu_worker_get_perf_archtype(workers[w]);
+			int worker = workers == NULL ? w : workers[w];
+                        enum starpu_perf_archtype arch = starpu_worker_get_perf_archtype(worker);
                         double length = starpu_history_based_expected_perf(tp->cl->model, arch, tp->footprint);
                         double length = starpu_history_based_expected_perf(tp->cl->model, arch, tp->footprint);
 
 
                         if (isnan(length))
                         if (isnan(length))
                                 times[w][t] = NAN;
                                 times[w][t] = NAN;
-                       else
+			else
+			{
                                 times[w][t] = length / 1000.;
                                 times[w][t] = length / 1000.;
+
+				double transfer_time = 0.0;
+				enum starpu_archtype arch = starpu_worker_get_type(worker);
+				if(arch == STARPU_CUDA_WORKER)
+				{
+					unsigned worker_in_ctx = starpu_sched_ctx_contains_worker(worker, tp->sched_ctx_id);
+					if(!worker_in_ctx && !size_ctxs)
+					{
+						double transfer_velocity = starpu_get_bandwidth_RAM_CUDA(worker);
+						transfer_time +=  (tp->footprint / transfer_velocity) / 1000. ;
+					}
+					double latency = starpu_get_latency_RAM_CUDA(worker);
+					transfer_time += latency/1000.;
+
+				}
+//				printf("%d/%d %s x %d time = %lf transfer_time = %lf\n", w, tp->sched_ctx_id, tp->cl->model->symbol, tp->n, times[w][t], transfer_time);
+				times[w][t] += transfer_time;
+			}
                 }
                 }
         }
         }
 }
 }
@@ -247,9 +284,10 @@ static void _get_tasks_times(int nw, int nt, double times[nw][nt], int *workers)
  */
  */
 #ifdef STARPU_HAVE_GLPK_H
 #ifdef STARPU_HAVE_GLPK_H
 #include <glpk.h>
 #include <glpk.h>
-static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double tmax, double w_in_s[ns][nw], int *in_sched_ctxs, int *workers, unsigned integer)
+static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double tmax, double w_in_s[ns][nw], int *in_sched_ctxs, int *workers, unsigned integer,
+			   struct bound_task_pool *tmp_task_pools, unsigned size_ctxs)
 {
 {
-	if(task_pools == NULL)
+	if(tmp_task_pools == NULL)
 		return 0.0;
 		return 0.0;
 	struct bound_task_pool * tp;
 	struct bound_task_pool * tp;
 	int t, w, s;
 	int t, w, s;
@@ -270,7 +308,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 		int ia[ne], ja[ne];
 		int ia[ne], ja[ne];
 		double ar[ne];
 		double ar[ne];
 
 
-		_get_tasks_times(nw, nt, times, workers);
+		_get_tasks_times(nw, nt, times, workers, size_ctxs);
 
 
 		/* Variables: number of tasks i assigned to worker j, and tmax */
 		/* Variables: number of tasks i assigned to worker j, and tmax */
 		glp_add_cols(lp, nw*nt+ns*nw);
 		glp_add_cols(lp, nw*nt+ns*nw);
@@ -280,7 +318,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 				glp_set_obj_coef(lp, nw*nt+s*nw+w+1, 1.);
 				glp_set_obj_coef(lp, nw*nt+s*nw+w+1, 1.);
 
 
 		for (w = 0; w < nw; w++)
 		for (w = 0; w < nw; w++)
-			for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+			for (t = 0; t < nt; t++)
 			{
 			{
 				char name[32];
 				char name[32];
 				snprintf(name, sizeof(name), "w%dt%dn", w, t);
 				snprintf(name, sizeof(name), "w%dt%dn", w, t);
@@ -313,7 +351,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 		int curr_row_idx = 0;
 		int curr_row_idx = 0;
 		/* Total worker execution time */
 		/* Total worker execution time */
 		glp_add_rows(lp, nw*ns);
 		glp_add_rows(lp, nw*ns);
-		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+		for (t = 0; t < nt; t++)
 		{
 		{
 			int someone = 0;
 			int someone = 0;
 			for (w = 0; w < nw; w++)
 			for (w = 0; w < nw; w++)
@@ -336,7 +374,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 				starpu_worker_get_name(w, name, sizeof(name));
 				starpu_worker_get_name(w, name, sizeof(name));
 				snprintf(title, sizeof(title), "worker %s", name);
 				snprintf(title, sizeof(title), "worker %s", name);
 				glp_set_row_name(lp, curr_row_idx+s*nw+w+1, title);
 				glp_set_row_name(lp, curr_row_idx+s*nw+w+1, title);
-				for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+				for (t = 0, tp = tmp_task_pools; tp; t++, tp = tp->next)
 				{
 				{
 					if((int)tp->sched_ctx_id == sched_ctxs[s])
 					if((int)tp->sched_ctx_id == sched_ctxs[s])
 					{
 					{
@@ -362,7 +400,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 
 
 		/* Total task completion */
 		/* Total task completion */
 		glp_add_rows(lp, nt);
 		glp_add_rows(lp, nt);
-		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+		for (t = 0, tp = tmp_task_pools; tp; t++, tp = tp->next)
 		{
 		{
 			char name[32], title[64];
 			char name[32], title[64];
 			starpu_worker_get_name(w, name, sizeof(name));
 			starpu_worker_get_name(w, name, sizeof(name));
@@ -411,6 +449,12 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 	glp_init_smcp(&parm);
 	glp_init_smcp(&parm);
 	parm.msg_lev = GLP_MSG_OFF;
 	parm.msg_lev = GLP_MSG_OFF;
 	int ret = glp_simplex(lp, &parm);
 	int ret = glp_simplex(lp, &parm);
+
+/* 	char str[50]; */
+/* 	sprintf(str, "outpu_lp_%g", tmax); */
+
+/* 	glp_print_sol(lp, str); */
+
 	if (ret)
 	if (ret)
 	{
 	{
 		printf("error in simplex\n");
 		printf("error in simplex\n");
@@ -449,7 +493,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 
 
 	double res = glp_get_obj_val(lp);
 	double res = glp_get_obj_val(lp);
 	for (w = 0; w < nw; w++)
 	for (w = 0; w < nw; w++)
-		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+		for (t = 0; t < nt; t++)
 /* 			if (integer) */
 /* 			if (integer) */
 /* 				tasks[w][t] = (double)glp_mip_col_val(lp, colnum(w, t)); */
 /* 				tasks[w][t] = (double)glp_mip_col_val(lp, colnum(w, t)); */
 /*                         else */
 /*                         else */
@@ -471,10 +515,18 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 	return res;
 	return res;
 }
 }
 
 
+static struct bound_task_pool* _clone_linked_list(struct bound_task_pool *tp)
+{
+	if(tp == NULL) return NULL;
+
+	struct bound_task_pool *tmp_tp = (struct bound_task_pool*)malloc(sizeof(struct bound_task_pool));
+	memcpy(tmp_tp, tp, sizeof(struct bound_task_pool));
+	tmp_tp->next = _clone_linked_list(tp->next);
+	return tmp_tp;
+}
 
 
 static void lp2_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
 static void lp2_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
 {
 {
-	_remove_task_from_pool(task, footprint);
 	struct sched_ctx_hypervisor_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
 	struct sched_ctx_hypervisor_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
 
 
 	int ret = pthread_mutex_trylock(&act_hypervisor_mutex);
 	int ret = pthread_mutex_trylock(&act_hypervisor_mutex);
@@ -491,24 +543,50 @@ static void lp2_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_
 			int ns = sched_ctx_hypervisor_get_nsched_ctxs();
 			int ns = sched_ctx_hypervisor_get_nsched_ctxs();
 			int nw = starpu_worker_get_count(); /* Number of different workers */
 			int nw = starpu_worker_get_count(); /* Number of different workers */
 			int nt = 0; /* Number of different kinds of tasks */
 			int nt = 0; /* Number of different kinds of tasks */
-			pthread_mutex_lock(&mutex);
-			struct bound_task_pool * tp;
+
+//			pthread_mutex_lock(&mutex);
+
+			/* we don't take the mutex bc a correct value of the number of tasks is
+			   not required but we do a copy in order to be sure
+			   that the linear progr won't segfault if the list of 
+			   submitted task will change during the exec */
+
+			struct bound_task_pool *tp = NULL;
+			struct bound_task_pool *tmp_task_pools = _clone_linked_list(task_pools);
+
 			for (tp = task_pools; tp; tp = tp->next)
 			for (tp = task_pools; tp; tp = tp->next)
 				nt++;
 				nt++;
 
 
+
 			double w_in_s[ns][nw];
 			double w_in_s[ns][nw];
 			double tasks_per_worker[nw][nt];
 			double tasks_per_worker[nw][nt];
 
 
-			unsigned found_sol = _compute_task_distribution_over_ctxs(ns, nw, nt, w_in_s, tasks_per_worker, NULL, NULL);
-			pthread_mutex_unlock(&mutex);
+			unsigned found_sol = _compute_task_distribution_over_ctxs(ns, nw, nt, w_in_s, tasks_per_worker, NULL, NULL, tmp_task_pools, 0);
+//			pthread_mutex_unlock(&mutex);
+
 			/* if we did find at least one solution redistribute the resources */
 			/* if we did find at least one solution redistribute the resources */
 			if(found_sol)
 			if(found_sol)
 				_lp_place_resources_in_ctx(ns, nw, w_in_s, NULL, NULL, 0);
 				_lp_place_resources_in_ctx(ns, nw, w_in_s, NULL, NULL, 0);
 
 
+			struct bound_task_pool *next = NULL;
+			struct bound_task_pool *tmp_tp = tmp_task_pools;
+			while(tmp_task_pools)
+			{
+				next = tmp_tp->next;
+				free(tmp_tp);
+				tmp_tp = next;
+				tmp_task_pools = next;
+			}
+			
 
 
 		}
 		}
 		pthread_mutex_unlock(&act_hypervisor_mutex);
 		pthread_mutex_unlock(&act_hypervisor_mutex);
 	}
 	}
+	/* too expensive to take this mutex and correct value of the number of tasks is not compulsory */
+//	pthread_mutex_lock(&mutex);
+	_remove_task_from_pool(task, footprint);
+//	pthread_mutex_unlock(&mutex);
+
 }
 }
 
 
 
 

+ 3 - 0
src/common/starpu_spinlock.c

@@ -82,6 +82,7 @@ int _starpu_spin_lock(struct _starpu_spinlock *lock)
 		/* Give hand to another thread, hopefully the one which has the
 		/* Give hand to another thread, hopefully the one which has the
 		 * spinlock and probably just has also a short-lived mutex. */
 		 * spinlock and probably just has also a short-lived mutex. */
 		MSG_process_sleep(0.000001);
 		MSG_process_sleep(0.000001);
+		STARPU_UYIELD();
 	}
 	}
 #elif defined(STARPU_SPINLOCK_CHECK)
 #elif defined(STARPU_SPINLOCK_CHECK)
 	int ret = pthread_mutex_lock(&lock->errcheck_lock);
 	int ret = pthread_mutex_lock(&lock->errcheck_lock);
@@ -96,6 +97,8 @@ int _starpu_spin_lock(struct _starpu_spinlock *lock)
 	do
 	do
 	{
 	{
 		prev = STARPU_TEST_AND_SET(&lock->taken, 1);
 		prev = STARPU_TEST_AND_SET(&lock->taken, 1);
+		if (prev)
+			STARPU_UYIELD();
 	}
 	}
 	while (prev);
 	while (prev);
 	return 0;
 	return 0;

+ 49 - 1
src/common/utils.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -31,6 +31,54 @@
 #include <msg/msg.h>
 #include <msg/msg.h>
 #endif
 #endif
 
 
+#ifdef STARPU_HAVE_HELGRIND_H
+#include <valgrind/helgrind.h>
+#endif
+
+#ifndef VALGRIND_HG_MUTEX_LOCK_PRE
+#define VALGRIND_HG_MUTEX_LOCK_PRE(mutex, istrylock) ((void)0)
+#endif
+#ifndef VALGRIND_HG_MUTEX_LOCK_POST
+#define VALGRIND_HG_MUTEX_LOCK_POST(mutex) ((void)0)
+#endif
+#ifndef VALGRIND_HG_MUTEX_UNLOCK_PRE
+#define VALGRIND_HG_MUTEX_UNLOCK_PRE(mutex) ((void)0)
+#endif
+#ifndef VALGRIND_HG_MUTEX_UNLOCK_POST
+#define VALGRIND_HG_MUTEX_UNLOCK_POST(mutex) ((void)0)
+#endif
+#ifndef DO_CREQ_v_WW
+#define DO_CREQ_v_WW(_creqF, _ty1F, _arg1F, _ty2F, _arg2F) ((void)0)
+#endif
+#ifndef DO_CREQ_v_W
+#define DO_CREQ_v_W(_creqF, _ty1F, _arg1F) ((void)0)
+#endif
+#ifndef ANNOTATE_HAPPENS_BEFORE
+#define ANNOTATE_HAPPENS_BEFORE(obj) ((void)0)
+#endif
+#ifndef ANNOTATE_HAPPENS_AFTER
+#define ANNOTATE_HAPPENS_AFTER(obj) ((void)0)
+#endif
+#ifndef ANNOTATE_RWLOCK_ACQUIRED
+#define ANNOTATE_RWLOCK_ACQUIRED(lock, is_w) ((void)0)
+#endif
+#ifndef ANNOTATE_RWLOCK_RELEASED
+#define ANNOTATE_RWLOCK_RELEASED(lock, is_w) ((void)0)
+#endif
+
+#define _STARPU_VALGRIND_HG_SPIN_LOCK_PRE(lock) \
+	DO_CREQ_v_WW(_VG_USERREQ__HG_PTHREAD_SPIN_LOCK_PRE, \
+			struct _starpu_spinlock *, lock, long, 0)
+#define _STARPU_VALGRIND_HG_SPIN_LOCK_POST(lock) \
+	DO_CREQ_v_W(_VG_USERREQ__HG_PTHREAD_SPIN_LOCK_POST, \
+			struct _starpu_spinlock *, lock)
+#define _STARPU_VALGRIND_HG_SPIN_UNLOCK_PRE(lock) \
+	DO_CREQ_v_W(_VG_USERREQ__HG_PTHREAD_SPIN_INIT_OR_UNLOCK_PRE, \
+			struct _starpu_spinlock *, lock)
+#define _STARPU_VALGRIND_HG_SPIN_UNLOCK_POST(lock) \
+	DO_CREQ_v_W(_VG_USERREQ__HG_PTHREAD_SPIN_INIT_OR_UNLOCK_POST, \
+			struct _starpu_spinlock *, lock)
+
 #ifdef STARPU_VERBOSE
 #ifdef STARPU_VERBOSE
 #  define _STARPU_DEBUG(fmt, args ...) do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, "[starpu][%s] " fmt ,__func__ ,##args); fflush(stderr); }} while(0)
 #  define _STARPU_DEBUG(fmt, args ...) do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, "[starpu][%s] " fmt ,__func__ ,##args); fflush(stderr); }} while(0)
 #else
 #else

+ 4 - 1
src/core/dependencies/implicit_data_deps.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -307,6 +307,9 @@ void _starpu_detect_implicit_data_deps(struct starpu_task *task)
 	STARPU_ASSERT(task->cl);
 	STARPU_ASSERT(task->cl);
         _STARPU_LOG_IN();
         _STARPU_LOG_IN();
 
 
+	if (!task->sequential_consistency)
+		return;
+
 	/* We don't want to enforce a sequential consistency for tasks that are
 	/* We don't want to enforce a sequential consistency for tasks that are
 	 * not visible to the application. */
 	 * not visible to the application. */
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);

+ 12 - 5
src/core/dependencies/tags.c

@@ -178,10 +178,8 @@ void _starpu_tag_clear(void)
 	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
 	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
 }
 }
 
 
-static struct _starpu_tag *gettag_struct(starpu_tag_t id)
+static struct _starpu_tag *_gettag_struct(starpu_tag_t id)
 {
 {
-	_STARPU_PTHREAD_RWLOCK_WRLOCK(&tag_global_rwlock);
-
 	/* search if the tag is already declared or not */
 	/* search if the tag is already declared or not */
 	struct _starpu_tag_table *entry;
 	struct _starpu_tag_table *entry;
 	struct _starpu_tag *tag;
 	struct _starpu_tag *tag;
@@ -212,8 +210,15 @@ static struct _starpu_tag *gettag_struct(starpu_tag_t id)
 #endif
 #endif
 	}
 	}
 
 
-	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
+	return tag;
+}
 
 
+static struct _starpu_tag *gettag_struct(starpu_tag_t id)
+{
+	struct _starpu_tag *tag;
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&tag_global_rwlock);
+	tag = _gettag_struct(id);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
 	return tag;
 	return tag;
 }
 }
 
 
@@ -432,10 +437,11 @@ int starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id)
 		return -EDEADLK;
 		return -EDEADLK;
 	}
 	}
 
 
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&tag_global_rwlock);
 	/* only wait the tags that are not done yet */
 	/* only wait the tags that are not done yet */
 	for (i = 0, current = 0; i < ntags; i++)
 	for (i = 0, current = 0; i < ntags; i++)
 	{
 	{
-		struct _starpu_tag *tag = gettag_struct(id[i]);
+		struct _starpu_tag *tag = _gettag_struct(id[i]);
 
 
 		_starpu_spin_lock(&tag->lock);
 		_starpu_spin_lock(&tag->lock);
 
 
@@ -450,6 +456,7 @@ int starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id)
 			current++;
 			current++;
 		}
 		}
 	}
 	}
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
 
 
 	if (current == 0)
 	if (current == 0)
 	{
 	{

+ 12 - 0
src/core/perfmodel/perfmodel_history.c

@@ -942,6 +942,7 @@ int starpu_perfmodel_list(FILE *output)
 
 
 /* This function is intended to be used by external tools that should read the
 /* This function is intended to be used by external tools that should read the
  * performance model files */
  * performance model files */
+/* TODO: write an clear function, to free symbol and history */
 int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *model)
 int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *model)
 {
 {
 	model->symbol = strdup(symbol);
 	model->symbol = strdup(symbol);
@@ -1064,6 +1065,10 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 		HASH_FIND_UINT32_T(history, &key, entry);
 		HASH_FIND_UINT32_T(history, &key, entry);
 		_STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);
 		_STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);
 
 
+		/* We do not care about racing access to the mean, we only want a
+		 * good-enough estimation, thus simulate taking the rdlock */
+		ANNOTATE_RWLOCK_ACQUIRED(&model->model_rwlock, 0);
+
 		if (entry && entry->history_entry && entry->history_entry->nsample >= _STARPU_CALIBRATION_MINIMUM)
 		if (entry && entry->history_entry && entry->history_entry->nsample >= _STARPU_CALIBRATION_MINIMUM)
 			exp = entry->history_entry->mean;
 			exp = entry->history_entry->mean;
 		else if (!model->benchmarking)
 		else if (!model->benchmarking)
@@ -1075,6 +1080,7 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 			_starpu_set_calibrate_flag(1);
 			_starpu_set_calibrate_flag(1);
 			model->benchmarking = 1;
 			model->benchmarking = 1;
 		}
 		}
+		ANNOTATE_RWLOCK_RELEASED(&model->model_rwlock, 0);
 	}
 	}
 
 
 	return exp;
 	return exp;
@@ -1097,6 +1103,10 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, e
 	entry = (elt == NULL) ? NULL : elt->history_entry;
 	entry = (elt == NULL) ? NULL : elt->history_entry;
 	_STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);
 	_STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);
 
 
+	/* We do not care about racing access to the mean, we only want a
+	 * good-enough estimation, thus simulate taking the rdlock */
+	ANNOTATE_RWLOCK_ACQUIRED(&model->model_rwlock, 0);
+
 	exp = entry?entry->mean:NAN;
 	exp = entry?entry->mean:NAN;
 
 
 	if (entry && entry->nsample < _STARPU_CALIBRATION_MINIMUM)
 	if (entry && entry->nsample < _STARPU_CALIBRATION_MINIMUM)
@@ -1115,6 +1125,8 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, e
 		model->benchmarking = 1;
 		model->benchmarking = 1;
 	}
 	}
 
 
+	ANNOTATE_RWLOCK_RELEASED(&model->model_rwlock, 0);
+
 	return exp;
 	return exp;
 }
 }
 
 

+ 1 - 1
src/core/sched_ctx.c

@@ -490,7 +490,7 @@ void starpu_sched_ctx_delete(unsigned sched_ctx_id)
 	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
 	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
 	unsigned nworkers = config->topology.nworkers;
 	unsigned nworkers = config->topology.nworkers;
 
 
-	if(nworkers_ctx > 0 && inheritor_sched_ctx_id != STARPU_NMAX_SCHED_CTXS && 
+	if(nworkers_ctx > 0 && inheritor_sched_ctx && inheritor_sched_ctx->id != STARPU_NMAX_SCHED_CTXS && 
 	   !(nworkers_ctx == nworkers && nworkers_ctx == inheritor_sched_ctx->workers->nworkers))
 	   !(nworkers_ctx == nworkers && nworkers_ctx == inheritor_sched_ctx->workers->nworkers))
 	{
 	{
 		starpu_sched_ctx_add_workers(workerids, nworkers_ctx, inheritor_sched_ctx_id);
 		starpu_sched_ctx_add_workers(workerids, nworkers_ctx, inheritor_sched_ctx_id);

+ 8 - 0
src/core/task.c

@@ -57,6 +57,8 @@ void starpu_task_init(struct starpu_task *task)
 	 * everywhere */
 	 * everywhere */
 	memset(task, 0, sizeof(struct starpu_task));
 	memset(task, 0, sizeof(struct starpu_task));
 
 
+	task->sequential_consistency = 1;
+
 	/* Now we can initialise fields which recquire custom value */
 	/* Now we can initialise fields which recquire custom value */
 #if STARPU_DEFAULT_PRIO != 0
 #if STARPU_DEFAULT_PRIO != 0
 	task->priority = STARPU_DEFAULT_PRIO;
 	task->priority = STARPU_DEFAULT_PRIO;
@@ -707,7 +709,11 @@ void _starpu_decrement_nsubmitted_tasks(void)
 	if (--nsubmitted == 0)
 	if (--nsubmitted == 0)
 	{
 	{
 		if (!config->submitting)
 		if (!config->submitting)
+		{
+			ANNOTATE_HAPPENS_AFTER(&config->running);
 			config->running = 0;
 			config->running = 0;
+			ANNOTATE_HAPPENS_BEFORE(&config->running);
+		}
 		_STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
 		_STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
 	}
 	}
 
 
@@ -727,7 +733,9 @@ starpu_drivers_request_termination(void)
 	config->submitting = 0;
 	config->submitting = 0;
 	if (nsubmitted == 0)
 	if (nsubmitted == 0)
 	{
 	{
+		ANNOTATE_HAPPENS_AFTER(&config->running);
 		config->running = 0;
 		config->running = 0;
+		ANNOTATE_HAPPENS_BEFORE(&config->running);
 		_STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
 		_STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
 	}
 	}
 
 

+ 7 - 1
src/core/workers.c

@@ -893,9 +893,13 @@ out:
 
 
 unsigned _starpu_machine_is_running(void)
 unsigned _starpu_machine_is_running(void)
 {
 {
+	unsigned ret;
 	/* running is just protected by a memory barrier */
 	/* running is just protected by a memory barrier */
 	STARPU_RMB();
 	STARPU_RMB();
-	return config.running;
+	ANNOTATE_HAPPENS_AFTER(&config.running);
+	ret = config.running;
+	ANNOTATE_HAPPENS_BEFORE(&config.running);
+	return ret;
 }
 }
 
 
 unsigned _starpu_worker_can_block(unsigned memnode STARPU_ATTRIBUTE_UNUSED)
 unsigned _starpu_worker_can_block(unsigned memnode STARPU_ATTRIBUTE_UNUSED)
@@ -923,8 +927,10 @@ unsigned _starpu_worker_can_block(unsigned memnode STARPU_ATTRIBUTE_UNUSED)
 static void _starpu_kill_all_workers(struct _starpu_machine_config *pconfig)
 static void _starpu_kill_all_workers(struct _starpu_machine_config *pconfig)
 {
 {
 	/* set the flag which will tell workers to stop */
 	/* set the flag which will tell workers to stop */
+	ANNOTATE_HAPPENS_AFTER(&config.running);
 	pconfig->running = 0;
 	pconfig->running = 0;
 	/* running is just protected by a memory barrier */
 	/* running is just protected by a memory barrier */
+	ANNOTATE_HAPPENS_BEFORE(&config.running);
 	STARPU_WMB();
 	STARPU_WMB();
 	starpu_wake_all_blocked_workers();
 	starpu_wake_all_blocked_workers();
 }
 }

+ 11 - 0
src/datawizard/data_request.c

@@ -17,6 +17,7 @@
 
 
 #include <starpu.h>
 #include <starpu.h>
 #include <common/config.h>
 #include <common/config.h>
+#include <common/utils.h>
 #include <datawizard/datawizard.h>
 #include <datawizard/datawizard.h>
 
 
 /* requests that have not been treated at all */
 /* requests that have not been treated at all */
@@ -391,8 +392,18 @@ void _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)
 	struct _starpu_data_request *r;
 	struct _starpu_data_request *r;
 	struct _starpu_data_request_list *new_data_requests;
 	struct _starpu_data_request_list *new_data_requests;
 
 
+	/* Note: we here tell valgrind that list_empty (reading a pointer) is
+	 * as safe as if we had the lock held */
+	VALGRIND_HG_MUTEX_LOCK_PRE(&data_requests_list_mutex[src_node], 0);
+	VALGRIND_HG_MUTEX_LOCK_POST(&data_requests_list_mutex[src_node]);
 	if (_starpu_data_request_list_empty(data_requests[src_node]))
 	if (_starpu_data_request_list_empty(data_requests[src_node]))
+	{
+		VALGRIND_HG_MUTEX_UNLOCK_PRE(&data_requests_list_mutex[src_node]);
+		VALGRIND_HG_MUTEX_UNLOCK_POST(&data_requests_list_mutex[src_node]);
 		return;
 		return;
+	}
+	VALGRIND_HG_MUTEX_UNLOCK_PRE(&data_requests_list_mutex[src_node]);
+	VALGRIND_HG_MUTEX_UNLOCK_POST(&data_requests_list_mutex[src_node]);
 
 
 	/* take all the entries from the request list */
 	/* take all the entries from the request list */
         _STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
         _STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);

+ 3 - 1
src/datawizard/datawizard.c

@@ -26,12 +26,14 @@
 
 
 void _starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc)
 void _starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc)
 {
 {
-#ifdef STARPU_SIMGRID
 #if STARPU_DEVEL
 #if STARPU_DEVEL
 #warning FIXME
 #warning FIXME
 #endif
 #endif
+#ifdef STARPU_SIMGRID
 	MSG_process_sleep(0.000010);
 	MSG_process_sleep(0.000010);
 #endif
 #endif
+	STARPU_UYIELD();
+
 	/* in case some other driver requested data */
 	/* in case some other driver requested data */
 	_starpu_handle_pending_node_data_requests(memory_node);
 	_starpu_handle_pending_node_data_requests(memory_node);
 	_starpu_handle_node_data_requests(memory_node, may_alloc);
 	_starpu_handle_node_data_requests(memory_node, may_alloc);

+ 13 - 11
src/datawizard/filters.c

@@ -345,13 +345,14 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 	/* still valid ? */
 	/* still valid ? */
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
 	{
+		struct _starpu_data_replicate *local;
 		/* until an issue is found the data is assumed to be valid */
 		/* until an issue is found the data is assumed to be valid */
 		unsigned isvalid = 1;
 		unsigned isvalid = 1;
 
 
 		for (child = 0; child < root_handle->nchildren; child++)
 		for (child = 0; child < root_handle->nchildren; child++)
 		{
 		{
 			starpu_data_handle_t child_handle = starpu_data_get_child(root_handle, child);
 			starpu_data_handle_t child_handle = starpu_data_get_child(root_handle, child);
-			struct _starpu_data_replicate *local = &child_handle->per_node[node];
+			local = &child_handle->per_node[node];
 
 
 			if (local->state == STARPU_INVALID)
 			if (local->state == STARPU_INVALID)
 			{
 			{
@@ -359,24 +360,21 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 				isvalid = 0;
 				isvalid = 0;
 			}
 			}
 
 
-			if (local->allocated && local->automatically_allocated)
-			{
+			if (local->mc && local->allocated && local->automatically_allocated)
 				/* free the child data copy in a lazy fashion */
 				/* free the child data copy in a lazy fashion */
-#ifdef STARPU_DEVEL
-#warning FIXME!! this needs access to the child interface, which was freed above!
-#endif
-				_starpu_request_mem_chunk_removal(child_handle, node, sizes[child]);
-			}
+				_starpu_request_mem_chunk_removal(child_handle, local, node, sizes[child]);
 		}
 		}
 
 
-		if (!root_handle->per_node[node].allocated)
+		local = &root_handle->per_node[node];
+
+		if (!local->allocated)
 			/* Even if we have all the bits, if we don't have the
 			/* Even if we have all the bits, if we don't have the
 			 * whole data, it's not valid */
 			 * whole data, it's not valid */
 			isvalid = 0;
 			isvalid = 0;
 
 
-		if (!isvalid && root_handle->per_node[node].allocated && root_handle->per_node[node].automatically_allocated)
+		if (!isvalid && local->mc && local->allocated && local->automatically_allocated)
 			/* free the data copy in a lazy fashion */
 			/* free the data copy in a lazy fashion */
-			_starpu_request_mem_chunk_removal(root_handle, node, _starpu_data_get_size(root_handle));
+			_starpu_request_mem_chunk_removal(root_handle, local, node, _starpu_data_get_size(root_handle));
 
 
 		/* if there was no invalid copy, the node still has a valid copy */
 		/* if there was no invalid copy, the node still has a valid copy */
 		still_valid[node] = isvalid;
 		still_valid[node] = isvalid;
@@ -400,6 +398,10 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 		starpu_data_handle_t child_handle = starpu_data_get_child(root_handle, child);
 		starpu_data_handle_t child_handle = starpu_data_get_child(root_handle, child);
 		_starpu_spin_unlock(&child_handle->header_lock);
 		_starpu_spin_unlock(&child_handle->header_lock);
 		_starpu_spin_destroy(&child_handle->header_lock);
 		_starpu_spin_destroy(&child_handle->header_lock);
+
+		_STARPU_PTHREAD_MUTEX_DESTROY(&child_handle->busy_mutex);
+		_STARPU_PTHREAD_COND_DESTROY(&child_handle->busy_cond);
+		_STARPU_PTHREAD_MUTEX_DESTROY(&child_handle->sequential_consistency_mutex);
 	}
 	}
 
 
 	/* there is no child anymore */
 	/* there is no child anymore */

+ 1 - 1
src/datawizard/interfaces/block_interface.c

@@ -315,7 +315,7 @@ static void free_block_buffer_on_node(void *data_interface, unsigned node)
 	uint32_t nz = block_interface->nz;
 	uint32_t nz = block_interface->nz;
 	size_t elemsize = block_interface->elemsize;
 	size_t elemsize = block_interface->elemsize;
 
 
-	starpu_free_on_node(node, block_interface->ptr, nx*ny*nz*elemsize);
+	starpu_free_on_node(node, block_interface->dev_handle, nx*ny*nz*elemsize);
 }
 }
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA

+ 51 - 10
src/datawizard/interfaces/data_interface.c

@@ -480,6 +480,8 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 					_STARPU_PTHREAD_COND_WAIT(&arg.cond, &arg.mutex);
 					_STARPU_PTHREAD_COND_WAIT(&arg.cond, &arg.mutex);
 				_STARPU_PTHREAD_MUTEX_UNLOCK(&arg.mutex);
 				_STARPU_PTHREAD_MUTEX_UNLOCK(&arg.mutex);
 			}
 			}
+			_STARPU_PTHREAD_MUTEX_DESTROY(&arg.mutex);
+			_STARPU_PTHREAD_COND_DESTROY(&arg.cond);
 			_starpu_release_data_on_node(handle, 0, &handle->per_node[home_node]);
 			_starpu_release_data_on_node(handle, 0, &handle->per_node[home_node]);
 		}
 		}
 
 
@@ -546,23 +548,49 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 
 
 	/* Wait for all requests to finish (notably WT requests) */
 	/* Wait for all requests to finish (notably WT requests) */
 	_STARPU_PTHREAD_MUTEX_LOCK(&handle->busy_mutex);
 	_STARPU_PTHREAD_MUTEX_LOCK(&handle->busy_mutex);
-	while (handle->busy_count)
+	while (1) {
+		int busy;
+		/* Note: we here tell valgrind that reading busy_count is as
+		 * safe as if we had the lock held */
+		_STARPU_VALGRIND_HG_SPIN_LOCK_PRE(&handle->header_lock);
+		_STARPU_VALGRIND_HG_SPIN_LOCK_POST(&handle->header_lock);
+		busy = handle->busy_count;
+		_STARPU_VALGRIND_HG_SPIN_UNLOCK_PRE(&handle->header_lock);
+		_STARPU_VALGRIND_HG_SPIN_UNLOCK_POST(&handle->header_lock);
+		if (!busy)
+			break;
+		/* This is woken by _starpu_data_check_not_busy, always called
+		 * after decrementing busy_count */
 		_STARPU_PTHREAD_COND_WAIT(&handle->busy_cond, &handle->busy_mutex);
 		_STARPU_PTHREAD_COND_WAIT(&handle->busy_cond, &handle->busy_mutex);
+	}
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->busy_mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->busy_mutex);
 
 
 	/* Wait for finished requests to release the handle */
 	/* Wait for finished requests to release the handle */
 	_starpu_spin_lock(&handle->header_lock);
 	_starpu_spin_lock(&handle->header_lock);
 
 
+	size_t size = _starpu_data_get_size(handle);
+
+	_starpu_data_free_interfaces(handle);
+
 	/* Destroy the data now */
 	/* Destroy the data now */
 	unsigned node;
 	unsigned node;
-	size_t size = _starpu_data_get_size(handle);
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
 	{
+		struct _starpu_data_replicate *local = &handle->per_node[node];
 		/* free the data copy in a lazy fashion */
 		/* free the data copy in a lazy fashion */
-		_starpu_request_mem_chunk_removal(handle, node, size);
+		if (local->allocated && local->automatically_allocated)
+			_starpu_request_mem_chunk_removal(handle, local, node, size);
+	}
+	unsigned worker;
+	unsigned nworkers = starpu_worker_get_count();
+	for (worker = 0; worker < nworkers; worker++)
+	{
+		struct _starpu_data_replicate *local = &handle->per_worker[worker];
+		/* free the data copy in a lazy fashion */
+		if (local->allocated && local->automatically_allocated)
+			_starpu_request_mem_chunk_removal(handle, local, starpu_worker_get_memory_node(worker), size);
 	}
 	}
 
 
-	_starpu_data_free_interfaces(handle);
 	_starpu_memory_stats_free(handle);
 	_starpu_memory_stats_free(handle);
 	_starpu_data_requester_list_delete(handle->req_list);
 	_starpu_data_requester_list_delete(handle->req_list);
 	_starpu_data_requester_list_delete(handle->reduction_req_list);
 	_starpu_data_requester_list_delete(handle->reduction_req_list);
@@ -570,6 +598,10 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 	_starpu_spin_unlock(&handle->header_lock);
 	_starpu_spin_unlock(&handle->header_lock);
 	_starpu_spin_destroy(&handle->header_lock);
 	_starpu_spin_destroy(&handle->header_lock);
 
 
+	_STARPU_PTHREAD_MUTEX_DESTROY(&handle->busy_mutex);
+	_STARPU_PTHREAD_COND_DESTROY(&handle->busy_cond);
+	_STARPU_PTHREAD_MUTEX_DESTROY(&handle->sequential_consistency_mutex);
+
 	free(handle);
 	free(handle);
 }
 }
 
 
@@ -604,13 +636,22 @@ static void _starpu_data_invalidate(void *data)
 	{
 	{
 		struct _starpu_data_replicate *local = &handle->per_node[node];
 		struct _starpu_data_replicate *local = &handle->per_node[node];
 
 
-		if (local->allocated && local->automatically_allocated)
-		{
+		if (local->mc && local->allocated && local->automatically_allocated)
 			/* free the data copy in a lazy fashion */
 			/* free the data copy in a lazy fashion */
-			_starpu_request_mem_chunk_removal(handle, node, size);
-			local->allocated = 0;
-			local->automatically_allocated = 0;
-		}
+			_starpu_request_mem_chunk_removal(handle, local, node, size);
+
+		local->state = STARPU_INVALID;
+	}
+
+	unsigned worker;
+	unsigned nworkers = starpu_worker_get_count();
+	for (worker = 0; worker < nworkers; worker++)
+	{
+		struct _starpu_data_replicate *local = &handle->per_worker[worker];
+
+		if (local->mc && local->allocated && local->automatically_allocated)
+			/* free the data copy in a lazy fashion */
+			_starpu_request_mem_chunk_removal(handle, local, starpu_worker_get_memory_node(worker), size);
 
 
 		local->state = STARPU_INVALID;
 		local->state = STARPU_INVALID;
 	}
 	}

+ 1 - 1
src/datawizard/interfaces/matrix_interface.c

@@ -291,7 +291,7 @@ static void free_matrix_buffer_on_node(void *data_interface, unsigned node)
 	uint32_t ny = matrix_interface->ny;
 	uint32_t ny = matrix_interface->ny;
 	size_t elemsize = matrix_interface->elemsize;
 	size_t elemsize = matrix_interface->elemsize;
 
 
-	starpu_free_on_node(node, matrix_interface->ptr, nx*ny*elemsize);
+	starpu_free_on_node(node, matrix_interface->dev_handle, nx*ny*elemsize);
 }
 }
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA

+ 1 - 1
src/datawizard/interfaces/vector_interface.c

@@ -212,7 +212,7 @@ static void free_vector_buffer_on_node(void *data_interface, unsigned node)
 	uint32_t nx = vector_interface->nx;
 	uint32_t nx = vector_interface->nx;
 	size_t elemsize = vector_interface->elemsize;
 	size_t elemsize = vector_interface->elemsize;
 
 
-	starpu_free_on_node(node, vector_interface->ptr, nx*elemsize);
+	starpu_free_on_node(node, vector_interface->dev_handle, nx*elemsize);
 }
 }
 
 
 static int copy_any_to_any(void *src_interface, unsigned src_node,
 static int copy_any_to_any(void *src_interface, unsigned src_node,

+ 10 - 8
src/datawizard/malloc.c

@@ -251,11 +251,6 @@ static struct starpu_codelet free_pinned_cl =
 
 
 int starpu_free_flags(void *A, size_t dim, int flags)
 int starpu_free_flags(void *A, size_t dim, int flags)
 {
 {
-	if (flags & STARPU_MALLOC_COUNT)
-	{
-		_starpu_memory_manager_deallocate_size(dim, 0);
-	}
-
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
 	if (flags & STARPU_MALLOC_PINNED)
 	if (flags & STARPU_MALLOC_PINNED)
 	{
 	{
@@ -272,7 +267,7 @@ int starpu_free_flags(void *A, size_t dim, int flags)
 				cudaError_t err = cudaFreeHost(A);
 				cudaError_t err = cudaFreeHost(A);
 				if (STARPU_UNLIKELY(err))
 				if (STARPU_UNLIKELY(err))
 					STARPU_CUDA_REPORT_ERROR(err);
 					STARPU_CUDA_REPORT_ERROR(err);
-				return 0;
+				goto out;
 #ifndef HAVE_CUDA_MEMCPY_PEER
 #ifndef HAVE_CUDA_MEMCPY_PEER
 			}
 			}
 			else
 			else
@@ -293,7 +288,7 @@ int starpu_free_flags(void *A, size_t dim, int flags)
 
 
 				push_res = _starpu_task_submit_internally(task);
 				push_res = _starpu_task_submit_internally(task);
 				STARPU_ASSERT(push_res != -ENODEV);
 				STARPU_ASSERT(push_res != -ENODEV);
-				return 0;
+				goto out;
 			}
 			}
 #endif /* HAVE_CUDA_MEMCPY_PEER */
 #endif /* HAVE_CUDA_MEMCPY_PEER */
 #endif /* STARPU_USE_CUDA */
 #endif /* STARPU_USE_CUDA */
@@ -317,13 +312,20 @@ int starpu_free_flags(void *A, size_t dim, int flags)
 //
 //
 //		push_res = starpu_task_submit(task);
 //		push_res = starpu_task_submit(task);
 //		STARPU_ASSERT(push_res != -ENODEV);
 //		STARPU_ASSERT(push_res != -ENODEV);
-//		return 0;
+//		goto out;
 //	}
 //	}
 //#endif
 //#endif
 	}
 	}
 #endif /* STARPU_SIMGRID */
 #endif /* STARPU_SIMGRID */
 
 
 	free(A);
 	free(A);
+
+out:
+	if (flags & STARPU_MALLOC_COUNT)
+	{
+		_starpu_memory_manager_deallocate_size(dim, 0);
+	}
+
 	return 0;
 	return 0;
 }
 }
 
 

+ 31 - 43
src/datawizard/memalloc.c

@@ -62,6 +62,8 @@ void _starpu_deinit_mem_chunk_lists(void)
 		_starpu_mem_chunk_list_delete(mc_list[i]);
 		_starpu_mem_chunk_list_delete(mc_list[i]);
 		_starpu_mem_chunk_list_delete(memchunk_cache[i]);
 		_starpu_mem_chunk_list_delete(memchunk_cache[i]);
 		_starpu_mem_chunk_lru_list_delete(starpu_lru_list[i]);
 		_starpu_mem_chunk_lru_list_delete(starpu_lru_list[i]);
+		_starpu_spin_destroy(&lru_rwlock[i]);
+		_STARPU_PTHREAD_RWLOCK_DESTROY(&mc_rwlock[i]);
 	}
 	}
 }
 }
 
 
@@ -694,59 +696,45 @@ static void register_mem_chunk(struct _starpu_data_replicate *replicate, unsigne
  * unregister or unpartition). It puts all the memchunks that refer to the
  * unregister or unpartition). It puts all the memchunks that refer to the
  * specified handle into the cache.
  * specified handle into the cache.
  */
  */
-void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, unsigned node, size_t size)
+void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned node, size_t size)
 {
 {
-	_starpu_spin_checklocked(&handle->header_lock);
+	struct _starpu_mem_chunk *mc = replicate->mc;
 
 
-	_STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
+	STARPU_ASSERT(mc->data == handle);
 
 
-	/* TODO: expensive, handle should have its own list of chunks? */
-	/* iterate over the list of memory chunks and remove the entry */
-	struct _starpu_mem_chunk *mc, *next_mc;
-	for (mc = _starpu_mem_chunk_list_begin(mc_list[node]);
-	     mc != _starpu_mem_chunk_list_end(mc_list[node]);
-	     mc = next_mc)
-	{
-		next_mc = _starpu_mem_chunk_list_next(mc);
+	/* Record the allocated size, so that later in memory
+	 * reclaiming we can estimate how much memory we free
+	 * by freeing this.  */
+	mc->size = size;
 
 
-		if (mc->data == handle)
-		{
-			/* we found the data */
+	/* This memchunk doesn't have to do with the data any more. */
+	replicate->mc = NULL;
+	replicate->allocated = 0;
+	replicate->automatically_allocated = 0;
 
 
-			/* Record the allocated size, so that later in memory
-			 * reclaiming we can estimate how much memory we free
-			 * by freeing this.  */
-			mc->size = size;
-			/* This memchunk doesn't have to do with the data any more. */
-			mc->data = NULL;
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
 
 
-			/* remove it from the main list */
-			_starpu_mem_chunk_list_erase(mc_list[node], mc);
+	mc->data = NULL;
+	/* remove it from the main list */
+	_starpu_mem_chunk_list_erase(mc_list[node], mc);
 
 
-			/* We would never flush the node 0 cache, unless
-			 * malloc() returns NULL, which is very unlikely... */
-			/* This is particularly important when
-			 * STARPU_USE_ALLOCATION_CACHE is not enabled, as we
-			 * wouldn't even re-use these allocations! */
-			if (starpu_node_get_kind(node) == STARPU_CPU_RAM)
-			{
-				free_memory_on_node(mc, node);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
 
 
-				free(mc->chunk_interface);
-				_starpu_mem_chunk_delete(mc);
-			}
-			else
-				/* put it in the list of buffers to be removed */
-				_starpu_mem_chunk_list_push_front(memchunk_cache[node], mc);
+	/* We would never flush the node 0 cache, unless
+	 * malloc() returns NULL, which is very unlikely... */
+	/* This is particularly important when
+	 * STARPU_USE_ALLOCATION_CACHE is not enabled, as we
+	 * wouldn't even re-use these allocations! */
+	if (starpu_node_get_kind(node) == STARPU_CPU_RAM)
+	{
+		free_memory_on_node(mc, node);
 
 
-			/* Note that we do not stop here because there can be
-			 * multiple replicates associated to the same handle on
-			 * the same memory node.  */
-		}
+		free(mc->chunk_interface);
+		_starpu_mem_chunk_delete(mc);
 	}
 	}
-
-	/* there was no corresponding buffer ... */
-	_STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+	else
+		/* put it in the list of buffers to be removed */
+		_starpu_mem_chunk_list_push_front(memchunk_cache[node], mc);
 }
 }
 
 
 /*
 /*

+ 1 - 1
src/datawizard/memalloc.h

@@ -62,7 +62,7 @@ LIST_TYPE(_starpu_mem_chunk_lru,
 
 
 void _starpu_init_mem_chunk_lists(void);
 void _starpu_init_mem_chunk_lists(void);
 void _starpu_deinit_mem_chunk_lists(void);
 void _starpu_deinit_mem_chunk_lists(void);
-void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, unsigned node, size_t size);
+void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned node, size_t size);
 int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned is_prefetch);
 int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned is_prefetch);
 size_t _starpu_free_all_automatically_allocated_buffers(unsigned node);
 size_t _starpu_free_all_automatically_allocated_buffers(unsigned node);
 void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node);
 void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node);

+ 1 - 0
src/datawizard/memory_nodes.c

@@ -55,6 +55,7 @@ void _starpu_memory_nodes_deinit(void)
 	_starpu_deinit_data_request_lists();
 	_starpu_deinit_data_request_lists();
 	_starpu_deinit_mem_chunk_lists();
 	_starpu_deinit_mem_chunk_lists();
 
 
+	_STARPU_PTHREAD_RWLOCK_DESTROY(&descr.conditions_rwlock);
 	_STARPU_PTHREAD_KEY_DELETE(memory_node_key);
 	_STARPU_PTHREAD_KEY_DELETE(memory_node_key);
 }
 }
 
 

+ 10 - 4
src/datawizard/user_interactions.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -239,11 +239,12 @@ int starpu_data_acquire_on_node(starpu_data_handle_t handle, unsigned node, enum
 		.handle = handle,
 		.handle = handle,
 		.mode = mode,
 		.mode = mode,
 		.node = node,
 		.node = node,
-		.cond = _STARPU_PTHREAD_COND_INITIALIZER,
-		.lock = _STARPU_PTHREAD_MUTEX_INITIALIZER,
 		.finished = 0
 		.finished = 0
 	};
 	};
 
 
+	_STARPU_PTHREAD_COND_INIT(&wrapper.cond, NULL);
+	_STARPU_PTHREAD_MUTEX_INIT(&wrapper.lock, NULL);
+
 //	_STARPU_DEBUG("TAKE sequential_consistency_mutex starpu_data_acquire\n");
 //	_STARPU_DEBUG("TAKE sequential_consistency_mutex starpu_data_acquire\n");
 	_STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 	_STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 	int sequential_consistency = handle->sequential_consistency;
 	int sequential_consistency = handle->sequential_consistency;
@@ -297,8 +298,9 @@ int starpu_data_acquire_on_node(starpu_data_handle_t handle, unsigned node, enum
 		while (!wrapper.finished)
 		while (!wrapper.finished)
 			_STARPU_PTHREAD_COND_WAIT(&wrapper.cond, &wrapper.lock);
 			_STARPU_PTHREAD_COND_WAIT(&wrapper.cond, &wrapper.lock);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&wrapper.lock);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&wrapper.lock);
-		_STARPU_PTHREAD_MUTEX_DESTROY(&wrapper.lock);
 	}
 	}
+	_STARPU_PTHREAD_COND_DESTROY(&wrapper.cond);
+	_STARPU_PTHREAD_MUTEX_DESTROY(&wrapper.lock);
 
 
 	/* At that moment, the caller holds a reference to the piece of data.
 	/* At that moment, the caller holds a reference to the piece of data.
 	 * We enqueue the "post" sync task in the list associated to the handle
 	 * We enqueue the "post" sync task in the list associated to the handle
@@ -381,6 +383,8 @@ int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigne
 		/* we can immediately proceed */
 		/* we can immediately proceed */
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
 
 
+		_STARPU_PTHREAD_COND_DESTROY(&wrapper->cond);
+		_STARPU_PTHREAD_MUTEX_DESTROY(&wrapper->lock);
 		free(wrapper);
 		free(wrapper);
 
 
 		_starpu_fetch_data_on_node(handle, replicate, mode, async, async, NULL, NULL);
 		_starpu_fetch_data_on_node(handle, replicate, mode, async, async, NULL, NULL);
@@ -410,6 +414,8 @@ int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigne
 		while (!wrapper->finished)
 		while (!wrapper->finished)
 			_STARPU_PTHREAD_COND_WAIT(&wrapper->cond, &wrapper->lock);
 			_STARPU_PTHREAD_COND_WAIT(&wrapper->cond, &wrapper->lock);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&wrapper->lock);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&wrapper->lock);
+		_STARPU_PTHREAD_COND_DESTROY(&wrapper->cond);
+		_STARPU_PTHREAD_MUTEX_DESTROY(&wrapper->lock);
 		free(wrapper);
 		free(wrapper);
 	}
 	}
 
 

+ 4 - 2
src/debug/traces/starpu_fxt.c

@@ -86,7 +86,7 @@ static unsigned get_colour_symbol_blue(char *name)
 }
 }
 
 
 static double last_codelet_start[STARPU_NMAXWORKERS];
 static double last_codelet_start[STARPU_NMAXWORKERS];
-static char last_codelet_symbol[128][STARPU_NMAXWORKERS];
+static char last_codelet_symbol[STARPU_NMAXWORKERS][128];
 
 
 /* If more than a period of time has elapsed, we flush the profiling info,
 /* If more than a period of time has elapsed, we flush the profiling info,
  * otherwise they are accumulated everytime there is a new relevant event. */
  * otherwise they are accumulated everytime there is a new relevant event. */
@@ -144,6 +144,8 @@ static void register_worker_id(unsigned long tid, int workerid)
 
 
 	HASH_FIND(hh, worker_ids, &tid, sizeof(tid), entry);
 	HASH_FIND(hh, worker_ids, &tid, sizeof(tid), entry);
 
 
+	STARPU_ASSERT_MSG(workerid < STARPU_NMAXWORKERS, "Too many workers in this trace, please increase the maximum number of CPUs and GPUs to the same value as was used for execution");
+
 	/* only register a thread once */
 	/* only register a thread once */
 	STARPU_ASSERT(entry == NULL);
 	STARPU_ASSERT(entry == NULL);
 
 
@@ -506,7 +508,7 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 	unsigned long has_name = ev->param[3];
 	unsigned long has_name = ev->param[3];
 	char *name = has_name?(char *)&ev->param[4]:"unknown";
 	char *name = has_name?(char *)&ev->param[4]:"unknown";
 
 
-	snprintf(last_codelet_symbol[worker], 128, "%s", name);
+	snprintf(last_codelet_symbol[worker], sizeof(last_codelet_symbol[worker]), "%s", name);
 
 
 	double start_codelet_time = get_event_time_stamp(ev, options);
 	double start_codelet_time = get_event_time_stamp(ev, options);
 	last_codelet_start[worker] = start_codelet_time;
 	last_codelet_start[worker] = start_codelet_time;

+ 1 - 1
src/drivers/cpu/driver_cpu.c

@@ -192,7 +192,7 @@ _starpu_get_worker_from_driver(struct starpu_driver *d)
 static size_t _starpu_cpu_get_global_mem_size(int devid, struct _starpu_machine_config *config)
 static size_t _starpu_cpu_get_global_mem_size(int devid, struct _starpu_machine_config *config)
 {
 {
 	size_t global_mem;
 	size_t global_mem;
-	int limit;
+	ssize_t limit;
 
 
 	limit = starpu_get_env_number("STARPU_LIMIT_CPU_MEM");
 	limit = starpu_get_env_number("STARPU_LIMIT_CPU_MEM");
 #ifdef STARPU_DEVEL
 #ifdef STARPU_DEVEL

+ 3 - 3
src/drivers/cuda/driver_cuda.c

@@ -73,7 +73,7 @@ _starpu_cuda_discover_devices (struct _starpu_machine_config *config)
  */
  */
 static void _starpu_cuda_limit_gpu_mem_if_needed(unsigned devid)
 static void _starpu_cuda_limit_gpu_mem_if_needed(unsigned devid)
 {
 {
-	int limit;
+	ssize_t limit;
 	size_t STARPU_ATTRIBUTE_UNUSED totalGlobalMem = 0;
 	size_t STARPU_ATTRIBUTE_UNUSED totalGlobalMem = 0;
 	size_t STARPU_ATTRIBUTE_UNUSED to_waste = 0;
 	size_t STARPU_ATTRIBUTE_UNUSED to_waste = 0;
 	char name[30];
 	char name[30];
@@ -101,8 +101,8 @@ static void _starpu_cuda_limit_gpu_mem_if_needed(unsigned devid)
 	props[devid].totalGlobalMem -= to_waste;
 	props[devid].totalGlobalMem -= to_waste;
 #endif /* STARPU_USE_CUDA */
 #endif /* STARPU_USE_CUDA */
 
 
-	_STARPU_DEBUG("CUDA device %u: Wasting %ld MB / Limit %d MB / Total %ld MB / Remains %ld MB\n",
-			devid, (long) to_waste/(1024*1024), limit, (long) totalGlobalMem/(1024*1024),
+	_STARPU_DEBUG("CUDA device %u: Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n",
+			devid, (long) to_waste/(1024*1024), (long) limit, (long) totalGlobalMem/(1024*1024),
 			(long) (totalGlobalMem - to_waste)/(1024*1024));
 			(long) (totalGlobalMem - to_waste)/(1024*1024));
 }
 }
 
 

+ 3 - 2
src/drivers/driver_common/driver_common.c

@@ -174,11 +174,12 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 
 
 		if (_starpu_worker_can_block(memnode))
 		if (_starpu_worker_can_block(memnode))
 			_STARPU_PTHREAD_COND_WAIT(&args->sched_cond, &args->sched_mutex);
 			_STARPU_PTHREAD_COND_WAIT(&args->sched_cond, &args->sched_mutex);
-#ifdef STARPU_SIMGRID
 		else
 		else
 		{
 		{
 			if (_starpu_machine_is_running())
 			if (_starpu_machine_is_running())
 			{
 			{
+				STARPU_UYIELD();
+#ifdef STARPU_SIMGRID
 				static int warned;
 				static int warned;
 				if (!warned)
 				if (!warned)
 				{
 				{
@@ -186,9 +187,9 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 					_STARPU_DISP("Has to make simgrid spin for progression hooks\n");
 					_STARPU_DISP("Has to make simgrid spin for progression hooks\n");
 				}
 				}
 				MSG_process_sleep(0.000010);
 				MSG_process_sleep(0.000010);
+#endif
 			}
 			}
 		}
 		}
-#endif
 
 
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);
 
 

+ 12 - 4
src/drivers/opencl/driver_opencl.c

@@ -61,7 +61,7 @@ _starpu_opencl_discover_devices(struct _starpu_machine_config *config)
 
 
 static void _starpu_opencl_limit_gpu_mem_if_needed(unsigned devid)
 static void _starpu_opencl_limit_gpu_mem_if_needed(unsigned devid)
 {
 {
-	int limit;
+	ssize_t limit;
 	size_t STARPU_ATTRIBUTE_UNUSED totalGlobalMem = 0;
 	size_t STARPU_ATTRIBUTE_UNUSED totalGlobalMem = 0;
 	size_t STARPU_ATTRIBUTE_UNUSED to_waste = 0;
 	size_t STARPU_ATTRIBUTE_UNUSED to_waste = 0;
 	char name[30];
 	char name[30];
@@ -90,9 +90,9 @@ static void _starpu_opencl_limit_gpu_mem_if_needed(unsigned devid)
 	to_waste = totalGlobalMem - global_mem[devid];
 	to_waste = totalGlobalMem - global_mem[devid];
 #endif
 #endif
 
 
-	_STARPU_DEBUG("OpenCL device %d: Wasting %ld MB / Limit %d MB / Total %ld MB / Remains %ld MB\n",
-                      devid, (size_t)to_waste/(1024*1024), limit, (size_t)totalGlobalMem/(1024*1024),
-                      (size_t)(totalGlobalMem - to_waste)/(1024*1024));
+	_STARPU_DEBUG("OpenCL device %d: Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n",
+			devid, (long)to_waste/(1024*1024), (long) limit, (long)totalGlobalMem/(1024*1024),
+			(long)(totalGlobalMem - to_waste)/(1024*1024));
 
 
 }
 }
 
 
@@ -701,11 +701,19 @@ int _starpu_opencl_driver_deinit(struct starpu_driver *d)
 	unsigned memnode = args->memory_node;
 	unsigned memnode = args->memory_node;
 
 
 	_starpu_handle_all_pending_node_data_requests(memnode);
 	_starpu_handle_all_pending_node_data_requests(memnode);
+
+	/* In case there remains some memory that was automatically
+	 * allocated by StarPU, we release it now. Note that data
+	 * coherency is not maintained anymore at that point ! */
+	_starpu_free_all_automatically_allocated_buffers(memnode);
+
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
 	unsigned devid   = args->devid;
 	unsigned devid   = args->devid;
         _starpu_opencl_deinit_context(devid);
         _starpu_opencl_deinit_context(devid);
 #endif
 #endif
 
 
+	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_OPENCL_KEY);
+
 	return 0;
 	return 0;
 }
 }
 
 

+ 25 - 15
src/profiling/profiling.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -68,8 +68,10 @@ int _starpu_profiling =
 
 
 int starpu_profiling_status_set(int status)
 int starpu_profiling_status_set(int status)
 {
 {
+	ANNOTATE_HAPPENS_AFTER(&_starpu_profiling);
 	int prev_value = _starpu_profiling;
 	int prev_value = _starpu_profiling;
 	_starpu_profiling = status;
 	_starpu_profiling = status;
+	ANNOTATE_HAPPENS_BEFORE(&_starpu_profiling);
 
 
 	_STARPU_TRACE_SET_PROFILING(status);
 	_STARPU_TRACE_SET_PROFILING(status);
 
 
@@ -94,12 +96,6 @@ int starpu_profiling_status_set(int status)
 	return prev_value;
 	return prev_value;
 }
 }
 
 
-#undef starpu_profiling_status_get
-int starpu_profiling_status_get(void)
-{
-	return _starpu_profiling;
-}
-
 void _starpu_profiling_init(void)
 void _starpu_profiling_init(void)
 {
 {
 	int worker;
 	int worker;
@@ -110,7 +106,11 @@ void _starpu_profiling_init(void)
 		_starpu_worker_reset_profiling_info(worker);
 		_starpu_worker_reset_profiling_info(worker);
 	}
 	}
 	if ((env = getenv("STARPU_PROFILING")) && atoi(env))
 	if ((env = getenv("STARPU_PROFILING")) && atoi(env))
+	{
+		ANNOTATE_HAPPENS_AFTER(&_starpu_profiling);
 		_starpu_profiling = STARPU_PROFILING_ENABLE;
 		_starpu_profiling = STARPU_PROFILING_ENABLE;
+		ANNOTATE_HAPPENS_BEFORE(&_starpu_profiling);
+	}
 }
 }
 
 
 void _starpu_profiling_terminate(void)
 void _starpu_profiling_terminate(void)
@@ -127,7 +127,7 @@ struct starpu_task_profiling_info *_starpu_allocate_profiling_info_if_needed(str
 	struct starpu_task_profiling_info *info = NULL;
 	struct starpu_task_profiling_info *info = NULL;
 
 
 	/* If we are benchmarking, we need room for the power consumption */
 	/* If we are benchmarking, we need room for the power consumption */
-	if (_starpu_profiling || (task->cl && task->cl->power_model && (task->cl->power_model->benchmarking || _starpu_get_calibrate_flag())))
+	if (starpu_profiling_status_get() || (task->cl && task->cl->power_model && (task->cl->power_model->benchmarking || _starpu_get_calibrate_flag())))
 	{
 	{
 		info = (struct starpu_task_profiling_info *) calloc(1, sizeof(struct starpu_task_profiling_info));
 		info = (struct starpu_task_profiling_info *) calloc(1, sizeof(struct starpu_task_profiling_info));
 		STARPU_ASSERT(info);
 		STARPU_ASSERT(info);
@@ -191,7 +191,7 @@ void _starpu_worker_reset_profiling_info(int workerid)
 
 
 void _starpu_worker_restart_sleeping(int workerid)
 void _starpu_worker_restart_sleeping(int workerid)
 {
 {
-	if (_starpu_profiling)
+	if (starpu_profiling_status_get())
 	{
 	{
 		struct timespec sleep_start_time;
 		struct timespec sleep_start_time;
 		_starpu_clock_gettime(&sleep_start_time);
 		_starpu_clock_gettime(&sleep_start_time);
@@ -205,7 +205,7 @@ void _starpu_worker_restart_sleeping(int workerid)
 
 
 void _starpu_worker_stop_sleeping(int workerid)
 void _starpu_worker_stop_sleeping(int workerid)
 {
 {
-	if (_starpu_profiling)
+	if (starpu_profiling_status_get())
 	{
 	{
 		struct timespec *sleeping_start, sleep_end_time;
 		struct timespec *sleeping_start, sleep_end_time;
 
 
@@ -240,7 +240,7 @@ void _starpu_worker_stop_sleeping(int workerid)
 
 
 void _starpu_worker_register_executing_start_date(int workerid, struct timespec *executing_start)
 void _starpu_worker_register_executing_start_date(int workerid, struct timespec *executing_start)
 {
 {
-	if (_starpu_profiling)
+	if (starpu_profiling_status_get())
 	{
 	{
 		_STARPU_PTHREAD_MUTEX_LOCK(&worker_info_mutex[workerid]);
 		_STARPU_PTHREAD_MUTEX_LOCK(&worker_info_mutex[workerid]);
 		worker_registered_executing_start[workerid] = 1;
 		worker_registered_executing_start[workerid] = 1;
@@ -252,7 +252,7 @@ void _starpu_worker_register_executing_start_date(int workerid, struct timespec
 
 
 void _starpu_worker_update_profiling_info_executing(int workerid, struct timespec *executing_time, int executed_tasks, uint64_t used_cycles, uint64_t stall_cycles, double power_consumed)
 void _starpu_worker_update_profiling_info_executing(int workerid, struct timespec *executing_time, int executed_tasks, uint64_t used_cycles, uint64_t stall_cycles, double power_consumed)
 {
 {
-	if (_starpu_profiling)
+	if (starpu_profiling_status_get())
 	{
 	{
 		_STARPU_PTHREAD_MUTEX_LOCK(&worker_info_mutex[workerid]);
 		_STARPU_PTHREAD_MUTEX_LOCK(&worker_info_mutex[workerid]);
 
 
@@ -272,7 +272,7 @@ void _starpu_worker_update_profiling_info_executing(int workerid, struct timespe
 
 
 int starpu_worker_get_profiling_info(int workerid, struct starpu_worker_profiling_info *info)
 int starpu_worker_get_profiling_info(int workerid, struct starpu_worker_profiling_info *info)
 {
 {
-	if (!_starpu_profiling)
+	if (!starpu_profiling_status_get())
 	{
 	{
 		/* Not thread safe, shouldn't be too much a problem */
 		/* Not thread safe, shouldn't be too much a problem */
 		info->executed_tasks = worker_info[workerid].executed_tasks;
 		info->executed_tasks = worker_info[workerid].executed_tasks;
@@ -319,7 +319,7 @@ int starpu_worker_get_profiling_info(int workerid, struct starpu_worker_profilin
 /* When did the task reach the scheduler  ? */
 /* When did the task reach the scheduler  ? */
 void _starpu_profiling_set_task_push_start_time(struct starpu_task *task)
 void _starpu_profiling_set_task_push_start_time(struct starpu_task *task)
 {
 {
-	if (!_starpu_profiling)
+	if (!starpu_profiling_status_get())
 		return;
 		return;
 
 
 	struct starpu_task_profiling_info *profiling_info;
 	struct starpu_task_profiling_info *profiling_info;
@@ -331,7 +331,7 @@ void _starpu_profiling_set_task_push_start_time(struct starpu_task *task)
 
 
 void _starpu_profiling_set_task_push_end_time(struct starpu_task *task)
 void _starpu_profiling_set_task_push_end_time(struct starpu_task *task)
 {
 {
-	if (!_starpu_profiling)
+	if (!starpu_profiling_status_get())
 		return;
 		return;
 
 
 	struct starpu_task_profiling_info *profiling_info;
 	struct starpu_task_profiling_info *profiling_info;
@@ -429,3 +429,13 @@ void _starpu_bus_update_profiling_info(int src_node, int dst_node, size_t size)
 	bus_profiling_info[src_node][dst_node].transfer_count++;
 	bus_profiling_info[src_node][dst_node].transfer_count++;
 //	fprintf(stderr, "PROFILE %d -> %d : %d (cnt %d)\n", src_node, dst_node, size, bus_profiling_info[src_node][dst_node].transfer_count);
 //	fprintf(stderr, "PROFILE %d -> %d : %d (cnt %d)\n", src_node, dst_node, size, bus_profiling_info[src_node][dst_node].transfer_count);
 }
 }
+
+#undef starpu_profiling_status_get
+int starpu_profiling_status_get(void)
+{
+	int ret;
+	ANNOTATE_HAPPENS_AFTER(&_starpu_profiling);
+	ret = _starpu_profiling;
+	ANNOTATE_HAPPENS_BEFORE(&_starpu_profiling);
+	return ret;
+}

+ 1 - 0
tests/Makefile.am

@@ -103,6 +103,7 @@ noinst_PROGRAMS =				\
 	main/deprecated_buffer			\
 	main/deprecated_buffer			\
 	main/driver_api/init_run_deinit         \
 	main/driver_api/init_run_deinit         \
 	main/driver_api/run_driver              \
 	main/driver_api/run_driver              \
+	main/deploop                            \
 	main/restart				\
 	main/restart				\
 	main/execute_on_a_specific_worker	\
 	main/execute_on_a_specific_worker	\
 	main/insert_task			\
 	main/insert_task			\

+ 1 - 1
tests/loader.c

@@ -120,7 +120,7 @@ static void test_cleaner(int sig)
 	fprintf(stderr, "[error] test %s has been blocked for %d seconds. Mark it as failed\n", test_name, timeout);
 	fprintf(stderr, "[error] test %s has been blocked for %d seconds. Mark it as failed\n", test_name, timeout);
 	child_gid = getpgid(child_pid);
 	child_gid = getpgid(child_pid);
 	launch_gdb(test_name);
 	launch_gdb(test_name);
-	kill(-child_gid, SIGKILL);
+	kill(-child_gid, SIGQUIT);
 	exit(EXIT_FAILURE);
 	exit(EXIT_FAILURE);
 }
 }
 
 

+ 92 - 0
tests/main/deploop.c

@@ -0,0 +1,92 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * Create task A and B such that
+ * - B depends on A by tag dependency.
+ * - A would depend on B by data dependency, but we disable that.
+ */
+
+#include <pthread.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <starpu.h>
+#include "../helper.h"
+
+static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attribute__ ((unused)))
+{
+	FPRINTF(stderr,"executing task %p\n", starpu_task_get_current());
+}
+
+static struct starpu_codelet dummy_codelet = 
+{
+	.cpu_funcs = {dummy_func, NULL},
+	.cuda_funcs = {dummy_func, NULL},
+	.opencl_funcs = {dummy_func, NULL},
+	.model = NULL,
+	.nbuffers = 1,
+	.modes = { STARPU_RW }
+};
+
+int main(int argc, char **argv)
+{
+	int ret;
+	starpu_data_handle_t handle;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_void_data_register(&handle);
+
+	struct starpu_task *taskA, *taskB;
+
+	/* Make B depend on A */
+	starpu_tag_declare_deps(1, 1, (starpu_tag_t) 0);
+
+	taskA = starpu_task_create();
+	taskA->cl = &dummy_codelet;
+	taskA->tag_id = 0;
+	taskA->use_tag = 1;
+	taskA->handles[0] = handle;
+	taskA->sequential_consistency = 0;
+	FPRINTF(stderr,"A is %p\n", taskA);
+
+	taskB = starpu_task_create();
+	taskB->cl = &dummy_codelet;
+	taskB->tag_id = 1;
+	taskB->use_tag = 1;
+	taskB->handles[0] = handle;
+	FPRINTF(stderr,"B is %p\n", taskB);
+
+	ret = starpu_task_submit(taskB);
+	if (ret == -ENODEV)
+		return STARPU_TEST_SKIPPED;
+	ret = starpu_task_submit(taskA);
+	if (ret == -ENODEV)
+		return STARPU_TEST_SKIPPED;
+
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+
+	starpu_data_unregister(handle);
+
+	starpu_shutdown();
+
+	return EXIT_SUCCESS;
+}

+ 1 - 1
tests/microbenchs/tasks_overhead.c

@@ -155,7 +155,7 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_tag_wait");
 	gettimeofday(&end_exec, NULL);
 
-	for (i = 1; i < ntasks; i++)
+	for (i = 0; i < ntasks; i++)
 		starpu_task_clean(&tasks[i]);
 
 	for (buffer = 0; buffer < nbuffers; buffer++)

+ 19 - 30
tools/valgrind/starpu.suppr

@@ -1,18 +1,4 @@
 {
-   config.running is not racy from starpu_shutdown
-   Helgrind:Race
-   fun:starpu_shutdown
-   ...
-}
-
-{
-   config.running is not racy from _starpu_machine_is_running
-   Helgrind:Race
-   fun:_starpu_machine_is_running
-   ...
-}
-
-{
    don't care about cache hit stats
    Helgrind:Race
    fun:_starpu_msi_cache_hit
@@ -48,37 +34,31 @@
 }
 
 {
-   We do not care about the race on the entry->mean variable, we only want a good-enough estimation.
-   Helgrind:Race
-   fun: _starpu_history_based_job_expected_perf
-   ...
-}
-
-{
    We do not care about races on profiling statistics
    Helgrind:Race
-   fun: starpu_profiling_status_get
+   fun:_starpu_worker_get_status
+   fun:_starpu_worker_reset_profiling_info_with_lock
    ...
 }
 
 {
    This is racy, but since we'll always put the same values, this is not a problem.
    Helgrind:Race
-   fun: _starpu_codelet_check_deprecated_fields
+   fun:_starpu_codelet_check_deprecated_fields
    ...
 }
 
 {
    This is racy, but we don't care, it's only a statistic
    Helgrind:Race
-   fun: starpu_task_nsubmitted
+   fun:starpu_task_nsubmitted
    ...
 }
 
 {
    This is racy, but we don't care, it's only a statistic
    Helgrind:Race
-   fun: starpu_task_nready
+   fun:starpu_task_nready
    ...
 }
 
@@ -92,18 +72,27 @@
 }
 
 {
-   This is racy, but we don't care, if the function was called a bit earlier we would have had a different value
+   This is racy, but keep it away for now, otherwise it clutters the buildbot log
    Helgrind:Race
-   fun: _starpu_fifo_empty
-   fun: pop_task_eager_policy
+   fun:_starpu_fifo_empty
+   fun:pop_task_eager_policy
    ...
 }
 
 {
    This is the counterpart of the suppression above
    Helgrind:Race
-   fun: _starpu_fifo_push_task
-   fun: push_task_eager_policy
+   fun:_starpu_fifo_push_task
+   fun:push_task_eager_policy
    ...
 }
 
+
+{
+   This is the counterpart of the suppression above
+   Helgrind:Race
+   fun:_starpu_fifo_push_sorted_task
+   fun:_starpu_fifo_push_task
+   fun:push_task_eager_policy
+   ...
+}