Browse Source

Merge remote-tracking branch 'gitlab/master' into ft_checkpoint

Romain LION 4 years ago
parent
commit
324cffe7ad
53 changed files with 700 additions and 231 deletions
  1. 1 0
      ChangeLog
  2. 2 0
      configure.ac
  3. 1 0
      examples/Makefile.am
  4. 11 0
      examples/api/bcsr_data_interface.c
  5. 9 0
      examples/api/block_data_interface.c
  6. 2 0
      examples/api/coo_data_interface.c
  7. 8 0
      examples/api/csr_data_interface.c
  8. 10 0
      examples/api/matrix_data_interface.c
  9. 2 0
      examples/api/multiformat_data_interface.c
  10. 37 0
      examples/api/tensor_data_interface.c
  11. 5 0
      examples/api/variable_data_interface.c
  12. 8 0
      examples/api/vector_data_interface.c
  13. 2 0
      examples/api/void_data_interface.c
  14. 5 0
      examples/filters/fblock.c
  15. 2 2
      examples/filters/fblock_opencl.c
  16. 13 10
      examples/filters/fblock_opencl_kernel.cl
  17. 19 1
      include/starpu_data.h
  18. 3 1
      mpi/examples/Makefile.am
  19. 11 0
      mpi/examples/mpi_lu/plu_example.c
  20. 11 0
      mpi/examples/mpi_lu/plu_implicit_example.c
  21. 2 4
      mpi/src/mpi/starpu_mpi_mpi.c
  22. 2 2
      mpi/src/starpu_mpi_coop_sends.c
  23. 1 1
      mpi/src/starpu_mpi_fxt.h
  24. 4 16
      src/core/dependencies/cg.c
  25. 31 9
      src/core/dependencies/data_arbiter_concurrency.c
  26. 29 11
      src/core/dependencies/data_concurrency.c
  27. 2 2
      src/core/dependencies/data_concurrency.h
  28. 10 3
      src/core/dependencies/implicit_data_deps.c
  29. 1 1
      src/core/dependencies/implicit_data_deps.h
  30. 16 11
      src/core/dependencies/tags.c
  31. 2 0
      src/core/dependencies/tags.h
  32. 1 1
      src/core/jobs.c
  33. 13 10
      src/datawizard/coherency.c
  34. 1 0
      src/datawizard/coherency.h
  35. 4 4
      src/datawizard/copy_driver.c
  36. 2 2
      src/datawizard/interfaces/data_interface.c
  37. 28 8
      src/datawizard/user_interactions.c
  38. 1 1
      src/datawizard/write_back.c
  39. 2 2
      src/debug/latency.c
  40. 4 2
      src/debug/traces/starpu_fxt.c
  41. 1 0
      src/debug/traces/starpu_fxt.h
  42. 4 2
      src/debug/traces/starpu_fxt_mpi.c
  43. 7 4
      src/debug/traces/starpu_paje.c
  44. 12 0
      tests/Makefile.am
  45. 214 0
      tests/datawizard/acquire_release_to.c
  46. 3 3
      tests/datawizard/interfaces/block/block_opencl.c
  47. 15 23
      tests/datawizard/interfaces/block/block_opencl_kernel.cl
  48. 1 1
      tests/datawizard/interfaces/tensor/tensor_interface.c
  49. 3 3
      tests/datawizard/interfaces/tensor/tensor_opencl.c
  50. 18 26
      tests/datawizard/interfaces/tensor/tensor_opencl_kernel.cl
  51. 8 0
      tests/energy/energy_efficiency.c
  52. 1 0
      tests/overlap/overlap.c
  53. 95 65
      tools/gdbinit

+ 1 - 0
ChangeLog

@@ -42,6 +42,7 @@ New features:
   * Add a task prefetch level, to improve retaining data in accelerators so we
   * Add a task prefetch level, to improve retaining data in accelerators so we
     can make prefetch more aggressive.
     can make prefetch more aggressive.
   * Add starpu_data_dup_ro().
   * Add starpu_data_dup_ro().
+  * Add starpu_data_release_to() and starpu_data_release_to_on_node().
 
 
 Small changes:
 Small changes:
   * Add a synthetic energy efficiency testcase.
   * Add a synthetic energy efficiency testcase.

+ 2 - 0
configure.ac

@@ -151,6 +151,8 @@ AC_ARG_WITH(simgrid-lib-dir,
 	], [simgrid_lib_dir=no])
 	], [simgrid_lib_dir=no])
 
 
 if test x$enable_simgrid = xyes ; then
 if test x$enable_simgrid = xyes ; then
+	PKG_CHECK_MODULES([SIMGRID], [simgrid])
+
    	if test -n "$SIMGRID_CFLAGS" ; then
    	if test -n "$SIMGRID_CFLAGS" ; then
 	   	CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
 	   	CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
 	   	CXXFLAGS="$SIMGRID_CFLAGS $CXXFLAGS"
 	   	CXXFLAGS="$SIMGRID_CFLAGS $CXXFLAGS"

+ 1 - 0
examples/Makefile.am

@@ -206,6 +206,7 @@ STARPU_EXAMPLES +=				\
 	api/csr_data_interface			\
 	api/csr_data_interface			\
 	api/matrix_data_interface		\
 	api/matrix_data_interface		\
 	api/multiformat_data_interface		\
 	api/multiformat_data_interface		\
+	api/tensor_data_interface		\
 	api/variable_data_interface		\
 	api/variable_data_interface		\
 	api/vector_data_interface		\
 	api/vector_data_interface		\
 	api/void_data_interface
 	api/void_data_interface

+ 11 - 0
examples/api/bcsr_data_interface.c

@@ -17,6 +17,17 @@
 // This program checks that the implementation of the BCSR data
 // This program checks that the implementation of the BCSR data
 // interface only uses StarPU's public API
 // interface only uses StarPU's public API
 
 
+#define starpu_interface_bcsr_ops my_starpu_interface_bcsr_ops
+#define starpu_bcsr_data_register my_starpu_bcsr_data_register
+#define starpu_bcsr_get_nnz my_starpu_bcsr_get_nnz
+#define starpu_bcsr_get_nrow my_starpu_bcsr_get_nrow
+#define starpu_bcsr_get_firstentry my_starpu_bcsr_get_firstentry
+#define starpu_bcsr_get_r my_starpu_bcsr_get_r
+#define starpu_bcsr_get_c my_starpu_bcsr_get_c
+#define starpu_bcsr_get_elemsize my_starpu_bcsr_get_elemsize
+#define starpu_bcsr_get_local_nzval my_starpu_bcsr_get_local_nzval
+#define starpu_bcsr_get_local_colind my_starpu_bcsr_get_local_colind
+#define starpu_bcsr_get_local_rowptr my_starpu_bcsr_get_local_rowptr
 #include "../../src/datawizard/interfaces/bcsr_interface.c"
 #include "../../src/datawizard/interfaces/bcsr_interface.c"
 
 
 int main()
 int main()

+ 9 - 0
examples/api/block_data_interface.c

@@ -17,6 +17,15 @@
 // This program checks that the implementation of the block data
 // This program checks that the implementation of the block data
 // interface only uses StarPU's public API
 // interface only uses StarPU's public API
 
 
+#define starpu_interface_block_ops my_starpu_interface_block_ops
+#define starpu_block_data_register my_starpu_block_data_register
+#define starpu_block_get_nx my_starpu_block_get_nx
+#define starpu_block_get_ny my_starpu_block_get_ny
+#define starpu_block_get_nz my_starpu_block_get_nz
+#define starpu_block_get_local_ldy my_starpu_block_get_local_ldy
+#define starpu_block_get_local_ldz my_starpu_block_get_local_ldz
+#define starpu_block_get_local_ptr my_starpu_block_get_local_ptr
+#define starpu_block_get_elemsize my_starpu_block_get_elemsize
 #include "../../src/datawizard/interfaces/block_interface.c"
 #include "../../src/datawizard/interfaces/block_interface.c"
 
 
 int main()
 int main()

+ 2 - 0
examples/api/coo_data_interface.c

@@ -17,6 +17,8 @@
 // This program checks that the implementation of the COO data
 // This program checks that the implementation of the COO data
 // interface only uses StarPU's public API
 // interface only uses StarPU's public API
 
 
+#define starpu_interface_coo_ops my_starpu_interface_coo_ops
+#define starpu_coo_data_register my_starpu_coo_data_register
 #include "../../src/datawizard/interfaces/coo_interface.c"
 #include "../../src/datawizard/interfaces/coo_interface.c"
 
 
 int main()
 int main()

+ 8 - 0
examples/api/csr_data_interface.c

@@ -17,6 +17,14 @@
 // This program checks that the implementation of the CSR data
 // This program checks that the implementation of the CSR data
 // interface only uses StarPU's public API
 // interface only uses StarPU's public API
 
 
+#define starpu_interface_csr_ops my_starpu_interface_csr_ops
+#define starpu_csr_data_register my_starpu_csr_data_register
+#define starpu_csr_get_nnz my_starpu_csr_get_nnz
+#define starpu_csr_get_nrow my_starpu_csr_get_nrow
+#define starpu_csr_get_firstentry my_starpu_csr_get_firstentry
+#define starpu_csr_get_local_nzval my_starpu_csr_get_local_nzval
+#define starpu_csr_get_local_colind my_starpu_csr_get_local_colind
+#define starpu_csr_get_local_rowptr my_starpu_csr_get_local_rowptr
 #include "../../src/datawizard/interfaces/csr_interface.c"
 #include "../../src/datawizard/interfaces/csr_interface.c"
 
 
 int main()
 int main()

+ 10 - 0
examples/api/matrix_data_interface.c

@@ -17,6 +17,16 @@
 // This program checks that the implementation of the matrix data
 // This program checks that the implementation of the matrix data
 // interface only uses StarPU's public API
 // interface only uses StarPU's public API
 
 
+#define starpu_interface_matrix_ops my_starpu_interface_matrix_ops
+#define starpu_matrix_data_register my_starpu_matrix_data_register
+#define starpu_matrix_data_register_allocsize my_starpu_matrix_data_register_allocsize
+#define starpu_matrix_ptr_register my_starpu_matrix_data_ptr_register
+#define starpu_matrix_get_nx my_starpu_matrix_get_nx
+#define starpu_matrix_get_ny my_starpu_matrix_get_ny
+#define starpu_matrix_get_local_ld my_starpu_matrix_get_local_ld
+#define starpu_matrix_get_local_ptr my_starpu_matrix_get_local_ptr
+#define starpu_matrix_get_elemsize my_starpu_matrix_get_elemsize
+#define starpu_matrix_get_allocsize my_starpu_matrix_get_allocsize
 #include "../../src/datawizard/interfaces/matrix_interface.c"
 #include "../../src/datawizard/interfaces/matrix_interface.c"
 
 
 int main()
 int main()

+ 2 - 0
examples/api/multiformat_data_interface.c

@@ -17,6 +17,8 @@
 // This program checks that the implementation of the multiformat data
 // This program checks that the implementation of the multiformat data
 // interface only uses StarPU's public API
 // interface only uses StarPU's public API
 
 
+#define starpu_interface_multiformat_ops my_starpu_interface_multiformat_ops
+#define starpu_multiformat_data_register my_starpu_multiformat_data_register
 #include "../../src/datawizard/interfaces/multiformat_interface.c"
 #include "../../src/datawizard/interfaces/multiformat_interface.c"
 
 
 int main()
 int main()

+ 37 - 0
examples/api/tensor_data_interface.c

@@ -0,0 +1,37 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+// This program checks that the implementation of the tensor data
+// interface only uses StarPU's public API
+
+#define starpu_interface_tensor_ops my_starpu_interface_tensor_ops
+#define starpu_tensor_data_register my_starpu_tensor_data_register
+#define starpu_tensor_ptr_register my_starpu_tensor_data_ptr_register
+#define starpu_tensor_get_nx my_starpu_tensor_get_nx
+#define starpu_tensor_get_ny my_starpu_tensor_get_ny
+#define starpu_tensor_get_nz my_starpu_tensor_get_nz
+#define starpu_tensor_get_nt my_starpu_tensor_get_nt
+#define starpu_tensor_get_local_ldy my_starpu_tensor_get_local_ldy
+#define starpu_tensor_get_local_ldz my_starpu_tensor_get_local_ldz
+#define starpu_tensor_get_local_ldt my_starpu_tensor_get_local_ldt
+#define starpu_tensor_get_local_ptr my_starpu_tensor_get_local_ptr
+#define starpu_tensor_get_elemsize my_starpu_tensor_get_elemsize
+#include "../../src/datawizard/interfaces/tensor_interface.c"
+
+int main()
+{
+        return 0;
+}

+ 5 - 0
examples/api/variable_data_interface.c

@@ -17,6 +17,11 @@
 // This program checks that the implementation of the variable data
 // This program checks that the implementation of the variable data
 // interface only uses StarPU's public API
 // interface only uses StarPU's public API
 
 
+#define starpu_interface_variable_ops my_starpu_interface_variable_ops
+#define starpu_variable_data_register my_starpu_variable_data_register
+#define starpu_variable_ptr_register my_starpu_variable_ptr_register
+#define starpu_variable_get_local_ptr my_starpu_variable_get_local_ptr
+#define starpu_variable_get_elemsize my_starpu_variable_get_elemsize
 #include "../../src/datawizard/interfaces/variable_interface.c"
 #include "../../src/datawizard/interfaces/variable_interface.c"
 
 
 int main()
 int main()

+ 8 - 0
examples/api/vector_data_interface.c

@@ -17,6 +17,14 @@
 // This program checks that the implementation of the vector data
 // This program checks that the implementation of the vector data
 // interface only uses StarPU's public API
 // interface only uses StarPU's public API
 
 
+#define starpu_interface_vector_ops my_starpu_interface_vector_ops
+#define starpu_vector_data_register my_starpu_vector_data_register
+#define starpu_vector_data_register_allocsize my_starpu_vector_data_register_allocsize
+#define starpu_vector_ptr_register my_starpu_vector_data_ptr_register
+#define starpu_vector_get_nx my_starpu_vector_get_nx
+#define starpu_vector_get_local_ptr my_starpu_vector_get_local_ptr
+#define starpu_vector_get_elemsize my_starpu_vector_get_elemsize
+#define starpu_vector_get_allocsize my_starpu_vector_get_allocsize
 #include "../../src/datawizard/interfaces/vector_interface.c"
 #include "../../src/datawizard/interfaces/vector_interface.c"
 
 
 int main()
 int main()

+ 2 - 0
examples/api/void_data_interface.c

@@ -17,6 +17,8 @@
 // This program checks that the implementation of the void data
 // This program checks that the implementation of the void data
 // interface only uses StarPU's public API
 // interface only uses StarPU's public API
 
 
+#define starpu_interface_void_ops my_starpu_interface_void_ops
+#define starpu_void_data_register my_starpu_void_data_register
 #include "../../src/datawizard/interfaces/void_interface.c"
 #include "../../src/datawizard/interfaces/void_interface.c"
 
 
 int main()
 int main()

+ 5 - 0
examples/filters/fblock.c

@@ -169,6 +169,11 @@ int main(void)
         print_data(handle);
         print_data(handle);
         starpu_data_unregister(handle);
         starpu_data_unregister(handle);
 
 
+#ifdef STARPU_USE_OPENCL
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
+#endif
+
         /* Print result block */
         /* Print result block */
         FPRINTF(stderr, "OUT Block\n");
         FPRINTF(stderr, "OUT Block\n");
         print_block(block, NX, NY, NZ, NX, NX*NY);
         print_block(block, NX, NY, NZ, NX, NX*NY);

+ 2 - 2
examples/filters/fblock_opencl.c

@@ -60,8 +60,8 @@ void opencl_func(void *buffers[], void *cl_arg)
 	CHECK_CL_SET_KERNEL_ARG(kernel, 7, sizeof(*factor), factor);
 	CHECK_CL_SET_KERNEL_ARG(kernel, 7, sizeof(*factor), factor);
 
 
 	{
 	{
-		size_t global=nx*ny*nz;
+		size_t global[3]={nx,ny,nz};
-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
+		err = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global, NULL, 0, NULL, NULL);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	}
 	}
 	starpu_opencl_release_kernel(kernel);
 	starpu_opencl_release_kernel(kernel);

+ 13 - 10
examples/filters/fblock_opencl_kernel.cl

@@ -18,14 +18,17 @@
 
 
 __kernel void fblock_opencl(__global int* block, unsigned offset, int nx, int ny, int nz, unsigned ldy, unsigned ldz, int factor)
 __kernel void fblock_opencl(__global int* block, unsigned offset, int nx, int ny, int nz, unsigned ldy, unsigned ldz, int factor)
 {
 {
-        int i, j, k;
+	const int idx = get_global_id(0);
-        block = (__global char *)block + offset;
+	const int idy = get_global_id(1);
-        for(k=0; k<nz ; k++)
+	const int idz = get_global_id(2);
-	{
+	if (idx >= nx)
-                for(j=0; j<ny ; j++)
+		return;
-		{
+	if (idy >= ny)
-                        for(i=0; i<nx ; i++)
+		return;
-                                block[(k*ldz)+(j*ldy)+i] = factor;
+	if (idz >= nz)
-                }
+		return;
-        }
+
+	block = (__global int*) ((__global char *)block + offset);
+	int i = idz*ldz + idy*ldy + idx;
+	block[i] = factor;
 }
 }

+ 19 - 1
include/starpu_data.h

@@ -350,12 +350,30 @@ void starpu_data_release(starpu_data_handle_t handle);
 
 
 /**
 /**
    Similar to starpu_data_release(), except that the data
    Similar to starpu_data_release(), except that the data
-   will be available on the given memory \p node instead of main memory.
+   was made available on the given memory \p node instead of main memory.
    The \p node parameter must be exactly the same as the corresponding \c
    The \p node parameter must be exactly the same as the corresponding \c
    starpu_data_acquire_on_node* call.
    starpu_data_acquire_on_node* call.
 */
 */
 void starpu_data_release_on_node(starpu_data_handle_t handle, int node);
 void starpu_data_release_on_node(starpu_data_handle_t handle, int node);
 
 
+/**
+   Partly release the piece of data acquired by the application either by
+   starpu_data_acquire() or by starpu_data_acquire_cb(), switching the
+   acquisition down to \p down_to_mode. For now, only releasing from STARPU_RW
+   or STARPU_W acquisition down to STARPU_R is supported, or down to the same
+   acquisition.  STARPU_NONE can also be passed as \p down_to_mode, in which
+   case this is equivalent to calling starpu_data_release().
+*/
+void starpu_data_release_to(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode);
+
+/**
+   Similar to starpu_data_release_to(), except that the data
+   was made available on the given memory \p node instead of main memory.
+   The \p node parameter must be exactly the same as the corresponding \c
+   starpu_data_acquire_on_node* call.
+*/
+void starpu_data_release_to_on_node(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode, int node);
+
 /** @} */
 /** @} */
 
 
 /**
 /**

+ 3 - 1
mpi/examples/Makefile.am

@@ -155,7 +155,9 @@ examplebin_PROGRAMS += 			\
 if !STARPU_SIMGRID
 if !STARPU_SIMGRID
 starpu_mpi_EXAMPLES	+=	\
 starpu_mpi_EXAMPLES	+=	\
 	mpi_lu/plu_implicit_example_float	\
 	mpi_lu/plu_implicit_example_float	\
-	mpi_lu/plu_implicit_example_double
+	mpi_lu/plu_implicit_example_double	\
+	mpi_lu/plu_outofcore_example_float	\
+	mpi_lu/plu_outofcore_example_double
 endif
 endif
 
 
 mpi_lu_plu_example_float_LDADD =	\
 mpi_lu_plu_example_float_LDADD =	\

+ 11 - 0
mpi/examples/mpi_lu/plu_example.c

@@ -434,6 +434,7 @@ int main(int argc, char **argv)
 	int rank;
 	int rank;
 	int world_size;
 	int world_size;
 	int ret;
 	int ret;
+	unsigned i, j;
 
 
 	/*
 	/*
 	 *	Initialization
 	 *	Initialization
@@ -594,6 +595,16 @@ int main(int argc, char **argv)
 	/*
 	/*
 	 * 	Termination
 	 * 	Termination
 	 */
 	 */
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			starpu_data_unregister(dataA_handles[j+nblocks*i]);
+			TYPE *blockptr = dataA[j+i*nblocks];
+			if (blockptr != STARPU_POISON_PTR)
+				starpu_free(blockptr);
+		}
+	}
 
 
 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);

+ 11 - 0
mpi/examples/mpi_lu/plu_implicit_example.c

@@ -249,6 +249,7 @@ int main(int argc, char **argv)
 	int rank;
 	int rank;
 	int world_size;
 	int world_size;
 	int ret;
 	int ret;
+	unsigned i, j;
 
 
 	starpu_srand48((long int)time(NULL));
 	starpu_srand48((long int)time(NULL));
 
 
@@ -376,6 +377,16 @@ int main(int argc, char **argv)
 	/*
 	/*
 	 * 	Termination
 	 * 	Termination
 	 */
 	 */
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			starpu_data_unregister(dataA_handles[j+nblocks*i]);
+			TYPE *blockptr = dataA[j+i*nblocks];
+			if (blockptr != STARPU_POISON_PTR)
+				starpu_free(blockptr);
+		}
+	}
 
 
 	starpu_cublas_shutdown();
 	starpu_cublas_shutdown();
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();

+ 2 - 4
mpi/src/mpi/starpu_mpi_mpi.c

@@ -257,7 +257,7 @@ void _starpu_mpi_submit_ready_request(void *arg)
 
 
 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
 				STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 				STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
-				starpu_data_acquire_cb(early_data_handle->handle,STARPU_R,_starpu_mpi_early_data_cb,(void*) cb_args);
+				starpu_data_acquire_on_node_cb(early_data_handle->handle,STARPU_MAIN_RAM,STARPU_R,_starpu_mpi_early_data_cb,(void*) cb_args);
 				STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 				STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 			}
 			}
 			/* Case: no matching data has been received. Store the receive request as an early_request. */
 			/* Case: no matching data has been received. Store the receive request as an early_request. */
@@ -957,7 +957,7 @@ static void _starpu_mpi_early_data_cb(void* arg)
 	}
 	}
 
 
 	_STARPU_MPI_DEBUG(3, "Done, handling release of early_handle..\n");
 	_STARPU_MPI_DEBUG(3, "Done, handling release of early_handle..\n");
-	starpu_data_release(args->early_handle);
+	starpu_data_release_on_node(args->early_handle, STARPU_MAIN_RAM);
 
 
 	_STARPU_MPI_DEBUG(3, "Done, handling unregister of early_handle..\n");
 	_STARPU_MPI_DEBUG(3, "Done, handling unregister of early_handle..\n");
 	/* XXX: note that we have already freed the registered buffer above. In
 	/* XXX: note that we have already freed the registered buffer above. In
@@ -1104,8 +1104,6 @@ static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req)
 		_starpu_mpi_req_list_push_back(&detached_requests, req);
 		_starpu_mpi_req_list_push_back(&detached_requests, req);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 
 
-		starpu_wake_all_blocked_workers();
-
 		STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 		STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 		STARPU_PTHREAD_COND_SIGNAL(&progress_cond);
 		STARPU_PTHREAD_COND_SIGNAL(&progress_cond);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);

+ 2 - 2
mpi/src/starpu_mpi_coop_sends.c

@@ -47,13 +47,13 @@ void _starpu_mpi_release_req_data(struct _starpu_mpi_req *req)
 			/* We were last, release data */
 			/* We were last, release data */
 			free(coop_sends->reqs_array);
 			free(coop_sends->reqs_array);
 			free(coop_sends);
 			free(coop_sends);
-			starpu_data_release(req->data_handle);
+			starpu_data_release_on_node(req->data_handle, STARPU_MAIN_RAM);
 		}
 		}
 	}
 	}
 	else
 	else
 	{
 	{
 		/* Trivial request */
 		/* Trivial request */
-		starpu_data_release(req->data_handle);
+		starpu_data_release_on_node(req->data_handle, STARPU_MAIN_RAM);
 	}
 	}
 }
 }
 
 

+ 1 - 1
mpi/src/starpu_mpi_fxt.h

@@ -74,7 +74,7 @@ extern "C"
 #define _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(dest, data_tag, size)	\
 #define _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(dest, data_tag, size)	\
 	FUT_FULL_PROBE4(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN, (dest), (data_tag), (size), _starpu_gettid());
 	FUT_FULL_PROBE4(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN, (dest), (data_tag), (size), _starpu_gettid());
 #define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(dest, data_tag, size, jobid, handle)	\
 #define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(dest, data_tag, size, jobid, handle)	\
-	FUT_FULL_PROBE6(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_ISEND_SUBMIT_END, (dest), (data_tag), (size), (jobid), _starpu_gettid(), (handle));
+	FUT_FULL_PROBE6(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_ISEND_SUBMIT_END, (dest), (data_tag), (size), (jobid), (handle), _starpu_gettid());
 #define _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(src, data_tag)	\
 #define _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(src, data_tag)	\
 	FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN, (src), (data_tag), _starpu_gettid());
 	FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN, (src), (data_tag), _starpu_gettid());
 #define _STARPU_MPI_TRACE_IRECV_SUBMIT_END(src, data_tag)	\
 #define _STARPU_MPI_TRACE_IRECV_SUBMIT_END(src, data_tag)	\

+ 4 - 16
src/core/dependencies/cg.c

@@ -169,7 +169,6 @@ int _starpu_list_tag_successors_in_cg_list(struct _starpu_cg_list *successors, u
 	return n;
 	return n;
 }
 }
 
 
-/* Note: in case of a tag, it must be already locked */
 void _starpu_notify_cg(void *pred STARPU_ATTRIBUTE_UNUSED, struct _starpu_cg *cg)
 void _starpu_notify_cg(void *pred STARPU_ATTRIBUTE_UNUSED, struct _starpu_cg *cg)
 {
 {
 	STARPU_ASSERT(cg);
 	STARPU_ASSERT(cg);
@@ -208,6 +207,7 @@ void _starpu_notify_cg(void *pred STARPU_ATTRIBUTE_UNUSED, struct _starpu_cg *cg
 				struct _starpu_tag *tag;
 				struct _starpu_tag *tag;
 
 
 				tag = cg->succ.tag;
 				tag = cg->succ.tag;
+				_starpu_spin_lock(&tag->lock);
 				tag_successors = &tag->tag_successors;
 				tag_successors = &tag->tag_successors;
 
 
 				tag_successors->ndeps_completed++;
 				tag_successors->ndeps_completed++;
@@ -219,8 +219,10 @@ void _starpu_notify_cg(void *pred STARPU_ATTRIBUTE_UNUSED, struct _starpu_cg *cg
 				{
 				{
 					/* reset the counter so that we can reuse the completion group */
 					/* reset the counter so that we can reuse the completion group */
 					tag_successors->ndeps_completed = 0;
 					tag_successors->ndeps_completed = 0;
+					/* This releases the lock */
 					_starpu_tag_set_ready(tag);
 					_starpu_tag_set_ready(tag);
-				}
+				} else
+					_starpu_spin_unlock(&tag->lock);
 				break;
 				break;
 			}
 			}
 
 
@@ -370,21 +372,7 @@ void _starpu_notify_cg_list(void *pred, struct _starpu_cg_list *successors)
 			successors->nsuccs--;
 			successors->nsuccs--;
 		}
 		}
 		_starpu_spin_unlock(&successors->lock);
 		_starpu_spin_unlock(&successors->lock);
-
-		struct _starpu_tag *cgtag = NULL;
-
-		if (cg_type == STARPU_CG_TAG)
-		{
-			cgtag = cg->succ.tag;
-			STARPU_ASSERT(cgtag);
-			_starpu_spin_lock(&cgtag->lock);
-		}
-
 		_starpu_notify_cg(pred, cg);
 		_starpu_notify_cg(pred, cg);
-
-		if (cg_type == STARPU_CG_TAG)
-			_starpu_spin_unlock(&cgtag->lock);
-
 		_starpu_spin_lock(&successors->lock);
 		_starpu_spin_lock(&successors->lock);
 	}
 	}
 	successors->terminated = 1;
 	successors->terminated = 1;

+ 31 - 9
src/core/dependencies/data_arbiter_concurrency.c

@@ -533,7 +533,7 @@ void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
 }
 }
 void ___starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
 void ___starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
 #else // LOCK_OR_DELEGATE
 #else // LOCK_OR_DELEGATE
-void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
+void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode)
 #endif
 #endif
 {
 {
 	starpu_arbiter_t arbiter = handle->arbiter;
 	starpu_arbiter_t arbiter = handle->arbiter;
@@ -546,10 +546,21 @@ void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
 	{
 	{
 		/* No waiter, just remove our reference */
 		/* No waiter, just remove our reference */
 		_starpu_spin_lock(&handle->header_lock);
 		_starpu_spin_lock(&handle->header_lock);
-		STARPU_ASSERT(handle->refcnt > 0);
+		if (down_to_mode == STARPU_NONE)
-		handle->refcnt--;
+		{
-		STARPU_ASSERT(handle->busy_count > 0);
+			STARPU_ASSERT(handle->refcnt > 0);
-		handle->busy_count--;
+			handle->refcnt--;
+			STARPU_ASSERT(handle->busy_count > 0);
+			handle->busy_count--;
+		}
+		else
+		{
+			/* Downgrade from W or RW down to R, keeping the same reference,
+			 * but thus allowing other readers without allowing writers.  */
+			STARPU_ASSERT(down_to_mode == STARPU_R &&
+				      handle->current_mode == STARPU_W);
+			handle->current_mode = down_to_mode;
+		}
 #ifndef LOCK_OR_DELEGATE
 #ifndef LOCK_OR_DELEGATE
 		STARPU_PTHREAD_MUTEX_UNLOCK(&arbiter->mutex);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&arbiter->mutex);
 #endif
 #endif
@@ -562,10 +573,21 @@ void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
 
 
 	/* There is a waiter, remove our reference */
 	/* There is a waiter, remove our reference */
 	_starpu_spin_lock(&handle->header_lock);
 	_starpu_spin_lock(&handle->header_lock);
-	STARPU_ASSERT(handle->refcnt > 0);
+	if (down_to_mode == STARPU_NONE)
-	handle->refcnt--;
+	{
-	STARPU_ASSERT(handle->busy_count > 0);
+		STARPU_ASSERT(handle->refcnt > 0);
-	handle->busy_count--;
+		handle->refcnt--;
+		STARPU_ASSERT(handle->busy_count > 0);
+		handle->busy_count--;
+	}
+	else
+	{
+		/* Downgrade from W or RW down to R, keeping the same reference,
+		 * but thus allowing other readers without allowing writers.  */
+		STARPU_ASSERT(down_to_mode == STARPU_R &&
+			      handle->current_mode == STARPU_W);
+		handle->current_mode = down_to_mode;
+	}
 	/* There should be at least one busy_count reference for the waiter
 	/* There should be at least one busy_count reference for the waiter
 	 * (thus we don't risk to see the handle disappear below) */
 	 * (thus we don't risk to see the handle disappear below) */
 	STARPU_ASSERT(handle->busy_count > 0);
 	STARPU_ASSERT(handle->busy_count > 0);

+ 29 - 11
src/core/dependencies/data_concurrency.c

@@ -509,10 +509,16 @@ void _starpu_submit_job_take_data_deps(struct _starpu_job *j)
  * This may free the handle if it was lazily unregistered (1 is returned in
  * This may free the handle if it was lazily unregistered (1 is returned in
  * that case). The handle pointer thus becomes invalid for the caller.
  * that case). The handle pointer thus becomes invalid for the caller.
  */
  */
-int _starpu_notify_data_dependencies(starpu_data_handle_t handle)
+int _starpu_notify_data_dependencies(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode)
 {
 {
 	_starpu_spin_checklocked(&handle->header_lock);
 	_starpu_spin_checklocked(&handle->header_lock);
 
 
+	if (down_to_mode != STARPU_NONE && handle->current_mode == down_to_mode)
+	{
+		/* No change, nothing to do */
+		return 0;
+	}
+
 	if (handle->arbiter)
 	if (handle->arbiter)
 	{
 	{
 		/* Keep our reference for now, _starpu_notify_arbitered_dependencies
 		/* Keep our reference for now, _starpu_notify_arbitered_dependencies
@@ -521,22 +527,34 @@ int _starpu_notify_data_dependencies(starpu_data_handle_t handle)
 		STARPU_ASSERT(_starpu_data_requester_prio_list_empty(&handle->reduction_req_list));
 		STARPU_ASSERT(_starpu_data_requester_prio_list_empty(&handle->reduction_req_list));
 		_starpu_spin_unlock(&handle->header_lock);
 		_starpu_spin_unlock(&handle->header_lock);
 		/* _starpu_notify_arbitered_dependencies will handle its own locking */
 		/* _starpu_notify_arbitered_dependencies will handle its own locking */
-		_starpu_notify_arbitered_dependencies(handle);
+		_starpu_notify_arbitered_dependencies(handle, down_to_mode);
 		/* We have already unlocked */
 		/* We have already unlocked */
 		return 1;
 		return 1;
 	}
 	}
 
 
-	/* A data access has finished so we remove a reference. */
-	STARPU_ASSERT(handle->refcnt > 0);
-	handle->refcnt--;
-	STARPU_ASSERT(handle->busy_count > 0);
-	handle->busy_count--;
-	if (_starpu_data_check_not_busy(handle))
-		/* Handle was destroyed, nothing left to do.  */
-		return 1;
-
 	STARPU_ASSERT(_starpu_data_requester_prio_list_empty(&handle->arbitered_req_list));
 	STARPU_ASSERT(_starpu_data_requester_prio_list_empty(&handle->arbitered_req_list));
 
 
+	if (down_to_mode == STARPU_NONE)
+	{
+		/* A data access has finished so we remove a reference. */
+		STARPU_ASSERT(handle->refcnt > 0);
+		handle->refcnt--;
+		STARPU_ASSERT(handle->busy_count > 0);
+		handle->busy_count--;
+		if (_starpu_data_check_not_busy(handle))
+			/* Handle was destroyed, nothing left to do.  */
+			return 1;
+	}
+	else
+	{
+		/* Downgrade from W or RW down to R, keeping the same reference,
+		 * but thus allowing other readers without allowing writers.  */
+		STARPU_ASSERT(down_to_mode == STARPU_R &&
+				(handle->current_mode == STARPU_RW ||
+				 handle->current_mode == STARPU_W));
+		handle->current_mode = down_to_mode;
+	}
+
 	/* In case there is a pending reduction, and that this is the last
 	/* In case there is a pending reduction, and that this is the last
 	 * requester, we may go back to a "normal" coherency model. */
 	 * requester, we may go back to a "normal" coherency model. */
 	if (handle->reduction_refcnt > 0)
 	if (handle->reduction_refcnt > 0)

+ 2 - 2
src/core/dependencies/data_concurrency.h

@@ -28,8 +28,8 @@ void _starpu_submit_job_enforce_arbitered_deps(struct _starpu_job *j, unsigned b
 void _starpu_submit_job_take_data_deps(struct _starpu_job *j);
 void _starpu_submit_job_take_data_deps(struct _starpu_job *j);
 void _starpu_enforce_data_deps_notify_job_ready_soon(struct _starpu_job *j, _starpu_notify_job_start_data *data);
 void _starpu_enforce_data_deps_notify_job_ready_soon(struct _starpu_job *j, _starpu_notify_job_start_data *data);
 
 
-int _starpu_notify_data_dependencies(starpu_data_handle_t handle);
+int _starpu_notify_data_dependencies(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode);
-void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle);
+void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode);
 
 
 unsigned _starpu_attempt_to_submit_data_request_from_apps(starpu_data_handle_t handle,
 unsigned _starpu_attempt_to_submit_data_request_from_apps(starpu_data_handle_t handle,
 							  enum starpu_data_access_mode mode,
 							  enum starpu_data_access_mode mode,

+ 10 - 3
src/core/dependencies/implicit_data_deps.c

@@ -616,18 +616,25 @@ void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data
         _STARPU_LOG_OUT();
         _STARPU_LOG_OUT();
 }
 }
 
 
-void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle)
+void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 {
 {
 	struct _starpu_task_wrapper_list *post_sync_tasks = NULL;
 	struct _starpu_task_wrapper_list *post_sync_tasks = NULL;
 	unsigned do_submit_tasks = 0;
 	unsigned do_submit_tasks = 0;
+	unsigned last_cnt;
 
 
 	/* Here helgrind would shout that this is an unprotected access, but
 	/* Here helgrind would shout that this is an unprotected access, but
 	 * count can only be zero if we don't have to care about
 	 * count can only be zero if we don't have to care about
 	 * post_sync_tasks_cnt at all.  */
 	 * post_sync_tasks_cnt at all.  */
-	if (STARPU_RUNNING_ON_VALGRIND || handle->post_sync_tasks_cnt)
+	if (handle->post_sync_tasks_cnt)
 	{
 	{
 		STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 		STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
-		if (--handle->post_sync_tasks_cnt == 0)
+		last_cnt = handle->post_sync_tasks_cnt;
+
+		if (mode == STARPU_NONE)
+			/* Last release from us */
+			handle->post_sync_tasks_cnt--;
+
+		if (last_cnt == 1)
 		{
 		{
 			/* unlock all tasks : we need not hold the lock while unlocking all these tasks */
 			/* unlock all tasks : we need not hold the lock while unlocking all these tasks */
 			do_submit_tasks = 1;
 			do_submit_tasks = 1;

+ 1 - 1
src/core/dependencies/implicit_data_deps.h

@@ -30,7 +30,7 @@ void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *tas
 void _starpu_release_task_enforce_sequential_consistency(struct _starpu_job *j);
 void _starpu_release_task_enforce_sequential_consistency(struct _starpu_job *j);
 
 
 void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data_handle_t handle);
 void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data_handle_t handle);
-void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle);
+void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle, enum starpu_data_access_mode mode);
 
 
 /** Register a hook to be called when a write is submitted */
 /** Register a hook to be called when a write is submitted */
 void _starpu_implicit_data_deps_write_hook(void (*func)(starpu_data_handle_t));
 void _starpu_implicit_data_deps_write_hook(void (*func)(starpu_data_handle_t));

+ 16 - 11
src/core/dependencies/tags.c

@@ -120,13 +120,23 @@ static void _starpu_tag_free(void *_tag)
 			unsigned STARPU_ATTRIBUTE_UNUSED remaining = STARPU_ATOMIC_ADD(&cg->remaining, -1);
 			unsigned STARPU_ATTRIBUTE_UNUSED remaining = STARPU_ATOMIC_ADD(&cg->remaining, -1);
 
 
 			if (!ntags && (cg->cg_type == STARPU_CG_TAG))
 			if (!ntags && (cg->cg_type == STARPU_CG_TAG))
+			{
 				/* Last tag this cg depends on, cg becomes unreferenced */
 				/* Last tag this cg depends on, cg becomes unreferenced */
+#ifdef STARPU_DEBUG
+				free(cg->deps);
+				free(cg->done);
+#endif
 				free(cg);
 				free(cg);
+			}
 		}
 		}
 
 
 #ifdef STARPU_DYNAMIC_DEPS_SIZE
 #ifdef STARPU_DYNAMIC_DEPS_SIZE
 		free(tag->tag_successors.succ);
 		free(tag->tag_successors.succ);
 #endif
 #endif
+#ifdef STARPU_DEBUG
+		free(tag->tag_successors.deps);
+		free(tag->tag_successors.done);
+#endif
 
 
 		_starpu_spin_unlock(&tag->lock);
 		_starpu_spin_unlock(&tag->lock);
 		_starpu_spin_destroy(&tag->lock);
 		_starpu_spin_destroy(&tag->lock);
@@ -221,7 +231,7 @@ static struct _starpu_tag *gettag_struct(starpu_tag_t id)
 	return tag;
 	return tag;
 }
 }
 
 
-/* lock should be taken */
+/* lock should be taken, and this releases it */
 void _starpu_tag_set_ready(struct _starpu_tag *tag)
 void _starpu_tag_set_ready(struct _starpu_tag *tag)
 {
 {
 	/* mark this tag as ready to run */
 	/* mark this tag as ready to run */
@@ -229,6 +239,10 @@ void _starpu_tag_set_ready(struct _starpu_tag *tag)
 	/* declare it to the scheduler ! */
 	/* declare it to the scheduler ! */
 	struct _starpu_job *j = tag->job;
 	struct _starpu_job *j = tag->job;
 
 
+	STARPU_ASSERT(!STARPU_AYU_EVENT || tag->id < STARPU_AYUDAME_OFFSET);
+	STARPU_AYU_PRERUNTASK(tag->id + STARPU_AYUDAME_OFFSET, -1);
+	STARPU_AYU_POSTRUNTASK(tag->id + STARPU_AYUDAME_OFFSET);
+
 	/* In case the task job is going to be scheduled immediately, and if
 	/* In case the task job is going to be scheduled immediately, and if
 	 * the task is "empty", calling _starpu_push_task would directly try to enforce
 	 * the task is "empty", calling _starpu_push_task would directly try to enforce
 	 * the dependencies of the task, and therefore it would try to grab the
 	 * the dependencies of the task, and therefore it would try to grab the
@@ -238,14 +252,9 @@ void _starpu_tag_set_ready(struct _starpu_tag *tag)
 	/* enforce data dependencies */
 	/* enforce data dependencies */
 	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 	_starpu_enforce_deps_starting_from_task(j);
 	_starpu_enforce_deps_starting_from_task(j);
-
-	_starpu_spin_lock(&tag->lock);
-	STARPU_ASSERT(!STARPU_AYU_EVENT || tag->id < STARPU_AYUDAME_OFFSET);
-	STARPU_AYU_PRERUNTASK(tag->id + STARPU_AYUDAME_OFFSET, -1);
-	STARPU_AYU_POSTRUNTASK(tag->id + STARPU_AYUDAME_OFFSET);
 }
 }
 
 
-/* the lock must be taken ! */
+/* the lock of the tag must already be taken ! */
 static void _starpu_tag_add_succ(struct _starpu_tag *tag, struct _starpu_cg *cg)
 static void _starpu_tag_add_succ(struct _starpu_tag *tag, struct _starpu_cg *cg)
 {
 {
 	STARPU_ASSERT(tag);
 	STARPU_ASSERT(tag);
@@ -396,12 +405,10 @@ void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t
 		struct _starpu_tag *tag_dep = gettag_struct(dep_id);
 		struct _starpu_tag *tag_dep = gettag_struct(dep_id);
 		STARPU_ASSERT(tag_dep != tag_child);
 		STARPU_ASSERT(tag_dep != tag_child);
 		_starpu_spin_lock(&tag_dep->lock);
 		_starpu_spin_lock(&tag_dep->lock);
-		_starpu_spin_lock(&tag_child->lock);
 		_starpu_tag_add_succ(tag_dep, cg);
 		_starpu_tag_add_succ(tag_dep, cg);
 		STARPU_ASSERT(!STARPU_AYU_EVENT || dep_id < STARPU_AYUDAME_OFFSET);
 		STARPU_ASSERT(!STARPU_AYU_EVENT || dep_id < STARPU_AYUDAME_OFFSET);
 		STARPU_ASSERT(!STARPU_AYU_EVENT || id < STARPU_AYUDAME_OFFSET);
 		STARPU_ASSERT(!STARPU_AYU_EVENT || id < STARPU_AYUDAME_OFFSET);
 		STARPU_AYU_ADDDEPENDENCY(dep_id+STARPU_AYUDAME_OFFSET, 0, id+STARPU_AYUDAME_OFFSET);
 		STARPU_AYU_ADDDEPENDENCY(dep_id+STARPU_AYUDAME_OFFSET, 0, id+STARPU_AYUDAME_OFFSET);
-		_starpu_spin_unlock(&tag_child->lock);
 		_starpu_spin_unlock(&tag_dep->lock);
 		_starpu_spin_unlock(&tag_dep->lock);
 	}
 	}
 }
 }
@@ -434,12 +441,10 @@ void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...)
 		struct _starpu_tag *tag_dep = gettag_struct(dep_id);
 		struct _starpu_tag *tag_dep = gettag_struct(dep_id);
 		STARPU_ASSERT(tag_dep != tag_child);
 		STARPU_ASSERT(tag_dep != tag_child);
 		_starpu_spin_lock(&tag_dep->lock);
 		_starpu_spin_lock(&tag_dep->lock);
-		_starpu_spin_lock(&tag_child->lock);
 		_starpu_tag_add_succ(tag_dep, cg);
 		_starpu_tag_add_succ(tag_dep, cg);
 		STARPU_ASSERT(!STARPU_AYU_EVENT || dep_id < STARPU_AYUDAME_OFFSET);
 		STARPU_ASSERT(!STARPU_AYU_EVENT || dep_id < STARPU_AYUDAME_OFFSET);
 		STARPU_ASSERT(!STARPU_AYU_EVENT || id < STARPU_AYUDAME_OFFSET);
 		STARPU_ASSERT(!STARPU_AYU_EVENT || id < STARPU_AYUDAME_OFFSET);
 		STARPU_AYU_ADDDEPENDENCY(dep_id+STARPU_AYUDAME_OFFSET, 0, id+STARPU_AYUDAME_OFFSET);
 		STARPU_AYU_ADDDEPENDENCY(dep_id+STARPU_AYUDAME_OFFSET, 0, id+STARPU_AYUDAME_OFFSET);
-		_starpu_spin_unlock(&tag_child->lock);
 		_starpu_spin_unlock(&tag_dep->lock);
 		_starpu_spin_unlock(&tag_dep->lock);
 	}
 	}
 	va_end(pa);
 	va_end(pa);

+ 2 - 0
src/core/dependencies/tags.h

@@ -68,6 +68,8 @@ void _starpu_notify_tag_dependencies(struct _starpu_tag *tag);
 void _starpu_notify_job_start_tag_dependencies(struct _starpu_tag *tag, _starpu_notify_job_start_data *data);
 void _starpu_notify_job_start_tag_dependencies(struct _starpu_tag *tag, _starpu_notify_job_start_data *data);
 
 
 void _starpu_tag_declare(starpu_tag_t id, struct _starpu_job *job);
 void _starpu_tag_declare(starpu_tag_t id, struct _starpu_job *job);
+
+/* lock should be taken, and this releases it */
 void _starpu_tag_set_ready(struct _starpu_tag *tag);
 void _starpu_tag_set_ready(struct _starpu_tag *tag);
 
 
 unsigned _starpu_submit_job_enforce_task_deps(struct _starpu_job *j);
 unsigned _starpu_submit_job_enforce_task_deps(struct _starpu_job *j);

+ 1 - 1
src/core/jobs.c

@@ -82,7 +82,7 @@ struct _starpu_job* STARPU_ATTRIBUTE_MALLOC _starpu_job_create(struct starpu_tas
 
 
 	job->task = task;
 	job->task = task;
 
 
-#ifndef STARPU_USE_FXT
+#if !defined(STARPU_USE_FXT) && !defined(STARPU_DEBUG)
 	if (_starpu_bound_recording || _starpu_task_break_on_push != -1 || _starpu_task_break_on_sched != -1 || _starpu_task_break_on_pop != -1 || _starpu_task_break_on_exec != -1 || STARPU_AYU_EVENT)
 	if (_starpu_bound_recording || _starpu_task_break_on_push != -1 || _starpu_task_break_on_sched != -1 || _starpu_task_break_on_pop != -1 || _starpu_task_break_on_exec != -1 || STARPU_AYU_EVENT)
 #endif
 #endif
 	{
 	{

+ 13 - 10
src/datawizard/coherency.c

@@ -855,7 +855,7 @@ uint32_t _starpu_data_get_footprint(starpu_data_handle_t handle)
 
 
 /* in case the data was accessed on a write mode, do not forget to
 /* in case the data was accessed on a write mode, do not forget to
  * make it accessible again once it is possible ! */
  * make it accessible again once it is possible ! */
-void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_wt_mask, struct _starpu_data_replicate *replicate)
+void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_wt_mask, enum starpu_data_access_mode down_to_mode, struct _starpu_data_replicate *replicate)
 {
 {
 	uint32_t wt_mask;
 	uint32_t wt_mask;
 	wt_mask = default_wt_mask | handle->wt_mask;
 	wt_mask = default_wt_mask | handle->wt_mask;
@@ -880,14 +880,17 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 	if (cpt == STARPU_SPIN_MAXTRY)
 	if (cpt == STARPU_SPIN_MAXTRY)
 		_starpu_spin_lock(&handle->header_lock);
 		_starpu_spin_lock(&handle->header_lock);
 
 
-	/* Release refcnt taken by fetch_data_on_node */
+	if (down_to_mode == STARPU_NONE)
-	replicate->refcnt--;
+	{
-	STARPU_ASSERT_MSG(replicate->refcnt >= 0, "handle %p released too many times", handle);
+		/* Release refcnt taken by fetch_data_on_node */
+		replicate->refcnt--;
+		STARPU_ASSERT_MSG(replicate->refcnt >= 0, "handle %p released too many times", handle);
 
 
-	STARPU_ASSERT_MSG(handle->busy_count > 0, "handle %p released too many times", handle);
+		STARPU_ASSERT_MSG(handle->busy_count > 0, "handle %p released too many times", handle);
-	handle->busy_count--;
+		handle->busy_count--;
+	}
 
 
-	if (!_starpu_notify_data_dependencies(handle))
+	if (!_starpu_notify_data_dependencies(handle, down_to_mode))
 		_starpu_spin_unlock(&handle->header_lock);
 		_starpu_spin_unlock(&handle->header_lock);
 }
 }
 
 
@@ -1222,7 +1225,7 @@ enomem:
 
 
 		local_replicate = get_replicate(handle, mode, workerid, node);
 		local_replicate = get_replicate(handle, mode, workerid, node);
 
 
-		_starpu_release_data_on_node(handle, 0, local_replicate);
+		_starpu_release_data_on_node(handle, 0, STARPU_NONE, local_replicate);
 	}
 	}
 
 
 	return -1;
 	return -1;
@@ -1331,13 +1334,13 @@ void __starpu_push_task_output(struct _starpu_job *j)
 		if (node == -1)
 		if (node == -1)
 		{
 		{
 			/* NOWHERE case, just notify dependencies */
 			/* NOWHERE case, just notify dependencies */
-			if (!_starpu_notify_data_dependencies(handle))
+			if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
 				_starpu_spin_unlock(&handle->header_lock);
 				_starpu_spin_unlock(&handle->header_lock);
 		}
 		}
 		else
 		else
 		{
 		{
 			_starpu_spin_unlock(&handle->header_lock);
 			_starpu_spin_unlock(&handle->header_lock);
-			_starpu_release_data_on_node(handle, 0, local_replicate);
+			_starpu_release_data_on_node(handle, 0, STARPU_NONE, local_replicate);
 		}
 		}
 	}
 	}
 
 

+ 1 - 0
src/datawizard/coherency.h

@@ -316,6 +316,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 /** This releases a reference on the handle */
 /** This releases a reference on the handle */
 void _starpu_release_data_on_node(struct _starpu_data_state *state, uint32_t default_wt_mask,
 void _starpu_release_data_on_node(struct _starpu_data_state *state, uint32_t default_wt_mask,
+				  enum starpu_data_access_mode down_to_mode,
 				  struct _starpu_data_replicate *replicate);
 				  struct _starpu_data_replicate *replicate);
 
 
 void _starpu_update_data_state(starpu_data_handle_t handle,
 void _starpu_update_data_state(starpu_data_handle_t handle,

+ 4 - 4
src/datawizard/copy_driver.c

@@ -373,8 +373,8 @@ int starpu_interface_copy3d(uintptr_t src, size_t src_offset, unsigned src_node,
 	STARPU_ASSERT_MSG(ld2_src >= numblocks_1 * ld1_src, "block group size %lu is bigger than group ld %lu in source", (unsigned long) numblocks_1 * ld1_src, (unsigned long) ld2_src);
 	STARPU_ASSERT_MSG(ld2_src >= numblocks_1 * ld1_src, "block group size %lu is bigger than group ld %lu in source", (unsigned long) numblocks_1 * ld1_src, (unsigned long) ld2_src);
 	STARPU_ASSERT_MSG(ld2_dst >= numblocks_1 * ld1_dst, "block group size %lu is bigger than group ld %lu in destination", (unsigned long) numblocks_1 * ld1_dst, (unsigned long) ld2_dst);
 	STARPU_ASSERT_MSG(ld2_dst >= numblocks_1 * ld1_dst, "block group size %lu is bigger than group ld %lu in destination", (unsigned long) numblocks_1 * ld1_dst, (unsigned long) ld2_dst);
 
 
-	if (ld1_src * ld2_src == blocksize * numblocks_1 &&
+	if (ld2_src == blocksize * numblocks_1 &&
-	    ld1_dst * ld2_dst == blocksize * numblocks_1)
+	    ld2_dst == blocksize * numblocks_1)
 		/* Optimize contiguous case */
 		/* Optimize contiguous case */
 		return starpu_interface_copy(src, src_offset, src_node,
 		return starpu_interface_copy(src, src_offset, src_node,
 					     dst, dst_offset, dst_node,
 					     dst, dst_offset, dst_node,
@@ -423,8 +423,8 @@ int starpu_interface_copy4d(uintptr_t src, size_t src_offset, unsigned src_node,
 	STARPU_ASSERT_MSG(ld3_src >= numblocks_2 * ld2_src, "block group group size %lu is bigger than group group ld %lu in source", (unsigned long) numblocks_2 * ld2_src, (unsigned long) ld3_src);
 	STARPU_ASSERT_MSG(ld3_src >= numblocks_2 * ld2_src, "block group group size %lu is bigger than group group ld %lu in source", (unsigned long) numblocks_2 * ld2_src, (unsigned long) ld3_src);
 	STARPU_ASSERT_MSG(ld3_dst >= numblocks_2 * ld2_dst, "block group group size %lu is bigger than group group ld %lu in destination", (unsigned long) numblocks_2 * ld2_dst, (unsigned long) ld3_dst);
 	STARPU_ASSERT_MSG(ld3_dst >= numblocks_2 * ld2_dst, "block group group size %lu is bigger than group group ld %lu in destination", (unsigned long) numblocks_2 * ld2_dst, (unsigned long) ld3_dst);
 
 
-	if (ld1_src * ld2_src * ld3_src == blocksize * numblocks_1 * numblocks_2 &&
+	if (ld3_src == blocksize * numblocks_1 * numblocks_2 &&
-	    ld1_dst * ld2_dst * ld3_dst == blocksize * numblocks_1 * numblocks_2)
+	    ld3_dst == blocksize * numblocks_1 * numblocks_2)
 		/* Optimize contiguous case */
 		/* Optimize contiguous case */
 		return starpu_interface_copy(src, src_offset, src_node,
 		return starpu_interface_copy(src, src_offset, src_node,
 					     dst, dst_offset, dst_node,
 					     dst, dst_offset, dst_node,

+ 2 - 2
src/datawizard/interfaces/data_interface.c

@@ -749,12 +749,12 @@ void _starpu_check_if_valid_and_fetch_data_on_node(starpu_data_handle_t handle,
 	{
 	{
 		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, STARPU_FETCH, 0, NULL, NULL, 0, origin);
 		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, STARPU_FETCH, 0, NULL, NULL, 0, origin);
 		STARPU_ASSERT(!ret);
 		STARPU_ASSERT(!ret);
-		_starpu_release_data_on_node(handle, handle->home_node, replicate);
+		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate);
 	}
 	}
 	else
 	else
 	{
 	{
 		_starpu_spin_lock(&handle->header_lock);
 		_starpu_spin_lock(&handle->header_lock);
-		if (!_starpu_notify_data_dependencies(handle))
+		if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
 			_starpu_spin_unlock(&handle->header_lock);
 			_starpu_spin_unlock(&handle->header_lock);
 	}
 	}
 }
 }

+ 28 - 8
src/datawizard/user_interactions.c

@@ -485,16 +485,26 @@ int starpu_data_acquire_try(starpu_data_handle_t handle, enum starpu_data_access
 
 
 /* This function must be called after starpu_data_acquire so that the
 /* This function must be called after starpu_data_acquire so that the
  * application release the data */
  * application release the data */
-void starpu_data_release_on_node(starpu_data_handle_t handle, int node)
+void starpu_data_release_to_on_node(starpu_data_handle_t handle, enum starpu_data_access_mode mode, int node)
 {
 {
 	STARPU_ASSERT(handle);
 	STARPU_ASSERT(handle);
 
 
+	if (mode == STARPU_RW)
+		/* They are equivalent here, and current_mode is never STARPU_RW */
+		mode = STARPU_W;
+
+	STARPU_ASSERT_MSG(mode == STARPU_NONE ||
+			  mode == handle->current_mode ||
+			  (mode == STARPU_R &&
+			     handle->current_mode == STARPU_W),
+		"We only support releasing from W to R");
+
 	/* In case there are some implicit dependencies, unlock the "post sync" tasks */
 	/* In case there are some implicit dependencies, unlock the "post sync" tasks */
-	_starpu_unlock_post_sync_tasks(handle);
+	_starpu_unlock_post_sync_tasks(handle, mode);
 
 
 	/* The application can now release the rw-lock */
 	/* The application can now release the rw-lock */
 	if (node >= 0)
 	if (node >= 0)
-		_starpu_release_data_on_node(handle, 0, &handle->per_node[node]);
+		_starpu_release_data_on_node(handle, 0, mode, &handle->per_node[node]);
 	else
 	else
 	{
 	{
 		_starpu_spin_lock(&handle->header_lock);
 		_starpu_spin_lock(&handle->header_lock);
@@ -505,17 +515,27 @@ void starpu_data_release_on_node(starpu_data_handle_t handle, int node)
 				handle->per_node[i].refcnt--;
 				handle->per_node[i].refcnt--;
 		}
 		}
 		handle->busy_count--;
 		handle->busy_count--;
-		if (!_starpu_notify_data_dependencies(handle))
+		if (!_starpu_notify_data_dependencies(handle, mode))
 			_starpu_spin_unlock(&handle->header_lock);
 			_starpu_spin_unlock(&handle->header_lock);
 	}
 	}
 }
 }
 
 
-void starpu_data_release(starpu_data_handle_t handle)
+void starpu_data_release_on_node(starpu_data_handle_t handle, int node)
+{
+	starpu_data_release_to_on_node(handle, STARPU_NONE, node);
+}
+
+void starpu_data_release_to(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 {
 {
 	int home_node = handle->home_node;
 	int home_node = handle->home_node;
 	if (home_node < 0)
 	if (home_node < 0)
 		home_node = STARPU_MAIN_RAM;
 		home_node = STARPU_MAIN_RAM;
-	starpu_data_release_on_node(handle, home_node);
+	starpu_data_release_to_on_node(handle, mode, home_node);
+}
+
+void starpu_data_release(starpu_data_handle_t handle)
+{
+	starpu_data_release_to(handle, STARPU_NONE);
 }
 }
 
 
 static void _prefetch_data_on_node(void *arg)
 static void _prefetch_data_on_node(void *arg)
@@ -531,7 +551,7 @@ static void _prefetch_data_on_node(void *arg)
 		_starpu_data_acquire_wrapper_finished(wrapper);
 		_starpu_data_acquire_wrapper_finished(wrapper);
 
 
 	_starpu_spin_lock(&handle->header_lock);
 	_starpu_spin_lock(&handle->header_lock);
-	if (!_starpu_notify_data_dependencies(handle))
+	if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
 		_starpu_spin_unlock(&handle->header_lock);
 		_starpu_spin_unlock(&handle->header_lock);
 }
 }
 
 
@@ -581,7 +601,7 @@ int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigne
 		/* In case there was a temporary handle (eg. used for reduction), this
 		/* In case there was a temporary handle (eg. used for reduction), this
 		 * handle may have requested to be destroyed when the data is released
 		 * handle may have requested to be destroyed when the data is released
 		 * */
 		 * */
-		if (!_starpu_notify_data_dependencies(handle))
+		if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
 			_starpu_spin_unlock(&handle->header_lock);
 			_starpu_spin_unlock(&handle->header_lock);
 	}
 	}
 	else if (!async)
 	else if (!async)

+ 1 - 1
src/datawizard/write_back.c

@@ -24,7 +24,7 @@ static void wt_callback(void *arg)
 	starpu_data_handle_t handle = (starpu_data_handle_t) arg;
 	starpu_data_handle_t handle = (starpu_data_handle_t) arg;
 
 
 	_starpu_spin_lock(&handle->header_lock);
 	_starpu_spin_lock(&handle->header_lock);
-	if (!_starpu_notify_data_dependencies(handle))
+	if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
 		_starpu_spin_unlock(&handle->header_lock);
 		_starpu_spin_unlock(&handle->header_lock);
 }
 }
 
 

+ 2 - 2
src/debug/latency.c

@@ -36,7 +36,7 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 		struct _starpu_data_replicate *replicate_0 = &handle->per_node[node0];
 		struct _starpu_data_replicate *replicate_0 = &handle->per_node[node0];
 		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		STARPU_ASSERT(!ret);
 		STARPU_ASSERT(!ret);
-		_starpu_release_data_on_node(handle, node0, replicate_0);
+		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate_0);
 
 
 		_starpu_spin_lock(&handle->header_lock);
 		_starpu_spin_lock(&handle->header_lock);
 		handle->refcnt++;
 		handle->refcnt++;
@@ -46,6 +46,6 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 		struct _starpu_data_replicate *replicate_1 = &handle->per_node[node1];
 		struct _starpu_data_replicate *replicate_1 = &handle->per_node[node1];
 		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		STARPU_ASSERT(!ret);
 		STARPU_ASSERT(!ret);
-		_starpu_release_data_on_node(handle, node1, replicate_1);
+		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate_1);
 	}
 	}
 }
 }

+ 4 - 2
src/debug/traces/starpu_fxt.c

@@ -2307,9 +2307,11 @@ static void handle_start_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 			snprintf(paje_key, sizeof(paje_key), "com_%u", comid);
 			snprintf(paje_key, sizeof(paje_key), "com_%u", comid);
 			program_container_alias(program_container, STARPU_POTI_STR_LEN, prefix);
 			program_container_alias(program_container, STARPU_POTI_STR_LEN, prefix);
 			memmanager_container_alias(src_memnode_container, STARPU_POTI_STR_LEN, prefix, src);
 			memmanager_container_alias(src_memnode_container, STARPU_POTI_STR_LEN, prefix, src);
-			poti_StartLink(time, program_container, link_type, src_memnode_container, paje_value, paje_key);
+			char str_handle[STARPU_POTI_STR_LEN];
+			snprintf(str_handle, sizeof(str_handle), "%lx", handle);
+			poti_user_StartLink(_starpu_poti_CommLinkStart, time, program_container, link_type, src_memnode_container, paje_value, paje_key, 1, str_handle);
 #else
 #else
-			fprintf(out_paje_file, "18	%.9f	%s	%sp	%u	%smm%u	com_%u\n", time, link_type, prefix, size, prefix, src, comid);
+			fprintf(out_paje_file, "24	%.9f	%s	%sp	%u	%smm%u	com_%u	%lx\n", time, link_type, prefix, size, prefix, src, comid, handle);
 #endif
 #endif
 		}
 		}
 
 

+ 1 - 0
src/debug/traces/starpu_fxt.h

@@ -68,6 +68,7 @@ void _starpu_fxt_write_paje_header(FILE *file, struct starpu_fxt_options *option
 extern int _starpu_poti_extendedSetState;
 extern int _starpu_poti_extendedSetState;
 extern int _starpu_poti_semiExtendedSetState;
 extern int _starpu_poti_semiExtendedSetState;
 extern int _starpu_poti_MemoryEvent;
 extern int _starpu_poti_MemoryEvent;
+extern int _starpu_poti_CommLinkStart;
 extern int _starpu_poti_MpiLinkStart;
 extern int _starpu_poti_MpiLinkStart;
 
 
 /*
 /*

+ 4 - 2
src/debug/traces/starpu_fxt_mpi.c

@@ -331,7 +331,9 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 
 
 			char str_mpi_tag[STARPU_POTI_STR_LEN];
 			char str_mpi_tag[STARPU_POTI_STR_LEN];
 			snprintf(str_mpi_tag, sizeof(str_mpi_tag), "%ld", mpi_tag);
 			snprintf(str_mpi_tag, sizeof(str_mpi_tag), "%ld", mpi_tag);
-			poti_user_StartLink(_starpu_poti_MpiLinkStart, start_date, "MPIroot", "MPIL", mpi_container, paje_value, paje_key, 1, str_mpi_tag);
+			char str_handle[STARPU_POTI_STR_LEN];
+			snprintf(str_handle, sizeof(str_handle), "%lx", send_handle);
+			poti_user_StartLink(_starpu_poti_MpiLinkStart, start_date, "MPIroot", "MPIL", mpi_container, paje_value, paje_key, 2, str_mpi_tag, str_handle);
 
 
 			poti_SetVariable(start_date, mpi_container, "bwo_mpi", current_out_bandwidth[src]);
 			poti_SetVariable(start_date, mpi_container, "bwo_mpi", current_out_bandwidth[src]);
 			snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", dst);
 			snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", dst);
@@ -340,7 +342,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 #else
 #else
 			fprintf(out_paje_file, "13	%.9f	%d_mpict	bwo_mpi	%f\n", start_date, src, current_out_bandwidth[src]);
 			fprintf(out_paje_file, "13	%.9f	%d_mpict	bwo_mpi	%f\n", start_date, src, current_out_bandwidth[src]);
 			fprintf(out_paje_file, "13	%.9f	%d_mpict	bwi_mpi	%f\n", start_date, dst, current_in_bandwidth[dst]);
 			fprintf(out_paje_file, "13	%.9f	%d_mpict	bwi_mpi	%f\n", start_date, dst, current_in_bandwidth[dst]);
-			fprintf(out_paje_file, "23	%.9f	MPIL	MPIroot	%lu	%d_mpict	mpicom_%lu	%ld\n", start_date, (unsigned long)size, src, id, mpi_tag);
+			fprintf(out_paje_file, "23	%.9f	MPIL	MPIroot	%lu	%d_mpict	mpicom_%lu	%ld	%lx\n", start_date, (unsigned long)size, src, id, mpi_tag, send_handle);
 			fprintf(out_paje_file, "19	%.9f	MPIL	MPIroot	%lu	%d_mpict	mpicom_%lu\n", end_date, (unsigned long)size, dst, id);
 			fprintf(out_paje_file, "19	%.9f	MPIL	MPIroot	%lu	%d_mpict	mpicom_%lu\n", end_date, (unsigned long)size, dst, id);
 #endif
 #endif
 
 

+ 7 - 4
src/debug/traces/starpu_paje.c

@@ -28,6 +28,7 @@
 int _starpu_poti_extendedSetState = -1;
 int _starpu_poti_extendedSetState = -1;
 int _starpu_poti_semiExtendedSetState = -1;
 int _starpu_poti_semiExtendedSetState = -1;
 int _starpu_poti_MemoryEvent = -1;
 int _starpu_poti_MemoryEvent = -1;
+int _starpu_poti_CommLinkStart = -1;
 int _starpu_poti_MpiLinkStart = -1;
 int _starpu_poti_MpiLinkStart = -1;
 #endif
 #endif
 #endif
 #endif
@@ -62,6 +63,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 						     "SubmitOrder string"
 						     "SubmitOrder string"
 						     );
 						     );
 #ifdef HAVE_POTI_USER_NEWEVENT
 #ifdef HAVE_POTI_USER_NEWEVENT
+	_starpu_poti_CommLinkStart = poti_header_DeclareEvent(PAJE_StartLink, 1, "Handle string");
 	if (options->memory_states)
 	if (options->memory_states)
 	{
 	{
 		_starpu_poti_MemoryEvent = poti_header_DeclareEvent (PAJE_NewEvent,
 		_starpu_poti_MemoryEvent = poti_header_DeclareEvent (PAJE_NewEvent,
@@ -71,7 +73,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 							     "Size string",
 							     "Size string",
 							     "Dest string");
 							     "Dest string");
 	}
 	}
-	_starpu_poti_MpiLinkStart = poti_header_DeclareEvent(PAJE_StartLink, 1, "MPITAG string");
+	_starpu_poti_MpiLinkStart = poti_header_DeclareEvent(PAJE_StartLink, 2, "MPITAG string", "Handle string");
 #endif
 #endif
 #else
 #else
 	poti_header(1,1);
 	poti_header(1,1);
@@ -230,15 +232,16 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 	fprintf(file, "%%	StartContainer	string\n");
 	fprintf(file, "%%	StartContainer	string\n");
 	fprintf(file, "%%	Key	string\n");
 	fprintf(file, "%%	Key	string\n");
 	fprintf(file, "%%	MPITAG	string\n");
 	fprintf(file, "%%	MPITAG	string\n");
+	fprintf(file, "%%	Handle	string\n");
 	fprintf(file, "%%EndEventDef\n");
 	fprintf(file, "%%EndEventDef\n");
-	fprintf(file, "%%EventDef	PajeEndLink	24\n");
+	fprintf(file, "%%EventDef	PajeStartLink	24\n");
 	fprintf(file, "%%	Time	date\n");
 	fprintf(file, "%%	Time	date\n");
 	fprintf(file, "%%	Type	string\n");
 	fprintf(file, "%%	Type	string\n");
 	fprintf(file, "%%	Container	string\n");
 	fprintf(file, "%%	Container	string\n");
 	fprintf(file, "%%	Value	string\n");
 	fprintf(file, "%%	Value	string\n");
-	fprintf(file, "%%	EndContainer	string\n");
+	fprintf(file, "%%	StartContainer	string\n");
 	fprintf(file, "%%	Key	string\n");
 	fprintf(file, "%%	Key	string\n");
-	fprintf(file, "%%	MPITAG	string\n");
+	fprintf(file, "%%	Handle	string\n");
 	fprintf(file, "%%EndEventDef\n");
 	fprintf(file, "%%EndEventDef\n");
 #endif
 #endif
 
 

+ 12 - 0
tests/Makefile.am

@@ -268,6 +268,7 @@ myPROGRAMS +=				\
 	datawizard/acquire_cb_insert		\
 	datawizard/acquire_cb_insert		\
 	datawizard/acquire_release		\
 	datawizard/acquire_release		\
 	datawizard/acquire_release2		\
 	datawizard/acquire_release2		\
+	datawizard/acquire_release_to		\
 	datawizard/acquire_try			\
 	datawizard/acquire_try			\
 	datawizard/bcsr				\
 	datawizard/bcsr				\
 	datawizard/cache			\
 	datawizard/cache			\
@@ -494,6 +495,17 @@ datawizard_acquire_release2_SOURCES +=		\
 	datawizard/acquire_release_opencl.c
 	datawizard/acquire_release_opencl.c
 endif
 endif
 
 
+datawizard_acquire_release_to_SOURCES =		\
+	datawizard/acquire_release_to.c
+if STARPU_USE_CUDA
+datawizard_acquire_release_to_SOURCES +=		\
+	datawizard/acquire_release_cuda.cu
+endif
+if STARPU_USE_OPENCL
+datawizard_acquire_release_to_SOURCES +=		\
+	datawizard/acquire_release_opencl.c
+endif
+
 datawizard_scratch_SOURCES =			\
 datawizard_scratch_SOURCES =			\
 	datawizard/scratch.c
 	datawizard/scratch.c
 if STARPU_USE_CUDA
 if STARPU_USE_CUDA

+ 214 - 0
tests/datawizard/acquire_release_to.c

@@ -0,0 +1,214 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "../helper.h"
+
+/*
+ * Check that _release_to correctly interacts with tasks working on the same data
+ */
+
+#ifdef STARPU_QUICK_CHECK
+static unsigned ntasks = 10;
+#elif !defined(STARPU_LONG_CHECK)
+static unsigned ntasks = 1000;
+#else
+static unsigned ntasks = 10000;
+#endif
+
+#ifdef STARPU_USE_CUDA
+extern void increment_cuda(void *descr[], void *_args);
+#endif
+#ifdef STARPU_USE_OPENCL
+extern void increment_opencl(void *buffers[], void *args);
+#endif
+
+void increment_cpu(void *descr[], void *arg)
+{
+	(void)arg;
+	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	(*tokenptr)++;
+}
+
+static struct starpu_codelet increment_cl =
+{
+	.modes = { STARPU_RW },
+	.cpu_funcs = {increment_cpu},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {increment_cuda},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {increment_opencl},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
+#endif
+	.cpu_funcs_name = {"increment_cpu"},
+	.nbuffers = 1
+};
+
+void check_cpu(void *descr[], void *arg)
+{
+	unsigned *val = arg;
+	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	STARPU_ASSERT(*tokenptr == *val);
+}
+
+static struct starpu_codelet check_cl =
+{
+	.modes = { STARPU_R },
+	.cpu_funcs = {check_cpu},
+	.cpu_funcs_name = {"increment_cpu"},
+	.nbuffers = 1
+};
+
+unsigned token = 0;
+starpu_data_handle_t token_handle;
+
+static
+int increment_token(void)
+{
+	int ret;
+	struct starpu_task *task = starpu_task_create();
+	task->cl = &increment_cl;
+	task->handles[0] = token_handle;
+	ret = starpu_task_submit(task);
+	return ret;
+}
+
+static
+int check_token(unsigned value)
+{
+	unsigned *value_p;
+	int ret;
+	struct starpu_task *task = starpu_task_create();
+	task->cl = &check_cl;
+	task->handles[0] = token_handle;
+	task->cl_arg = value_p = malloc(sizeof(*value_p));
+	task->cl_arg_size = sizeof(*value_p);
+	task->cl_arg_free = 1;
+	*value_p = value;
+	ret = starpu_task_submit(task);
+	return ret;
+}
+
+static
+void callback(void *arg)
+{
+	(void)arg;
+	token++;
+	starpu_data_release_to(token_handle, STARPU_W);
+	starpu_sleep(0.001);
+	starpu_data_release_to(token_handle, STARPU_R);
+	starpu_sleep(0.001);
+	starpu_data_release(token_handle);
+}
+
+#ifdef STARPU_USE_OPENCL
+struct starpu_opencl_program opencl_program;
+#endif
+int main(int argc, char **argv)
+{
+	unsigned i;
+	int ret;
+
+        ret = starpu_initialize(NULL, &argc, &argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_load_opencl_from_file("tests/datawizard/acquire_release_opencl_kernel.cl",
+						  &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+#endif
+	starpu_variable_data_register(&token_handle, STARPU_MAIN_RAM, (uintptr_t)&token, sizeof(unsigned));
+
+        FPRINTF(stderr, "Token: %u\n", token);
+
+	for(i=0; i<ntasks; i++)
+	{
+		/* synchronize data in RAM */
+                ret = starpu_data_acquire(token_handle, STARPU_RW);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+
+                token ++;
+
+		ret = check_token(4*i+1);
+		if (ret == -ENODEV) goto enodev_release;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+		ret = increment_token();
+		if (ret == -ENODEV) goto enodev_release;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+		ret = check_token(4*i+2);
+		if (ret == -ENODEV) goto enodev_release;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+		starpu_sleep(0.001);
+		starpu_data_release_to(token_handle, STARPU_W);
+
+		starpu_sleep(0.001);
+		starpu_data_release_to(token_handle, STARPU_R);
+
+		starpu_sleep(0.001);
+		starpu_data_release(token_handle);
+
+		ret = starpu_data_acquire_cb(token_handle, STARPU_RW, callback, NULL);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire_cb");
+
+		ret = check_token(4*i+3);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+		ret = increment_token();
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+		ret = check_token(4*i+4);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	}
+
+	starpu_data_unregister(token_handle);
+
+#ifdef STARPU_USE_OPENCL
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
+#endif
+	starpu_shutdown();
+
+        FPRINTF(stderr, "Token: %u\n", token);
+	if (token == ntasks * 4)
+		ret = EXIT_SUCCESS;
+	else
+		ret = EXIT_FAILURE;
+	return ret;
+
+enodev_release:
+	starpu_data_release(token_handle);
+enodev:
+	starpu_data_unregister(token_handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+#ifdef STARPU_USE_OPENCL
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
+#endif
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+}

+ 3 - 3
tests/datawizard/interfaces/block/block_opencl.c

@@ -83,12 +83,12 @@ test_block_opencl_func(void *buffers[], void *args)
 	}
 	}
 			
 			
 	{
 	{
-		size_t global = nx * ny * nz;
+		size_t global[3] = {nx, ny, nz};
 		err = clEnqueueNDRangeKernel(queue,
 		err = clEnqueueNDRangeKernel(queue,
 					     kernel,
 					     kernel,
-					     1,
+					     3,
 					     NULL,
 					     NULL,
-					     &global,
+					     global,
 					     NULL,
 					     NULL,
 					     0,
 					     0,
 					     NULL,
 					     NULL,

+ 15 - 23
tests/datawizard/interfaces/block/block_opencl_kernel.cl

@@ -18,29 +18,21 @@ __kernel void block_opencl(__global int *block,
 			   int ldy, int ldz,
 			   int ldy, int ldz,
 			   int factor, __global int *err)
 			   int factor, __global int *err)
 {
 {
-        const int id = get_global_id(0);
+	const int idx = get_global_id(0);
-	if (id > 0)
+	const int idy = get_global_id(1);
+	const int idz = get_global_id(2);
+	if (idx >= nx)
 		return;
 		return;
+	if (idy >= ny)
+		return;
+	if (idz >= nz)
+		return;
+
+	int val = idz*ny*nx+idy*nx+idx;
+	int i = (idz*ldz)+(idy*ldy)+idx;
 
 
-	unsigned int i, j, k;
+	if (block[i] != factor * val)
-	int val = 0;
+		*err = 1;
-	for (k = 0; k < nz; k++)
+	else
-	{
+		block[i] *= -1;
-		for (j = 0; j < ny; j++)
-		{
-			for (i = 0; i < nx; i++)
-			{
-                                if (block[(k*ldz)+(j*ldy)+i] != factor * val)
-				{
-					*err = 1;
-					return;
-				}
-				else
-				{
-					block[(k*ldz)+(j*ldy)+i] *= -1;
-					val++;
-				}
-			}
-		}
-	}
 }
 }

+ 1 - 1
tests/datawizard/interfaces/tensor/tensor_interface.c

@@ -18,7 +18,7 @@
 #include "../test_interfaces.h"
 #include "../test_interfaces.h"
 #include "../../../helper.h"
 #include "../../../helper.h"
 
 
-#define NX 16
+#define NX 4
 #define NY NX
 #define NY NX
 #define NZ NX
 #define NZ NX
 #define NT NX
 #define NT NX

+ 3 - 3
tests/datawizard/interfaces/tensor/tensor_opencl.c

@@ -87,12 +87,12 @@ test_tensor_opencl_func(void *buffers[], void *args)
 	}
 	}
 			
 			
 	{
 	{
-                size_t global = 1;
+		size_t global[3] = {nx, ny, nz*nt};
 		err = clEnqueueNDRangeKernel(queue,
 		err = clEnqueueNDRangeKernel(queue,
 					     kernel,
 					     kernel,
-					     1,
+					     3,
 					     NULL,
 					     NULL,
-					     &global,
+					     global,
 					     NULL,
 					     NULL,
 					     0,
 					     0,
 					     NULL,
 					     NULL,

+ 18 - 26
tests/datawizard/interfaces/tensor/tensor_opencl_kernel.cl

@@ -18,32 +18,24 @@ __kernel void tensor_opencl(__global int *tensor,
 			   int ldy, int ldz, int ldt,
 			   int ldy, int ldz, int ldt,
 			   int factor, __global int *err)
 			   int factor, __global int *err)
 {
 {
-        const int id = get_global_id(0);
+	const int idx = get_global_id(0);
-	if (id > 0)
+	const int idy = get_global_id(1);
+	const int idz = get_global_id(2) % nz;
+	const int idt = get_global_id(2) / nz;
+	if (idx >= nx)
 		return;
 		return;
+	if (idy >= ny)
+		return;
+	if (idz >= nz)
+		return;
+	if (idt >= nt)
+		return;
+
+	int val = idt*nz*ny*nx+idz*ny*nx+idy*nx+idx;
+	int i = (idt*ldt)+(idz*ldz)+(idy*ldy)+idx;
 
 
-	unsigned int i, j, k, l;
+	if (tensor[i] != factor * val)
-	int val = 0;
+		*err = 1;
-	for (l = 0; l < nt; l++)
+	else
-	{
+		tensor[i] *= -1;
-	    for (k = 0; k < nz; k++)
-	    {
-		for (j = 0; j < ny; j++)
-		{
-			for (i = 0; i < nx; i++)
-			{
-                                if (tensor[(l*ldt)+(k*ldz)+(j*ldy)+i] != factor * val)
-				{
-					*err = 1;
-					return;
-				}
-				else
-				{
-					tensor[(l*ldt)+(k*ldz)+(j*ldy)+i] *= -1;
-					val++;
-				}
-			}
-		}
-	    }
-	}
 }
 }

+ 8 - 0
tests/energy/energy_efficiency.c

@@ -320,11 +320,19 @@ int main(int argc, char *argv[])
 
 
 	unsigned N, k, m, n, iter, NITER;
 	unsigned N, k, m, n, iter, NITER;
 	if (argc < 2)
 	if (argc < 2)
+#ifdef STARPU_QUICK_CHECK
+		N = 10;
+#else
 		N = 40;
 		N = 40;
+#endif
 	else
 	else
 		N = atoi(argv[1]);
 		N = atoi(argv[1]);
 	if (argc < 3)
 	if (argc < 3)
+#ifdef STARPU_QUICK_CHECK
+		NITER = 3;
+#else
 		NITER = 10;
 		NITER = 10;
+#endif
 	else
 	else
 		NITER = atoi(argv[2]);
 		NITER = atoi(argv[2]);
 	if (N == 0)
 	if (N == 0)

+ 1 - 0
tests/overlap/overlap.c

@@ -135,6 +135,7 @@ int main(int argc, char **argv)
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 	}
 
 
+	starpu_do_schedule();
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	if (!finished)
 	if (!finished)
 		STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
 		STARPU_PTHREAD_COND_WAIT(&cond, &mutex);

+ 95 - 65
tools/gdbinit

@@ -39,33 +39,6 @@ define starpu-print-task
   set $task = (struct starpu_task *)$arg0
   set $task = (struct starpu_task *)$arg0
   set $job = (struct _starpu_job *)$task->starpu_private
   set $job = (struct _starpu_job *)$task->starpu_private
   set $status=0
   set $status=0
-  if $task->status == 0
-    set $status="STARPU_TASK_INIT"
-  end
-  if $task->status == 1
-    set $status="STARPU_TASK_BLOCKED"
-  end
-  if $task->status == 2
-    set $status="STARPU_TASK_READY"
-  end
-  if $task->status == 3
-    set $status="STARPU_TASK_RUNNING"
-  end
-  if $task->status == 4
-    set $status="STARPU_TASK_FINISHED"
-  end
-  if $task->status == 5
-    set $status="STARPU_TASK_BLOCKED_ON_TAG"
-  end
-  if $task->status == 6
-    set $status="STARPU_TASK_BLOCKED_ON_TASK"
-  end
-  if $task->status == 7
-    set $status="STARPU_TASK_BLOCKED_ON_DATA"
-  end
-  if $task->status == 8
-    set $status="STARPU_TASK_STOPPED"
-  end
 
 
   printf "StarPU Task (%p)\n", $task
   printf "StarPU Task (%p)\n", $task
   if $task->name
   if $task->name
@@ -81,13 +54,42 @@ define starpu-print-task
   end
   end
   printf "\tnbuffers:\t\t\t<%d>\n", $nbuffers
   printf "\tnbuffers:\t\t\t<%d>\n", $nbuffers
   printf "\tcallback:\t\t\t<%p>\n", $task->callback_func
   printf "\tcallback:\t\t\t<%p>\n", $task->callback_func
+  printf "\tcl_arg:\t\t\t<%p>\n", $task->cl_arg
   printf "\tsynchronous:\t\t\t<%d>\n", $task->synchronous
   printf "\tsynchronous:\t\t\t<%d>\n", $task->synchronous
   printf "\texecute_on_a_specific_worker:\t<%d>\n", $task->execute_on_a_specific_worker
   printf "\texecute_on_a_specific_worker:\t<%d>\n", $task->execute_on_a_specific_worker
   printf "\tworkerid:\t\t\t<%d>\n", $task->workerid
   printf "\tworkerid:\t\t\t<%d>\n", $task->workerid
   printf "\tdetach:\t\t\t\t<%d>\n", $task->detach
   printf "\tdetach:\t\t\t\t<%d>\n", $task->detach
   printf "\tdestroy:\t\t\t<%d>\n", $task->destroy
   printf "\tdestroy:\t\t\t<%d>\n", $task->destroy
   printf "\tregenerate:\t\t\t<%d>\n", $task->regenerate
   printf "\tregenerate:\t\t\t<%d>\n", $task->regenerate
-  printf "\tstatus:\t\t\t\t<%s>\n", $status
+  printf "\tstatus:\t\t\t\t"
+  if $task->status == 0
+    printf "STARPU_TASK_INIT"
+  end
+  if $task->status == 1
+    printf "STARPU_TASK_BLOCKED"
+  end
+  if $task->status == 2
+    printf "STARPU_TASK_READY"
+  end
+  if $task->status == 3
+    printf "STARPU_TASK_RUNNING"
+  end
+  if $task->status == 4
+    printf "STARPU_TASK_FINISHED"
+  end
+  if $task->status == 5
+    printf "STARPU_TASK_BLOCKED_ON_TAG"
+  end
+  if $task->status == 6
+    printf "STARPU_TASK_BLOCKED_ON_TASK"
+  end
+  if $task->status == 7
+    printf "STARPU_TASK_BLOCKED_ON_DATA"
+  end
+  if $task->status == 8
+    printf "STARPU_TASK_STOPPED"
+  end
+  printf "\n"
   printf "\tjob:\t\t\t\t<%p>\n", $job
   printf "\tjob:\t\t\t\t<%p>\n", $job
   printf "\ttag_id:\t\t\t\t<%d>\n", $task->tag_id
   printf "\ttag_id:\t\t\t\t<%d>\n", $task->tag_id
   printf "\tndeps:\t\t\t\t<%u>\n", $job->job_successors->ndeps
   printf "\tndeps:\t\t\t\t<%u>\n", $job->job_successors->ndeps
@@ -169,35 +171,36 @@ define starpu-workers
   printf "[Id] Name                                     Arch Mask Devid Bindid Workerid Isrunning Isinitialized Status\n"
   printf "[Id] Name                                     Arch Mask Devid Bindid Workerid Isrunning Isinitialized Status\n"
   while $num<_starpu_config->topology->nworkers
   while $num<_starpu_config->topology->nworkers
     set $worker=&_starpu_config->workers[$num]
     set $worker=&_starpu_config->workers[$num]
+    printf "[%2d] %-40s %-4d %-4d %-5d %-6d %-8d %-9d %-13d ", $num, $worker->name, $worker->arch, $worker->worker_mask, \
+          $worker->devid, $worker->bindid, $worker->workerid, $worker->worker_is_running, $worker->worker_is_initialized
     if $worker->status == STATUS_INVALID
     if $worker->status == STATUS_INVALID
-      set $status="INVALID"
+      printf "INVALID"
     end
     end
     if $worker->status == STATUS_UNKNOWN
     if $worker->status == STATUS_UNKNOWN
-      set $status="UNKNOWN"
+      printf "UNKNOWN"
     end
     end
     if $worker->status == STATUS_INITIALIZING
     if $worker->status == STATUS_INITIALIZING
-      set $status="INITIALIZING"
+      printf "INITIALIZING"
     end
     end
     if $worker->status == STATUS_EXECUTING
     if $worker->status == STATUS_EXECUTING
-      set $status="EXECUTING"
+      printf "EXECUTING"
     end
     end
     if $worker->status == STATUS_CALLBACK
     if $worker->status == STATUS_CALLBACK
-      set $status="CALLBACK"
+      printf "CALLBACK"
     end
     end
     if $worker->status == STATUS_SCHEDULING
     if $worker->status == STATUS_SCHEDULING
-      set $status="SCHEDULING"
+      printf "SCHEDULING"
     end
     end
     if $worker->status == STATUS_WAITING
     if $worker->status == STATUS_WAITING
-      set $status="WAITING"
+      printf "WAITING"
     end
     end
     if $worker->status == STATUS_SLEEPING_SCHEDULING
     if $worker->status == STATUS_SLEEPING_SCHEDULING
-      set $status="SLEEPING_SCHEDULING"
+      printf "SLEEPING_SCHEDULING"
     end
     end
     if $worker->status == STATUS_SLEEPING
     if $worker->status == STATUS_SLEEPING
-      set $status="SLEEPING"
+      printf "SLEEPING"
     end
     end
-    printf "[%2d] %-40s %-4d %-4d %-5d %-6d %-8d %-9d %-13d %s\n", $num, $worker->name, $worker->arch, $worker->worker_mask, \
+    printf "\n"
-          $worker->devid, $worker->bindid, $worker->workerid, $worker->worker_is_running, $worker->worker_is_initialized, $status
     set $num = $num + 1
     set $num = $num + 1
   end
   end
 end
 end
@@ -205,23 +208,24 @@ end
 define starpu-print-tag
 define starpu-print-tag
   set language c
   set language c
   set $tag_struct = (struct _starpu_tag *)_gettag_struct($arg0)
   set $tag_struct = (struct _starpu_tag *)_gettag_struct($arg0)
+  printf "tag %d ((struct _starpu_tag *) %p)\n", $arg0, $tag_struct
+  printf "\tstate "
   if $tag_struct->state == STARPU_INVALID_STATE
   if $tag_struct->state == STARPU_INVALID_STATE
-     set $status="STARPU_INVALID_STATE"
+     printf "STARPU_INVALID_STATE"
   end
   end
   if $tag_struct->state == STARPU_ASSOCIATED
   if $tag_struct->state == STARPU_ASSOCIATED
-     set $status="STARPU_ASSOCIATED"
+     printf "STARPU_ASSOCIATED"
   end
   end
   if $tag_struct->state == STARPU_BLOCKED
   if $tag_struct->state == STARPU_BLOCKED
-     set $status="STARPU_BLOCKED"
+     printf "STARPU_BLOCKED"
   end
   end
   if $tag_struct->state == STARPU_READY
   if $tag_struct->state == STARPU_READY
-     set $status="STARPU_READY"
+     printf "STARPU_READY"
   end
   end
   if $tag_struct->state == STARPU_DONE
   if $tag_struct->state == STARPU_DONE
-     set $status="STARPU_DONE"
+     printf "STARPU_DONE"
   end
   end
-  printf "tag %d ((struct _starpu_tag *) %p)\n", $arg0, $tag_struct
+  printf "\n"
-  printf "\tstate %s\n", $status
   printf "\tdeps %d\n", $tag_struct->tag_successors.ndeps
   printf "\tdeps %d\n", $tag_struct->tag_successors.ndeps
   printf "\tcompleted %d\n", $tag_struct->tag_successors.ndeps_completed
   printf "\tcompleted %d\n", $tag_struct->tag_successors.ndeps_completed
   printf "\tndeps_remaining:\t\t<%u>\n", $tag_struct->tag_successors->ndeps - $tag_struct->tag_successors->ndeps_completed
   printf "\tndeps_remaining:\t\t<%u>\n", $tag_struct->tag_successors->ndeps - $tag_struct->tag_successors->ndeps_completed
@@ -317,7 +321,11 @@ define starpu-all-tasks
     while $l != &all_jobs_list
     while $l != &all_jobs_list
       set $j = (struct _starpu_job*) (((unsigned long) $l) - ((unsigned long) &((struct _starpu_job *)0)->all_submitted))
       set $j = (struct _starpu_job*) (((unsigned long) $l) - ((unsigned long) &((struct _starpu_job *)0)->all_submitted))
       set $task = $j->task
       set $task = $j->task
-      printf "task %p %s\n", $task, $task->name ? $task->name : ""
+      if $task->name
+        printf "task %p %s\n", $task, $task->name
+      else
+        printf "task %p\n", $task
+      end
       set $l = $l->next
       set $l = $l->next
     end
     end
   end
   end
@@ -915,9 +923,9 @@ end
 define starpu-sched-print-component
 define starpu-sched-print-component
     set $c = (struct starpu_sched_component *) $arg1
     set $c = (struct starpu_sched_component *) $arg1
     starpu-print-spaces $arg0
     starpu-print-spaces $arg0
-    printf "%s %s %s (struct starpu_sched_component *) %p\n", $c->name, $c->properties & STARPU_SCHED_COMPONENT_HOMOGENEOUS ? "homogeneous":"heterogeneous", $c->properties & STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE ? "single-node" : "multi-node", $c
+    printf "%s %c %c (struct starpu_sched_component *) %p\n", $c->name, $c->properties & STARPU_SCHED_COMPONENT_HOMOGENEOUS ? 'o':'e', $c->properties & STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE ? 's' : 'm', $c
     if $c->push_task == fifo_push_task
     if $c->push_task == fifo_push_task
-      set $f = ((struct _starpu_fifo_data *) $c->data)->fifo
+      set $f = &((struct _starpu_fifo_data *) $c->data)->fifo
       starpu-print-spaces $arg0
       starpu-print-spaces $arg0
       printf "%d tasks start %f len %f end %f processed %d\n", $f->ntasks, $f->exp_start, $f->exp_len, $f->exp_end, $f->nprocessed
       printf "%d tasks start %f len %f end %f processed %d\n", $f->ntasks, $f->exp_start, $f->exp_len, $f->exp_end, $f->nprocessed
     end
     end
@@ -951,29 +959,29 @@ end
 
 
 define starpu-mpi-print-request
 define starpu-mpi-print-request
     set $request = (struct _starpu_mpi_req *)$arg0
     set $request = (struct _starpu_mpi_req *)$arg0
-    set $request_type = "unknown_type"
+    printf "Request (struct _starpu_mpi_req *) %p data %p tag %d to MPI node %d type ", $request, $request->data_handle, $request->data_handle && $request->data_handle->mpi_data ? ((struct _starpu_mpi_node_tag *) ($request->data_handle->mpi_data))->data_tag : -1, $request->node_tag.node.rank,
     if $request->request_type == SEND_REQ
     if $request->request_type == SEND_REQ
-       set $request_type = "SEND_REQ"
+       printf "SEND_REQ"
     end
     end
     if $request->request_type == RECV_REQ
     if $request->request_type == RECV_REQ
-       set $request_type = "RECV_REQ"
+       printf "RECV_REQ"
     end
     end
     if $request->request_type == WAIT_REQ
     if $request->request_type == WAIT_REQ
-       set $request_type = "WAIT_REQ"
+       printf "WAIT_REQ"
     end
     end
     if $request->request_type == TEST_REQ
     if $request->request_type == TEST_REQ
-       set $request_type = "TEST_REQ"
+       printf "TEST_REQ"
     end
     end
     if $request->request_type == BARRIER_REQ
     if $request->request_type == BARRIER_REQ
-       set $request_type = "BARRIER_REQ"
+       printf "BARRIER_REQ"
     end
     end
     if $request->request_type == PROBE_REQ
     if $request->request_type == PROBE_REQ
-       set $request_type = "PROBE_REQ"
+       printf "PROBE_REQ"
     end
     end
     if $request->request_type == UNKNOWN_REQ
     if $request->request_type == UNKNOWN_REQ
-       set $request_type = "UNKNOWN_REQ"
+       printf "UNKNOWN_REQ"
     end
     end
-    printf "Request (struct _starpu_mpi_req *) %p data %p tag %d to MPI node %d type %s submitted %d completed %d posted %d detached %d is_internal_req %d\n", $request, $request->data_handle, $request->data_handle ? ((struct _starpu_mpi_node_tag *) ($request->data_handle->mpi_data))->data_tag : -1, $request->node_tag.rank, $request_type, $request->submitted, $request->completed, $request->posted, $request->detached, $request->is_internal_req
+    printf " submitted %d completed %d posted %d detached %d\n", $request->submitted, $request->completed, $request->posted, $request->detached
 end
 end
 
 
 define starpu-mpi-print-ready-recv-requests
 define starpu-mpi-print-ready-recv-requests
@@ -989,17 +997,39 @@ define starpu-mpi-print-ready-recv-requests
     end
     end
 end
 end
 
 
+define starpu-mpi-print-requests-list
+  set $list = $arg0
+  set $request = $list->_head
+  while $request
+    starpu-mpi-print-request $request
+    set $request = $request->_next
+  end
+end
+
+define starpu-mpi-print-requests-tree
+  if $arg0
+    starpu-mpi-print-requests-tree $arg0->children[0]
+    set $stage = (struct _starpu_mpi_req_prio_list_stage *) $arg0
+    starpu-mpi-print-requests-list (&($stage->list))
+    starpu-mpi-print-requests-tree $arg0->children[1]
+  end
+end
+
 define starpu-mpi-print-ready-send-requests
 define starpu-mpi-print-ready-send-requests
-    set $list = (struct _starpu_mpi_req_prio_list) ready_send_requests
+  set $prio_list = (struct _starpu_mpi_req_prio_list) ready_send_requests
-    if $list
+  if _starpu_debug
-	set $request = $list.list._head
+    if $prio_list
-        while $request
+        starpu-mpi-print-requests-list &$prio_list.list
-            starpu-mpi-print-request $request
+    else
-	    set $request = $request->_next
+	printf "No ready send requests\n"
-	end
+    end
+  else
+    if $prio_list.empty == 0
+        starpu-mpi-print-requests-tree $prio_list.tree.root
     else
     else
 	printf "No ready send requests\n"
 	printf "No ready send requests\n"
     end
     end
+  end
 end
 end
 
 
 define starpu-mpi-print-detached-requests
 define starpu-mpi-print-detached-requests