Browse Source

Merge remote-tracking branch 'gitlab/master' into ft_checkpoint

Romain LION 4 years ago
parent
commit
324cffe7ad
53 changed files with 700 additions and 231 deletions
  1. 1 0
      ChangeLog
  2. 2 0
      configure.ac
  3. 1 0
      examples/Makefile.am
  4. 11 0
      examples/api/bcsr_data_interface.c
  5. 9 0
      examples/api/block_data_interface.c
  6. 2 0
      examples/api/coo_data_interface.c
  7. 8 0
      examples/api/csr_data_interface.c
  8. 10 0
      examples/api/matrix_data_interface.c
  9. 2 0
      examples/api/multiformat_data_interface.c
  10. 37 0
      examples/api/tensor_data_interface.c
  11. 5 0
      examples/api/variable_data_interface.c
  12. 8 0
      examples/api/vector_data_interface.c
  13. 2 0
      examples/api/void_data_interface.c
  14. 5 0
      examples/filters/fblock.c
  15. 2 2
      examples/filters/fblock_opencl.c
  16. 13 10
      examples/filters/fblock_opencl_kernel.cl
  17. 19 1
      include/starpu_data.h
  18. 3 1
      mpi/examples/Makefile.am
  19. 11 0
      mpi/examples/mpi_lu/plu_example.c
  20. 11 0
      mpi/examples/mpi_lu/plu_implicit_example.c
  21. 2 4
      mpi/src/mpi/starpu_mpi_mpi.c
  22. 2 2
      mpi/src/starpu_mpi_coop_sends.c
  23. 1 1
      mpi/src/starpu_mpi_fxt.h
  24. 4 16
      src/core/dependencies/cg.c
  25. 31 9
      src/core/dependencies/data_arbiter_concurrency.c
  26. 29 11
      src/core/dependencies/data_concurrency.c
  27. 2 2
      src/core/dependencies/data_concurrency.h
  28. 10 3
      src/core/dependencies/implicit_data_deps.c
  29. 1 1
      src/core/dependencies/implicit_data_deps.h
  30. 16 11
      src/core/dependencies/tags.c
  31. 2 0
      src/core/dependencies/tags.h
  32. 1 1
      src/core/jobs.c
  33. 13 10
      src/datawizard/coherency.c
  34. 1 0
      src/datawizard/coherency.h
  35. 4 4
      src/datawizard/copy_driver.c
  36. 2 2
      src/datawizard/interfaces/data_interface.c
  37. 28 8
      src/datawizard/user_interactions.c
  38. 1 1
      src/datawizard/write_back.c
  39. 2 2
      src/debug/latency.c
  40. 4 2
      src/debug/traces/starpu_fxt.c
  41. 1 0
      src/debug/traces/starpu_fxt.h
  42. 4 2
      src/debug/traces/starpu_fxt_mpi.c
  43. 7 4
      src/debug/traces/starpu_paje.c
  44. 12 0
      tests/Makefile.am
  45. 214 0
      tests/datawizard/acquire_release_to.c
  46. 3 3
      tests/datawizard/interfaces/block/block_opencl.c
  47. 15 23
      tests/datawizard/interfaces/block/block_opencl_kernel.cl
  48. 1 1
      tests/datawizard/interfaces/tensor/tensor_interface.c
  49. 3 3
      tests/datawizard/interfaces/tensor/tensor_opencl.c
  50. 18 26
      tests/datawizard/interfaces/tensor/tensor_opencl_kernel.cl
  51. 8 0
      tests/energy/energy_efficiency.c
  52. 1 0
      tests/overlap/overlap.c
  53. 95 65
      tools/gdbinit

+ 1 - 0
ChangeLog

@@ -42,6 +42,7 @@ New features:
   * Add a task prefetch level, to improve retaining data in accelerators so we
     can make prefetch more aggressive.
   * Add starpu_data_dup_ro().
+  * Add starpu_data_release_to() and starpu_data_release_to_on_node().
 
 Small changes:
   * Add a synthetic energy efficiency testcase.

+ 2 - 0
configure.ac

@@ -151,6 +151,8 @@ AC_ARG_WITH(simgrid-lib-dir,
 	], [simgrid_lib_dir=no])
 
 if test x$enable_simgrid = xyes ; then
+	PKG_CHECK_MODULES([SIMGRID], [simgrid])
+
    	if test -n "$SIMGRID_CFLAGS" ; then
 	   	CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
 	   	CXXFLAGS="$SIMGRID_CFLAGS $CXXFLAGS"

+ 1 - 0
examples/Makefile.am

@@ -206,6 +206,7 @@ STARPU_EXAMPLES +=				\
 	api/csr_data_interface			\
 	api/matrix_data_interface		\
 	api/multiformat_data_interface		\
+	api/tensor_data_interface		\
 	api/variable_data_interface		\
 	api/vector_data_interface		\
 	api/void_data_interface

+ 11 - 0
examples/api/bcsr_data_interface.c

@@ -17,6 +17,17 @@
 // This program checks that the implementation of the BCSR data
 // interface only uses StarPU's public API
 
+#define starpu_interface_bcsr_ops my_starpu_interface_bcsr_ops
+#define starpu_bcsr_data_register my_starpu_bcsr_data_register
+#define starpu_bcsr_get_nnz my_starpu_bcsr_get_nnz
+#define starpu_bcsr_get_nrow my_starpu_bcsr_get_nrow
+#define starpu_bcsr_get_firstentry my_starpu_bcsr_get_firstentry
+#define starpu_bcsr_get_r my_starpu_bcsr_get_r
+#define starpu_bcsr_get_c my_starpu_bcsr_get_c
+#define starpu_bcsr_get_elemsize my_starpu_bcsr_get_elemsize
+#define starpu_bcsr_get_local_nzval my_starpu_bcsr_get_local_nzval
+#define starpu_bcsr_get_local_colind my_starpu_bcsr_get_local_colind
+#define starpu_bcsr_get_local_rowptr my_starpu_bcsr_get_local_rowptr
 #include "../../src/datawizard/interfaces/bcsr_interface.c"
 
 int main()

+ 9 - 0
examples/api/block_data_interface.c

@@ -17,6 +17,15 @@
 // This program checks that the implementation of the block data
 // interface only uses StarPU's public API
 
+#define starpu_interface_block_ops my_starpu_interface_block_ops
+#define starpu_block_data_register my_starpu_block_data_register
+#define starpu_block_get_nx my_starpu_block_get_nx
+#define starpu_block_get_ny my_starpu_block_get_ny
+#define starpu_block_get_nz my_starpu_block_get_nz
+#define starpu_block_get_local_ldy my_starpu_block_get_local_ldy
+#define starpu_block_get_local_ldz my_starpu_block_get_local_ldz
+#define starpu_block_get_local_ptr my_starpu_block_get_local_ptr
+#define starpu_block_get_elemsize my_starpu_block_get_elemsize
 #include "../../src/datawizard/interfaces/block_interface.c"
 
 int main()

+ 2 - 0
examples/api/coo_data_interface.c

@@ -17,6 +17,8 @@
 // This program checks that the implementation of the COO data
 // interface only uses StarPU's public API
 
+#define starpu_interface_coo_ops my_starpu_interface_coo_ops
+#define starpu_coo_data_register my_starpu_coo_data_register
 #include "../../src/datawizard/interfaces/coo_interface.c"
 
 int main()

+ 8 - 0
examples/api/csr_data_interface.c

@@ -17,6 +17,14 @@
 // This program checks that the implementation of the CSR data
 // interface only uses StarPU's public API
 
+#define starpu_interface_csr_ops my_starpu_interface_csr_ops
+#define starpu_csr_data_register my_starpu_csr_data_register
+#define starpu_csr_get_nnz my_starpu_csr_get_nnz
+#define starpu_csr_get_nrow my_starpu_csr_get_nrow
+#define starpu_csr_get_firstentry my_starpu_csr_get_firstentry
+#define starpu_csr_get_local_nzval my_starpu_csr_get_local_nzval
+#define starpu_csr_get_local_colind my_starpu_csr_get_local_colind
+#define starpu_csr_get_local_rowptr my_starpu_csr_get_local_rowptr
 #include "../../src/datawizard/interfaces/csr_interface.c"
 
 int main()

+ 10 - 0
examples/api/matrix_data_interface.c

@@ -17,6 +17,16 @@
 // This program checks that the implementation of the matrix data
 // interface only uses StarPU's public API
 
+#define starpu_interface_matrix_ops my_starpu_interface_matrix_ops
+#define starpu_matrix_data_register my_starpu_matrix_data_register
+#define starpu_matrix_data_register_allocsize my_starpu_matrix_data_register_allocsize
+#define starpu_matrix_ptr_register my_starpu_matrix_data_ptr_register
+#define starpu_matrix_get_nx my_starpu_matrix_get_nx
+#define starpu_matrix_get_ny my_starpu_matrix_get_ny
+#define starpu_matrix_get_local_ld my_starpu_matrix_get_local_ld
+#define starpu_matrix_get_local_ptr my_starpu_matrix_get_local_ptr
+#define starpu_matrix_get_elemsize my_starpu_matrix_get_elemsize
+#define starpu_matrix_get_allocsize my_starpu_matrix_get_allocsize
 #include "../../src/datawizard/interfaces/matrix_interface.c"
 
 int main()

+ 2 - 0
examples/api/multiformat_data_interface.c

@@ -17,6 +17,8 @@
 // This program checks that the implementation of the multiformat data
 // interface only uses StarPU's public API
 
+#define starpu_interface_multiformat_ops my_starpu_interface_multiformat_ops
+#define starpu_multiformat_data_register my_starpu_multiformat_data_register
 #include "../../src/datawizard/interfaces/multiformat_interface.c"
 
 int main()

+ 37 - 0
examples/api/tensor_data_interface.c

@@ -0,0 +1,37 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+// This program checks that the implementation of the tensor data
+// interface only uses StarPU's public API
+
+#define starpu_interface_tensor_ops my_starpu_interface_tensor_ops
+#define starpu_tensor_data_register my_starpu_tensor_data_register
+#define starpu_tensor_ptr_register my_starpu_tensor_data_ptr_register
+#define starpu_tensor_get_nx my_starpu_tensor_get_nx
+#define starpu_tensor_get_ny my_starpu_tensor_get_ny
+#define starpu_tensor_get_nz my_starpu_tensor_get_nz
+#define starpu_tensor_get_nt my_starpu_tensor_get_nt
+#define starpu_tensor_get_local_ldy my_starpu_tensor_get_local_ldy
+#define starpu_tensor_get_local_ldz my_starpu_tensor_get_local_ldz
+#define starpu_tensor_get_local_ldt my_starpu_tensor_get_local_ldt
+#define starpu_tensor_get_local_ptr my_starpu_tensor_get_local_ptr
+#define starpu_tensor_get_elemsize my_starpu_tensor_get_elemsize
+#include "../../src/datawizard/interfaces/tensor_interface.c"
+
+int main()
+{
+        return 0;
+}

+ 5 - 0
examples/api/variable_data_interface.c

@@ -17,6 +17,11 @@
 // This program checks that the implementation of the variable data
 // interface only uses StarPU's public API
 
+#define starpu_interface_variable_ops my_starpu_interface_variable_ops
+#define starpu_variable_data_register my_starpu_variable_data_register
+#define starpu_variable_ptr_register my_starpu_variable_ptr_register
+#define starpu_variable_get_local_ptr my_starpu_variable_get_local_ptr
+#define starpu_variable_get_elemsize my_starpu_variable_get_elemsize
 #include "../../src/datawizard/interfaces/variable_interface.c"
 
 int main()

+ 8 - 0
examples/api/vector_data_interface.c

@@ -17,6 +17,14 @@
 // This program checks that the implementation of the vector data
 // interface only uses StarPU's public API
 
+#define starpu_interface_vector_ops my_starpu_interface_vector_ops
+#define starpu_vector_data_register my_starpu_vector_data_register
+#define starpu_vector_data_register_allocsize my_starpu_vector_data_register_allocsize
+#define starpu_vector_ptr_register my_starpu_vector_data_ptr_register
+#define starpu_vector_get_nx my_starpu_vector_get_nx
+#define starpu_vector_get_local_ptr my_starpu_vector_get_local_ptr
+#define starpu_vector_get_elemsize my_starpu_vector_get_elemsize
+#define starpu_vector_get_allocsize my_starpu_vector_get_allocsize
 #include "../../src/datawizard/interfaces/vector_interface.c"
 
 int main()

+ 2 - 0
examples/api/void_data_interface.c

@@ -17,6 +17,8 @@
 // This program checks that the implementation of the void data
 // interface only uses StarPU's public API
 
+#define starpu_interface_void_ops my_starpu_interface_void_ops
+#define starpu_void_data_register my_starpu_void_data_register
 #include "../../src/datawizard/interfaces/void_interface.c"
 
 int main()

+ 5 - 0
examples/filters/fblock.c

@@ -169,6 +169,11 @@ int main(void)
         print_data(handle);
         starpu_data_unregister(handle);
 
+#ifdef STARPU_USE_OPENCL
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
+#endif
+
         /* Print result block */
         FPRINTF(stderr, "OUT Block\n");
         print_block(block, NX, NY, NZ, NX, NX*NY);

+ 2 - 2
examples/filters/fblock_opencl.c

@@ -60,8 +60,8 @@ void opencl_func(void *buffers[], void *cl_arg)
 	CHECK_CL_SET_KERNEL_ARG(kernel, 7, sizeof(*factor), factor);
 
 	{
-		size_t global=nx*ny*nz;
-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
+		size_t global[3]={nx,ny,nz};
+		err = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global, NULL, 0, NULL, NULL);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	}
 	starpu_opencl_release_kernel(kernel);

+ 13 - 10
examples/filters/fblock_opencl_kernel.cl

@@ -18,14 +18,17 @@
 
 __kernel void fblock_opencl(__global int* block, unsigned offset, int nx, int ny, int nz, unsigned ldy, unsigned ldz, int factor)
 {
-        int i, j, k;
-        block = (__global char *)block + offset;
-        for(k=0; k<nz ; k++)
-	{
-                for(j=0; j<ny ; j++)
-		{
-                        for(i=0; i<nx ; i++)
-                                block[(k*ldz)+(j*ldy)+i] = factor;
-                }
-        }
+	const int idx = get_global_id(0);
+	const int idy = get_global_id(1);
+	const int idz = get_global_id(2);
+	if (idx >= nx)
+		return;
+	if (idy >= ny)
+		return;
+	if (idz >= nz)
+		return;
+
+	block = (__global int*) ((__global char *)block + offset);
+	int i = idz*ldz + idy*ldy + idx;
+	block[i] = factor;
 }

+ 19 - 1
include/starpu_data.h

@@ -350,12 +350,30 @@ void starpu_data_release(starpu_data_handle_t handle);
 
 /**
    Similar to starpu_data_release(), except that the data
-   will be available on the given memory \p node instead of main memory.
+   was made available on the given memory \p node instead of main memory.
    The \p node parameter must be exactly the same as the corresponding \c
    starpu_data_acquire_on_node* call.
 */
 void starpu_data_release_on_node(starpu_data_handle_t handle, int node);
 
+/**
+   Partly release the piece of data acquired by the application either by
+   starpu_data_acquire() or by starpu_data_acquire_cb(), switching the
+   acquisition down to \p down_to_mode. For now, only releasing from STARPU_RW
+   or STARPU_W acquisition down to STARPU_R is supported, or down to the same
+   acquisition.  STARPU_NONE can also be passed as \p down_to_mode, in which
+   case this is equivalent to calling starpu_data_release().
+*/
+void starpu_data_release_to(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode);
+
+/**
+   Similar to starpu_data_release_to(), except that the data
+   was made available on the given memory \p node instead of main memory.
+   The \p node parameter must be exactly the same as the corresponding \c
+   starpu_data_acquire_on_node* call.
+*/
+void starpu_data_release_to_on_node(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode, int node);
+
 /** @} */
 
 /**

+ 3 - 1
mpi/examples/Makefile.am

@@ -155,7 +155,9 @@ examplebin_PROGRAMS += 			\
 if !STARPU_SIMGRID
 starpu_mpi_EXAMPLES	+=	\
 	mpi_lu/plu_implicit_example_float	\
-	mpi_lu/plu_implicit_example_double
+	mpi_lu/plu_implicit_example_double	\
+	mpi_lu/plu_outofcore_example_float	\
+	mpi_lu/plu_outofcore_example_double
 endif
 
 mpi_lu_plu_example_float_LDADD =	\

+ 11 - 0
mpi/examples/mpi_lu/plu_example.c

@@ -434,6 +434,7 @@ int main(int argc, char **argv)
 	int rank;
 	int world_size;
 	int ret;
+	unsigned i, j;
 
 	/*
 	 *	Initialization
@@ -594,6 +595,16 @@ int main(int argc, char **argv)
 	/*
 	 * 	Termination
 	 */
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			starpu_data_unregister(dataA_handles[j+nblocks*i]);
+			TYPE *blockptr = dataA[j+i*nblocks];
+			if (blockptr != STARPU_POISON_PTR)
+				starpu_free(blockptr);
+		}
+	}
 
 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);

+ 11 - 0
mpi/examples/mpi_lu/plu_implicit_example.c

@@ -249,6 +249,7 @@ int main(int argc, char **argv)
 	int rank;
 	int world_size;
 	int ret;
+	unsigned i, j;
 
 	starpu_srand48((long int)time(NULL));
 
@@ -376,6 +377,16 @@ int main(int argc, char **argv)
 	/*
 	 * 	Termination
 	 */
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			starpu_data_unregister(dataA_handles[j+nblocks*i]);
+			TYPE *blockptr = dataA[j+i*nblocks];
+			if (blockptr != STARPU_POISON_PTR)
+				starpu_free(blockptr);
+		}
+	}
 
 	starpu_cublas_shutdown();
 	starpu_mpi_shutdown();

+ 2 - 4
mpi/src/mpi/starpu_mpi_mpi.c

@@ -257,7 +257,7 @@ void _starpu_mpi_submit_ready_request(void *arg)
 
 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
 				STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
-				starpu_data_acquire_cb(early_data_handle->handle,STARPU_R,_starpu_mpi_early_data_cb,(void*) cb_args);
+				starpu_data_acquire_on_node_cb(early_data_handle->handle,STARPU_MAIN_RAM,STARPU_R,_starpu_mpi_early_data_cb,(void*) cb_args);
 				STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 			}
 			/* Case: no matching data has been received. Store the receive request as an early_request. */
@@ -957,7 +957,7 @@ static void _starpu_mpi_early_data_cb(void* arg)
 	}
 
 	_STARPU_MPI_DEBUG(3, "Done, handling release of early_handle..\n");
-	starpu_data_release(args->early_handle);
+	starpu_data_release_on_node(args->early_handle, STARPU_MAIN_RAM);
 
 	_STARPU_MPI_DEBUG(3, "Done, handling unregister of early_handle..\n");
 	/* XXX: note that we have already freed the registered buffer above. In
@@ -1104,8 +1104,6 @@ static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req)
 		_starpu_mpi_req_list_push_back(&detached_requests, req);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 
-		starpu_wake_all_blocked_workers();
-
 		STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 		STARPU_PTHREAD_COND_SIGNAL(&progress_cond);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);

+ 2 - 2
mpi/src/starpu_mpi_coop_sends.c

@@ -47,13 +47,13 @@ void _starpu_mpi_release_req_data(struct _starpu_mpi_req *req)
 			/* We were last, release data */
 			free(coop_sends->reqs_array);
 			free(coop_sends);
-			starpu_data_release(req->data_handle);
+			starpu_data_release_on_node(req->data_handle, STARPU_MAIN_RAM);
 		}
 	}
 	else
 	{
 		/* Trivial request */
-		starpu_data_release(req->data_handle);
+		starpu_data_release_on_node(req->data_handle, STARPU_MAIN_RAM);
 	}
 }
 

+ 1 - 1
mpi/src/starpu_mpi_fxt.h

@@ -74,7 +74,7 @@ extern "C"
 #define _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(dest, data_tag, size)	\
 	FUT_FULL_PROBE4(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN, (dest), (data_tag), (size), _starpu_gettid());
 #define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(dest, data_tag, size, jobid, handle)	\
-	FUT_FULL_PROBE6(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_ISEND_SUBMIT_END, (dest), (data_tag), (size), (jobid), _starpu_gettid(), (handle));
+	FUT_FULL_PROBE6(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_ISEND_SUBMIT_END, (dest), (data_tag), (size), (jobid), (handle), _starpu_gettid());
 #define _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(src, data_tag)	\
 	FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN, (src), (data_tag), _starpu_gettid());
 #define _STARPU_MPI_TRACE_IRECV_SUBMIT_END(src, data_tag)	\

+ 4 - 16
src/core/dependencies/cg.c

@@ -169,7 +169,6 @@ int _starpu_list_tag_successors_in_cg_list(struct _starpu_cg_list *successors, u
 	return n;
 }
 
-/* Note: in case of a tag, it must be already locked */
 void _starpu_notify_cg(void *pred STARPU_ATTRIBUTE_UNUSED, struct _starpu_cg *cg)
 {
 	STARPU_ASSERT(cg);
@@ -208,6 +207,7 @@ void _starpu_notify_cg(void *pred STARPU_ATTRIBUTE_UNUSED, struct _starpu_cg *cg
 				struct _starpu_tag *tag;
 
 				tag = cg->succ.tag;
+				_starpu_spin_lock(&tag->lock);
 				tag_successors = &tag->tag_successors;
 
 				tag_successors->ndeps_completed++;
@@ -219,8 +219,10 @@ void _starpu_notify_cg(void *pred STARPU_ATTRIBUTE_UNUSED, struct _starpu_cg *cg
 				{
 					/* reset the counter so that we can reuse the completion group */
 					tag_successors->ndeps_completed = 0;
+					/* This releases the lock */
 					_starpu_tag_set_ready(tag);
-				}
+				} else
+					_starpu_spin_unlock(&tag->lock);
 				break;
 			}
 
@@ -370,21 +372,7 @@ void _starpu_notify_cg_list(void *pred, struct _starpu_cg_list *successors)
 			successors->nsuccs--;
 		}
 		_starpu_spin_unlock(&successors->lock);
-
-		struct _starpu_tag *cgtag = NULL;
-
-		if (cg_type == STARPU_CG_TAG)
-		{
-			cgtag = cg->succ.tag;
-			STARPU_ASSERT(cgtag);
-			_starpu_spin_lock(&cgtag->lock);
-		}
-
 		_starpu_notify_cg(pred, cg);
-
-		if (cg_type == STARPU_CG_TAG)
-			_starpu_spin_unlock(&cgtag->lock);
-
 		_starpu_spin_lock(&successors->lock);
 	}
 	successors->terminated = 1;

+ 31 - 9
src/core/dependencies/data_arbiter_concurrency.c

@@ -533,7 +533,7 @@ void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
 }
 void ___starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
 #else // LOCK_OR_DELEGATE
-void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
+void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode)
 #endif
 {
 	starpu_arbiter_t arbiter = handle->arbiter;
@@ -546,10 +546,21 @@ void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
 	{
 		/* No waiter, just remove our reference */
 		_starpu_spin_lock(&handle->header_lock);
-		STARPU_ASSERT(handle->refcnt > 0);
-		handle->refcnt--;
-		STARPU_ASSERT(handle->busy_count > 0);
-		handle->busy_count--;
+		if (down_to_mode == STARPU_NONE)
+		{
+			STARPU_ASSERT(handle->refcnt > 0);
+			handle->refcnt--;
+			STARPU_ASSERT(handle->busy_count > 0);
+			handle->busy_count--;
+		}
+		else
+		{
+			/* Downgrade from W or RW down to R, keeping the same reference,
+			 * but thus allowing other readers without allowing writers.  */
+			STARPU_ASSERT(down_to_mode == STARPU_R &&
+				      handle->current_mode == STARPU_W);
+			handle->current_mode = down_to_mode;
+		}
 #ifndef LOCK_OR_DELEGATE
 		STARPU_PTHREAD_MUTEX_UNLOCK(&arbiter->mutex);
 #endif
@@ -562,10 +573,21 @@ void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
 
 	/* There is a waiter, remove our reference */
 	_starpu_spin_lock(&handle->header_lock);
-	STARPU_ASSERT(handle->refcnt > 0);
-	handle->refcnt--;
-	STARPU_ASSERT(handle->busy_count > 0);
-	handle->busy_count--;
+	if (down_to_mode == STARPU_NONE)
+	{
+		STARPU_ASSERT(handle->refcnt > 0);
+		handle->refcnt--;
+		STARPU_ASSERT(handle->busy_count > 0);
+		handle->busy_count--;
+	}
+	else
+	{
+		/* Downgrade from W or RW down to R, keeping the same reference,
+		 * but thus allowing other readers without allowing writers.  */
+		STARPU_ASSERT(down_to_mode == STARPU_R &&
+			      handle->current_mode == STARPU_W);
+		handle->current_mode = down_to_mode;
+	}
 	/* There should be at least one busy_count reference for the waiter
 	 * (thus we don't risk to see the handle disappear below) */
 	STARPU_ASSERT(handle->busy_count > 0);

+ 29 - 11
src/core/dependencies/data_concurrency.c

@@ -509,10 +509,16 @@ void _starpu_submit_job_take_data_deps(struct _starpu_job *j)
  * This may free the handle if it was lazily unregistered (1 is returned in
  * that case). The handle pointer thus becomes invalid for the caller.
  */
-int _starpu_notify_data_dependencies(starpu_data_handle_t handle)
+int _starpu_notify_data_dependencies(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode)
 {
 	_starpu_spin_checklocked(&handle->header_lock);
 
+	if (down_to_mode != STARPU_NONE && handle->current_mode == down_to_mode)
+	{
+		/* No change, nothing to do */
+		return 0;
+	}
+
 	if (handle->arbiter)
 	{
 		/* Keep our reference for now, _starpu_notify_arbitered_dependencies
@@ -521,22 +527,34 @@ int _starpu_notify_data_dependencies(starpu_data_handle_t handle)
 		STARPU_ASSERT(_starpu_data_requester_prio_list_empty(&handle->reduction_req_list));
 		_starpu_spin_unlock(&handle->header_lock);
 		/* _starpu_notify_arbitered_dependencies will handle its own locking */
-		_starpu_notify_arbitered_dependencies(handle);
+		_starpu_notify_arbitered_dependencies(handle, down_to_mode);
 		/* We have already unlocked */
 		return 1;
 	}
 
-	/* A data access has finished so we remove a reference. */
-	STARPU_ASSERT(handle->refcnt > 0);
-	handle->refcnt--;
-	STARPU_ASSERT(handle->busy_count > 0);
-	handle->busy_count--;
-	if (_starpu_data_check_not_busy(handle))
-		/* Handle was destroyed, nothing left to do.  */
-		return 1;
-
 	STARPU_ASSERT(_starpu_data_requester_prio_list_empty(&handle->arbitered_req_list));
 
+	if (down_to_mode == STARPU_NONE)
+	{
+		/* A data access has finished so we remove a reference. */
+		STARPU_ASSERT(handle->refcnt > 0);
+		handle->refcnt--;
+		STARPU_ASSERT(handle->busy_count > 0);
+		handle->busy_count--;
+		if (_starpu_data_check_not_busy(handle))
+			/* Handle was destroyed, nothing left to do.  */
+			return 1;
+	}
+	else
+	{
+		/* Downgrade from W or RW down to R, keeping the same reference,
+		 * but thus allowing other readers without allowing writers.  */
+		STARPU_ASSERT(down_to_mode == STARPU_R &&
+				(handle->current_mode == STARPU_RW ||
+				 handle->current_mode == STARPU_W));
+		handle->current_mode = down_to_mode;
+	}
+
 	/* In case there is a pending reduction, and that this is the last
 	 * requester, we may go back to a "normal" coherency model. */
 	if (handle->reduction_refcnt > 0)

+ 2 - 2
src/core/dependencies/data_concurrency.h

@@ -28,8 +28,8 @@ void _starpu_submit_job_enforce_arbitered_deps(struct _starpu_job *j, unsigned b
 void _starpu_submit_job_take_data_deps(struct _starpu_job *j);
 void _starpu_enforce_data_deps_notify_job_ready_soon(struct _starpu_job *j, _starpu_notify_job_start_data *data);
 
-int _starpu_notify_data_dependencies(starpu_data_handle_t handle);
-void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle);
+int _starpu_notify_data_dependencies(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode);
+void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode);
 
 unsigned _starpu_attempt_to_submit_data_request_from_apps(starpu_data_handle_t handle,
 							  enum starpu_data_access_mode mode,

+ 10 - 3
src/core/dependencies/implicit_data_deps.c

@@ -616,18 +616,25 @@ void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data
         _STARPU_LOG_OUT();
 }
 
-void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle)
+void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 {
 	struct _starpu_task_wrapper_list *post_sync_tasks = NULL;
 	unsigned do_submit_tasks = 0;
+	unsigned last_cnt;
 
 	/* Here helgrind would shout that this is an unprotected access, but
 	 * count can only be zero if we don't have to care about
 	 * post_sync_tasks_cnt at all.  */
-	if (STARPU_RUNNING_ON_VALGRIND || handle->post_sync_tasks_cnt)
+	if (handle->post_sync_tasks_cnt)
 	{
 		STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
-		if (--handle->post_sync_tasks_cnt == 0)
+		last_cnt = handle->post_sync_tasks_cnt;
+
+		if (mode == STARPU_NONE)
+			/* Last release from us */
+			handle->post_sync_tasks_cnt--;
+
+		if (last_cnt == 1)
 		{
 			/* unlock all tasks : we need not hold the lock while unlocking all these tasks */
 			do_submit_tasks = 1;

+ 1 - 1
src/core/dependencies/implicit_data_deps.h

@@ -30,7 +30,7 @@ void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *tas
 void _starpu_release_task_enforce_sequential_consistency(struct _starpu_job *j);
 
 void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data_handle_t handle);
-void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle);
+void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle, enum starpu_data_access_mode mode);
 
 /** Register a hook to be called when a write is submitted */
 void _starpu_implicit_data_deps_write_hook(void (*func)(starpu_data_handle_t));

+ 16 - 11
src/core/dependencies/tags.c

@@ -120,13 +120,23 @@ static void _starpu_tag_free(void *_tag)
 			unsigned STARPU_ATTRIBUTE_UNUSED remaining = STARPU_ATOMIC_ADD(&cg->remaining, -1);
 
 			if (!ntags && (cg->cg_type == STARPU_CG_TAG))
+			{
 				/* Last tag this cg depends on, cg becomes unreferenced */
+#ifdef STARPU_DEBUG
+				free(cg->deps);
+				free(cg->done);
+#endif
 				free(cg);
+			}
 		}
 
 #ifdef STARPU_DYNAMIC_DEPS_SIZE
 		free(tag->tag_successors.succ);
 #endif
+#ifdef STARPU_DEBUG
+		free(tag->tag_successors.deps);
+		free(tag->tag_successors.done);
+#endif
 
 		_starpu_spin_unlock(&tag->lock);
 		_starpu_spin_destroy(&tag->lock);
@@ -221,7 +231,7 @@ static struct _starpu_tag *gettag_struct(starpu_tag_t id)
 	return tag;
 }
 
-/* lock should be taken */
+/* lock should be taken, and this releases it */
 void _starpu_tag_set_ready(struct _starpu_tag *tag)
 {
 	/* mark this tag as ready to run */
@@ -229,6 +239,10 @@ void _starpu_tag_set_ready(struct _starpu_tag *tag)
 	/* declare it to the scheduler ! */
 	struct _starpu_job *j = tag->job;
 
+	STARPU_ASSERT(!STARPU_AYU_EVENT || tag->id < STARPU_AYUDAME_OFFSET);
+	STARPU_AYU_PRERUNTASK(tag->id + STARPU_AYUDAME_OFFSET, -1);
+	STARPU_AYU_POSTRUNTASK(tag->id + STARPU_AYUDAME_OFFSET);
+
 	/* In case the task job is going to be scheduled immediately, and if
 	 * the task is "empty", calling _starpu_push_task would directly try to enforce
 	 * the dependencies of the task, and therefore it would try to grab the
@@ -238,14 +252,9 @@ void _starpu_tag_set_ready(struct _starpu_tag *tag)
 	/* enforce data dependencies */
 	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 	_starpu_enforce_deps_starting_from_task(j);
-
-	_starpu_spin_lock(&tag->lock);
-	STARPU_ASSERT(!STARPU_AYU_EVENT || tag->id < STARPU_AYUDAME_OFFSET);
-	STARPU_AYU_PRERUNTASK(tag->id + STARPU_AYUDAME_OFFSET, -1);
-	STARPU_AYU_POSTRUNTASK(tag->id + STARPU_AYUDAME_OFFSET);
 }
 
-/* the lock must be taken ! */
+/* the lock of the tag must already be taken ! */
 static void _starpu_tag_add_succ(struct _starpu_tag *tag, struct _starpu_cg *cg)
 {
 	STARPU_ASSERT(tag);
@@ -396,12 +405,10 @@ void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t
 		struct _starpu_tag *tag_dep = gettag_struct(dep_id);
 		STARPU_ASSERT(tag_dep != tag_child);
 		_starpu_spin_lock(&tag_dep->lock);
-		_starpu_spin_lock(&tag_child->lock);
 		_starpu_tag_add_succ(tag_dep, cg);
 		STARPU_ASSERT(!STARPU_AYU_EVENT || dep_id < STARPU_AYUDAME_OFFSET);
 		STARPU_ASSERT(!STARPU_AYU_EVENT || id < STARPU_AYUDAME_OFFSET);
 		STARPU_AYU_ADDDEPENDENCY(dep_id+STARPU_AYUDAME_OFFSET, 0, id+STARPU_AYUDAME_OFFSET);
-		_starpu_spin_unlock(&tag_child->lock);
 		_starpu_spin_unlock(&tag_dep->lock);
 	}
 }
@@ -434,12 +441,10 @@ void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...)
 		struct _starpu_tag *tag_dep = gettag_struct(dep_id);
 		STARPU_ASSERT(tag_dep != tag_child);
 		_starpu_spin_lock(&tag_dep->lock);
-		_starpu_spin_lock(&tag_child->lock);
 		_starpu_tag_add_succ(tag_dep, cg);
 		STARPU_ASSERT(!STARPU_AYU_EVENT || dep_id < STARPU_AYUDAME_OFFSET);
 		STARPU_ASSERT(!STARPU_AYU_EVENT || id < STARPU_AYUDAME_OFFSET);
 		STARPU_AYU_ADDDEPENDENCY(dep_id+STARPU_AYUDAME_OFFSET, 0, id+STARPU_AYUDAME_OFFSET);
-		_starpu_spin_unlock(&tag_child->lock);
 		_starpu_spin_unlock(&tag_dep->lock);
 	}
 	va_end(pa);

+ 2 - 0
src/core/dependencies/tags.h

@@ -68,6 +68,8 @@ void _starpu_notify_tag_dependencies(struct _starpu_tag *tag);
 void _starpu_notify_job_start_tag_dependencies(struct _starpu_tag *tag, _starpu_notify_job_start_data *data);
 
 void _starpu_tag_declare(starpu_tag_t id, struct _starpu_job *job);
+
+/* lock should be taken, and this releases it */
 void _starpu_tag_set_ready(struct _starpu_tag *tag);
 
 unsigned _starpu_submit_job_enforce_task_deps(struct _starpu_job *j);

+ 1 - 1
src/core/jobs.c

@@ -82,7 +82,7 @@ struct _starpu_job* STARPU_ATTRIBUTE_MALLOC _starpu_job_create(struct starpu_tas
 
 	job->task = task;
 
-#ifndef STARPU_USE_FXT
+#if !defined(STARPU_USE_FXT) && !defined(STARPU_DEBUG)
 	if (_starpu_bound_recording || _starpu_task_break_on_push != -1 || _starpu_task_break_on_sched != -1 || _starpu_task_break_on_pop != -1 || _starpu_task_break_on_exec != -1 || STARPU_AYU_EVENT)
 #endif
 	{

+ 13 - 10
src/datawizard/coherency.c

@@ -855,7 +855,7 @@ uint32_t _starpu_data_get_footprint(starpu_data_handle_t handle)
 
 /* in case the data was accessed on a write mode, do not forget to
  * make it accessible again once it is possible ! */
-void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_wt_mask, struct _starpu_data_replicate *replicate)
+void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_wt_mask, enum starpu_data_access_mode down_to_mode, struct _starpu_data_replicate *replicate)
 {
 	uint32_t wt_mask;
 	wt_mask = default_wt_mask | handle->wt_mask;
@@ -880,14 +880,17 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 	if (cpt == STARPU_SPIN_MAXTRY)
 		_starpu_spin_lock(&handle->header_lock);
 
-	/* Release refcnt taken by fetch_data_on_node */
-	replicate->refcnt--;
-	STARPU_ASSERT_MSG(replicate->refcnt >= 0, "handle %p released too many times", handle);
+	if (down_to_mode == STARPU_NONE)
+	{
+		/* Release refcnt taken by fetch_data_on_node */
+		replicate->refcnt--;
+		STARPU_ASSERT_MSG(replicate->refcnt >= 0, "handle %p released too many times", handle);
 
-	STARPU_ASSERT_MSG(handle->busy_count > 0, "handle %p released too many times", handle);
-	handle->busy_count--;
+		STARPU_ASSERT_MSG(handle->busy_count > 0, "handle %p released too many times", handle);
+		handle->busy_count--;
+	}
 
-	if (!_starpu_notify_data_dependencies(handle))
+	if (!_starpu_notify_data_dependencies(handle, down_to_mode))
 		_starpu_spin_unlock(&handle->header_lock);
 }
 
@@ -1222,7 +1225,7 @@ enomem:
 
 		local_replicate = get_replicate(handle, mode, workerid, node);
 
-		_starpu_release_data_on_node(handle, 0, local_replicate);
+		_starpu_release_data_on_node(handle, 0, STARPU_NONE, local_replicate);
 	}
 
 	return -1;
@@ -1331,13 +1334,13 @@ void __starpu_push_task_output(struct _starpu_job *j)
 		if (node == -1)
 		{
 			/* NOWHERE case, just notify dependencies */
-			if (!_starpu_notify_data_dependencies(handle))
+			if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
 				_starpu_spin_unlock(&handle->header_lock);
 		}
 		else
 		{
 			_starpu_spin_unlock(&handle->header_lock);
-			_starpu_release_data_on_node(handle, 0, local_replicate);
+			_starpu_release_data_on_node(handle, 0, STARPU_NONE, local_replicate);
 		}
 	}
 

+ 1 - 0
src/datawizard/coherency.h

@@ -316,6 +316,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 /** This releases a reference on the handle */
 void _starpu_release_data_on_node(struct _starpu_data_state *state, uint32_t default_wt_mask,
+				  enum starpu_data_access_mode down_to_mode,
 				  struct _starpu_data_replicate *replicate);
 
 void _starpu_update_data_state(starpu_data_handle_t handle,

+ 4 - 4
src/datawizard/copy_driver.c

@@ -373,8 +373,8 @@ int starpu_interface_copy3d(uintptr_t src, size_t src_offset, unsigned src_node,
 	STARPU_ASSERT_MSG(ld2_src >= numblocks_1 * ld1_src, "block group size %lu is bigger than group ld %lu in source", (unsigned long) numblocks_1 * ld1_src, (unsigned long) ld2_src);
 	STARPU_ASSERT_MSG(ld2_dst >= numblocks_1 * ld1_dst, "block group size %lu is bigger than group ld %lu in destination", (unsigned long) numblocks_1 * ld1_dst, (unsigned long) ld2_dst);
 
-	if (ld1_src * ld2_src == blocksize * numblocks_1 &&
-	    ld1_dst * ld2_dst == blocksize * numblocks_1)
+	if (ld2_src == blocksize * numblocks_1 &&
+	    ld2_dst == blocksize * numblocks_1)
 		/* Optimize contiguous case */
 		return starpu_interface_copy(src, src_offset, src_node,
 					     dst, dst_offset, dst_node,
@@ -423,8 +423,8 @@ int starpu_interface_copy4d(uintptr_t src, size_t src_offset, unsigned src_node,
 	STARPU_ASSERT_MSG(ld3_src >= numblocks_2 * ld2_src, "block group group size %lu is bigger than group group ld %lu in source", (unsigned long) numblocks_2 * ld2_src, (unsigned long) ld3_src);
 	STARPU_ASSERT_MSG(ld3_dst >= numblocks_2 * ld2_dst, "block group group size %lu is bigger than group group ld %lu in destination", (unsigned long) numblocks_2 * ld2_dst, (unsigned long) ld3_dst);
 
-	if (ld1_src * ld2_src * ld3_src == blocksize * numblocks_1 * numblocks_2 &&
-	    ld1_dst * ld2_dst * ld3_dst == blocksize * numblocks_1 * numblocks_2)
+	if (ld3_src == blocksize * numblocks_1 * numblocks_2 &&
+	    ld3_dst == blocksize * numblocks_1 * numblocks_2)
 		/* Optimize contiguous case */
 		return starpu_interface_copy(src, src_offset, src_node,
 					     dst, dst_offset, dst_node,

+ 2 - 2
src/datawizard/interfaces/data_interface.c

@@ -749,12 +749,12 @@ void _starpu_check_if_valid_and_fetch_data_on_node(starpu_data_handle_t handle,
 	{
 		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, STARPU_FETCH, 0, NULL, NULL, 0, origin);
 		STARPU_ASSERT(!ret);
-		_starpu_release_data_on_node(handle, handle->home_node, replicate);
+		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate);
 	}
 	else
 	{
 		_starpu_spin_lock(&handle->header_lock);
-		if (!_starpu_notify_data_dependencies(handle))
+		if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
 			_starpu_spin_unlock(&handle->header_lock);
 	}
 }

+ 28 - 8
src/datawizard/user_interactions.c

@@ -485,16 +485,26 @@ int starpu_data_acquire_try(starpu_data_handle_t handle, enum starpu_data_access
 
 /* This function must be called after starpu_data_acquire so that the
  * application release the data */
-void starpu_data_release_on_node(starpu_data_handle_t handle, int node)
+void starpu_data_release_to_on_node(starpu_data_handle_t handle, enum starpu_data_access_mode mode, int node)
 {
 	STARPU_ASSERT(handle);
 
+	if (mode == STARPU_RW)
+		/* They are equivalent here, and current_mode is never STARPU_RW */
+		mode = STARPU_W;
+
+	STARPU_ASSERT_MSG(mode == STARPU_NONE ||
+			  mode == handle->current_mode ||
+			  (mode == STARPU_R &&
+			     handle->current_mode == STARPU_W),
+		"We only support releasing from W to R");
+
 	/* In case there are some implicit dependencies, unlock the "post sync" tasks */
-	_starpu_unlock_post_sync_tasks(handle);
+	_starpu_unlock_post_sync_tasks(handle, mode);
 
 	/* The application can now release the rw-lock */
 	if (node >= 0)
-		_starpu_release_data_on_node(handle, 0, &handle->per_node[node]);
+		_starpu_release_data_on_node(handle, 0, mode, &handle->per_node[node]);
 	else
 	{
 		_starpu_spin_lock(&handle->header_lock);
@@ -505,17 +515,27 @@ void starpu_data_release_on_node(starpu_data_handle_t handle, int node)
 				handle->per_node[i].refcnt--;
 		}
 		handle->busy_count--;
-		if (!_starpu_notify_data_dependencies(handle))
+		if (!_starpu_notify_data_dependencies(handle, mode))
 			_starpu_spin_unlock(&handle->header_lock);
 	}
 }
 
-void starpu_data_release(starpu_data_handle_t handle)
+void starpu_data_release_on_node(starpu_data_handle_t handle, int node)
+{
+	starpu_data_release_to_on_node(handle, STARPU_NONE, node);
+}
+
+void starpu_data_release_to(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 {
 	int home_node = handle->home_node;
 	if (home_node < 0)
 		home_node = STARPU_MAIN_RAM;
-	starpu_data_release_on_node(handle, home_node);
+	starpu_data_release_to_on_node(handle, mode, home_node);
+}
+
+void starpu_data_release(starpu_data_handle_t handle)
+{
+	starpu_data_release_to(handle, STARPU_NONE);
 }
 
 static void _prefetch_data_on_node(void *arg)
@@ -531,7 +551,7 @@ static void _prefetch_data_on_node(void *arg)
 		_starpu_data_acquire_wrapper_finished(wrapper);
 
 	_starpu_spin_lock(&handle->header_lock);
-	if (!_starpu_notify_data_dependencies(handle))
+	if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
 		_starpu_spin_unlock(&handle->header_lock);
 }
 
@@ -581,7 +601,7 @@ int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigne
 		/* In case there was a temporary handle (eg. used for reduction), this
 		 * handle may have requested to be destroyed when the data is released
 		 * */
-		if (!_starpu_notify_data_dependencies(handle))
+		if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
 			_starpu_spin_unlock(&handle->header_lock);
 	}
 	else if (!async)

+ 1 - 1
src/datawizard/write_back.c

@@ -24,7 +24,7 @@ static void wt_callback(void *arg)
 	starpu_data_handle_t handle = (starpu_data_handle_t) arg;
 
 	_starpu_spin_lock(&handle->header_lock);
-	if (!_starpu_notify_data_dependencies(handle))
+	if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
 		_starpu_spin_unlock(&handle->header_lock);
 }
 

+ 2 - 2
src/debug/latency.c

@@ -36,7 +36,7 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 		struct _starpu_data_replicate *replicate_0 = &handle->per_node[node0];
 		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		STARPU_ASSERT(!ret);
-		_starpu_release_data_on_node(handle, node0, replicate_0);
+		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate_0);
 
 		_starpu_spin_lock(&handle->header_lock);
 		handle->refcnt++;
@@ -46,6 +46,6 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 		struct _starpu_data_replicate *replicate_1 = &handle->per_node[node1];
 		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		STARPU_ASSERT(!ret);
-		_starpu_release_data_on_node(handle, node1, replicate_1);
+		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate_1);
 	}
 }

+ 4 - 2
src/debug/traces/starpu_fxt.c

@@ -2307,9 +2307,11 @@ static void handle_start_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 			snprintf(paje_key, sizeof(paje_key), "com_%u", comid);
 			program_container_alias(program_container, STARPU_POTI_STR_LEN, prefix);
 			memmanager_container_alias(src_memnode_container, STARPU_POTI_STR_LEN, prefix, src);
-			poti_StartLink(time, program_container, link_type, src_memnode_container, paje_value, paje_key);
+			char str_handle[STARPU_POTI_STR_LEN];
+			snprintf(str_handle, sizeof(str_handle), "%lx", handle);
+			poti_user_StartLink(_starpu_poti_CommLinkStart, time, program_container, link_type, src_memnode_container, paje_value, paje_key, 1, str_handle);
 #else
-			fprintf(out_paje_file, "18	%.9f	%s	%sp	%u	%smm%u	com_%u\n", time, link_type, prefix, size, prefix, src, comid);
+			fprintf(out_paje_file, "24	%.9f	%s	%sp	%u	%smm%u	com_%u	%lx\n", time, link_type, prefix, size, prefix, src, comid, handle);
 #endif
 		}
 

+ 1 - 0
src/debug/traces/starpu_fxt.h

@@ -68,6 +68,7 @@ void _starpu_fxt_write_paje_header(FILE *file, struct starpu_fxt_options *option
 extern int _starpu_poti_extendedSetState;
 extern int _starpu_poti_semiExtendedSetState;
 extern int _starpu_poti_MemoryEvent;
+extern int _starpu_poti_CommLinkStart;
 extern int _starpu_poti_MpiLinkStart;
 
 /*

+ 4 - 2
src/debug/traces/starpu_fxt_mpi.c

@@ -331,7 +331,9 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 
 			char str_mpi_tag[STARPU_POTI_STR_LEN];
 			snprintf(str_mpi_tag, sizeof(str_mpi_tag), "%ld", mpi_tag);
-			poti_user_StartLink(_starpu_poti_MpiLinkStart, start_date, "MPIroot", "MPIL", mpi_container, paje_value, paje_key, 1, str_mpi_tag);
+			char str_handle[STARPU_POTI_STR_LEN];
+			snprintf(str_handle, sizeof(str_handle), "%lx", send_handle);
+			poti_user_StartLink(_starpu_poti_MpiLinkStart, start_date, "MPIroot", "MPIL", mpi_container, paje_value, paje_key, 2, str_mpi_tag, str_handle);
 
 			poti_SetVariable(start_date, mpi_container, "bwo_mpi", current_out_bandwidth[src]);
 			snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", dst);
@@ -340,7 +342,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 #else
 			fprintf(out_paje_file, "13	%.9f	%d_mpict	bwo_mpi	%f\n", start_date, src, current_out_bandwidth[src]);
 			fprintf(out_paje_file, "13	%.9f	%d_mpict	bwi_mpi	%f\n", start_date, dst, current_in_bandwidth[dst]);
-			fprintf(out_paje_file, "23	%.9f	MPIL	MPIroot	%lu	%d_mpict	mpicom_%lu	%ld\n", start_date, (unsigned long)size, src, id, mpi_tag);
+			fprintf(out_paje_file, "23	%.9f	MPIL	MPIroot	%lu	%d_mpict	mpicom_%lu	%ld	%lx\n", start_date, (unsigned long)size, src, id, mpi_tag, send_handle);
 			fprintf(out_paje_file, "19	%.9f	MPIL	MPIroot	%lu	%d_mpict	mpicom_%lu\n", end_date, (unsigned long)size, dst, id);
 #endif
 

+ 7 - 4
src/debug/traces/starpu_paje.c

@@ -28,6 +28,7 @@
 int _starpu_poti_extendedSetState = -1;
 int _starpu_poti_semiExtendedSetState = -1;
 int _starpu_poti_MemoryEvent = -1;
+int _starpu_poti_CommLinkStart = -1;
 int _starpu_poti_MpiLinkStart = -1;
 #endif
 #endif
@@ -62,6 +63,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 						     "SubmitOrder string"
 						     );
 #ifdef HAVE_POTI_USER_NEWEVENT
+	_starpu_poti_CommLinkStart = poti_header_DeclareEvent(PAJE_StartLink, 1, "Handle string");
 	if (options->memory_states)
 	{
 		_starpu_poti_MemoryEvent = poti_header_DeclareEvent (PAJE_NewEvent,
@@ -71,7 +73,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 							     "Size string",
 							     "Dest string");
 	}
-	_starpu_poti_MpiLinkStart = poti_header_DeclareEvent(PAJE_StartLink, 1, "MPITAG string");
+	_starpu_poti_MpiLinkStart = poti_header_DeclareEvent(PAJE_StartLink, 2, "MPITAG string", "Handle string");
 #endif
 #else
 	poti_header(1,1);
@@ -230,15 +232,16 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 	fprintf(file, "%%	StartContainer	string\n");
 	fprintf(file, "%%	Key	string\n");
 	fprintf(file, "%%	MPITAG	string\n");
+	fprintf(file, "%%	Handle	string\n");
 	fprintf(file, "%%EndEventDef\n");
-	fprintf(file, "%%EventDef	PajeEndLink	24\n");
+	fprintf(file, "%%EventDef	PajeStartLink	24\n");
 	fprintf(file, "%%	Time	date\n");
 	fprintf(file, "%%	Type	string\n");
 	fprintf(file, "%%	Container	string\n");
 	fprintf(file, "%%	Value	string\n");
-	fprintf(file, "%%	EndContainer	string\n");
+	fprintf(file, "%%	StartContainer	string\n");
 	fprintf(file, "%%	Key	string\n");
-	fprintf(file, "%%	MPITAG	string\n");
+	fprintf(file, "%%	Handle	string\n");
 	fprintf(file, "%%EndEventDef\n");
 #endif
 

+ 12 - 0
tests/Makefile.am

@@ -268,6 +268,7 @@ myPROGRAMS +=				\
 	datawizard/acquire_cb_insert		\
 	datawizard/acquire_release		\
 	datawizard/acquire_release2		\
+	datawizard/acquire_release_to		\
 	datawizard/acquire_try			\
 	datawizard/bcsr				\
 	datawizard/cache			\
@@ -494,6 +495,17 @@ datawizard_acquire_release2_SOURCES +=		\
 	datawizard/acquire_release_opencl.c
 endif
 
+datawizard_acquire_release_to_SOURCES =		\
+	datawizard/acquire_release_to.c
+if STARPU_USE_CUDA
+datawizard_acquire_release_to_SOURCES +=		\
+	datawizard/acquire_release_cuda.cu
+endif
+if STARPU_USE_OPENCL
+datawizard_acquire_release_to_SOURCES +=		\
+	datawizard/acquire_release_opencl.c
+endif
+
 datawizard_scratch_SOURCES =			\
 	datawizard/scratch.c
 if STARPU_USE_CUDA

+ 214 - 0
tests/datawizard/acquire_release_to.c

@@ -0,0 +1,214 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "../helper.h"
+
+/*
+ * Check that _release_to correctly interacts with tasks working on the same data
+ */
+
+#ifdef STARPU_QUICK_CHECK
+static unsigned ntasks = 10;
+#elif !defined(STARPU_LONG_CHECK)
+static unsigned ntasks = 1000;
+#else
+static unsigned ntasks = 10000;
+#endif
+
+#ifdef STARPU_USE_CUDA
+extern void increment_cuda(void *descr[], void *_args);
+#endif
+#ifdef STARPU_USE_OPENCL
+extern void increment_opencl(void *buffers[], void *args);
+#endif
+
+void increment_cpu(void *descr[], void *arg)
+{
+	(void)arg;
+	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	(*tokenptr)++;
+}
+
+static struct starpu_codelet increment_cl =
+{
+	.modes = { STARPU_RW },
+	.cpu_funcs = {increment_cpu},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {increment_cuda},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {increment_opencl},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
+#endif
+	.cpu_funcs_name = {"increment_cpu"},
+	.nbuffers = 1
+};
+
+void check_cpu(void *descr[], void *arg)
+{
+	unsigned *val = arg;
+	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	STARPU_ASSERT(*tokenptr == *val);
+}
+
+static struct starpu_codelet check_cl =
+{
+	.modes = { STARPU_R },
+	.cpu_funcs = {check_cpu},
+	.cpu_funcs_name = {"increment_cpu"},
+	.nbuffers = 1
+};
+
+unsigned token = 0;
+starpu_data_handle_t token_handle;
+
+static
+int increment_token(void)
+{
+	int ret;
+	struct starpu_task *task = starpu_task_create();
+	task->cl = &increment_cl;
+	task->handles[0] = token_handle;
+	ret = starpu_task_submit(task);
+	return ret;
+}
+
+static
+int check_token(unsigned value)
+{
+	unsigned *value_p;
+	int ret;
+	struct starpu_task *task = starpu_task_create();
+	task->cl = &check_cl;
+	task->handles[0] = token_handle;
+	task->cl_arg = value_p = malloc(sizeof(*value_p));
+	task->cl_arg_size = sizeof(*value_p);
+	task->cl_arg_free = 1;
+	*value_p = value;
+	ret = starpu_task_submit(task);
+	return ret;
+}
+
+static
+void callback(void *arg)
+{
+	(void)arg;
+	token++;
+	starpu_data_release_to(token_handle, STARPU_W);
+	starpu_sleep(0.001);
+	starpu_data_release_to(token_handle, STARPU_R);
+	starpu_sleep(0.001);
+	starpu_data_release(token_handle);
+}
+
+#ifdef STARPU_USE_OPENCL
+struct starpu_opencl_program opencl_program;
+#endif
+int main(int argc, char **argv)
+{
+	unsigned i;
+	int ret;
+
+        ret = starpu_initialize(NULL, &argc, &argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_load_opencl_from_file("tests/datawizard/acquire_release_opencl_kernel.cl",
+						  &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+#endif
+	starpu_variable_data_register(&token_handle, STARPU_MAIN_RAM, (uintptr_t)&token, sizeof(unsigned));
+
+        FPRINTF(stderr, "Token: %u\n", token);
+
+	for(i=0; i<ntasks; i++)
+	{
+		/* synchronize data in RAM */
+                ret = starpu_data_acquire(token_handle, STARPU_RW);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+
+                token ++;
+
+		ret = check_token(4*i+1);
+		if (ret == -ENODEV) goto enodev_release;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+		ret = increment_token();
+		if (ret == -ENODEV) goto enodev_release;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+		ret = check_token(4*i+2);
+		if (ret == -ENODEV) goto enodev_release;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+		starpu_sleep(0.001);
+		starpu_data_release_to(token_handle, STARPU_W);
+
+		starpu_sleep(0.001);
+		starpu_data_release_to(token_handle, STARPU_R);
+
+		starpu_sleep(0.001);
+		starpu_data_release(token_handle);
+
+		ret = starpu_data_acquire_cb(token_handle, STARPU_RW, callback, NULL);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire_cb");
+
+		ret = check_token(4*i+3);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+		ret = increment_token();
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+		ret = check_token(4*i+4);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	}
+
+	starpu_data_unregister(token_handle);
+
+#ifdef STARPU_USE_OPENCL
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
+#endif
+	starpu_shutdown();
+
+        FPRINTF(stderr, "Token: %u\n", token);
+	if (token == ntasks * 4)
+		ret = EXIT_SUCCESS;
+	else
+		ret = EXIT_FAILURE;
+	return ret;
+
+enodev_release:
+	starpu_data_release(token_handle);
+enodev:
+	starpu_data_unregister(token_handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+#ifdef STARPU_USE_OPENCL
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
+#endif
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+}

+ 3 - 3
tests/datawizard/interfaces/block/block_opencl.c

@@ -83,12 +83,12 @@ test_block_opencl_func(void *buffers[], void *args)
 	}
 			
 	{
-		size_t global = nx * ny * nz;
+		size_t global[3] = {nx, ny, nz};
 		err = clEnqueueNDRangeKernel(queue,
 					     kernel,
-					     1,
+					     3,
 					     NULL,
-					     &global,
+					     global,
 					     NULL,
 					     0,
 					     NULL,

+ 15 - 23
tests/datawizard/interfaces/block/block_opencl_kernel.cl

@@ -18,29 +18,21 @@ __kernel void block_opencl(__global int *block,
 			   int ldy, int ldz,
 			   int factor, __global int *err)
 {
-        const int id = get_global_id(0);
-	if (id > 0)
+	const int idx = get_global_id(0);
+	const int idy = get_global_id(1);
+	const int idz = get_global_id(2);
+	if (idx >= nx)
 		return;
+	if (idy >= ny)
+		return;
+	if (idz >= nz)
+		return;
+
+	int val = idz*ny*nx+idy*nx+idx;
+	int i = (idz*ldz)+(idy*ldy)+idx;
 
-	unsigned int i, j, k;
-	int val = 0;
-	for (k = 0; k < nz; k++)
-	{
-		for (j = 0; j < ny; j++)
-		{
-			for (i = 0; i < nx; i++)
-			{
-                                if (block[(k*ldz)+(j*ldy)+i] != factor * val)
-				{
-					*err = 1;
-					return;
-				}
-				else
-				{
-					block[(k*ldz)+(j*ldy)+i] *= -1;
-					val++;
-				}
-			}
-		}
-	}
+	if (block[i] != factor * val)
+		*err = 1;
+	else
+		block[i] *= -1;
 }

+ 1 - 1
tests/datawizard/interfaces/tensor/tensor_interface.c

@@ -18,7 +18,7 @@
 #include "../test_interfaces.h"
 #include "../../../helper.h"
 
-#define NX 16
+#define NX 4
 #define NY NX
 #define NZ NX
 #define NT NX

+ 3 - 3
tests/datawizard/interfaces/tensor/tensor_opencl.c

@@ -87,12 +87,12 @@ test_tensor_opencl_func(void *buffers[], void *args)
 	}
 			
 	{
-                size_t global = 1;
+		size_t global[3] = {nx, ny, nz*nt};
 		err = clEnqueueNDRangeKernel(queue,
 					     kernel,
-					     1,
+					     3,
 					     NULL,
-					     &global,
+					     global,
 					     NULL,
 					     0,
 					     NULL,

+ 18 - 26
tests/datawizard/interfaces/tensor/tensor_opencl_kernel.cl

@@ -18,32 +18,24 @@ __kernel void tensor_opencl(__global int *tensor,
 			   int ldy, int ldz, int ldt,
 			   int factor, __global int *err)
 {
-        const int id = get_global_id(0);
-	if (id > 0)
+	const int idx = get_global_id(0);
+	const int idy = get_global_id(1);
+	const int idz = get_global_id(2) % nz;
+	const int idt = get_global_id(2) / nz;
+	if (idx >= nx)
 		return;
+	if (idy >= ny)
+		return;
+	if (idz >= nz)
+		return;
+	if (idt >= nt)
+		return;
+
+	int val = idt*nz*ny*nx+idz*ny*nx+idy*nx+idx;
+	int i = (idt*ldt)+(idz*ldz)+(idy*ldy)+idx;
 
-	unsigned int i, j, k, l;
-	int val = 0;
-	for (l = 0; l < nt; l++)
-	{
-	    for (k = 0; k < nz; k++)
-	    {
-		for (j = 0; j < ny; j++)
-		{
-			for (i = 0; i < nx; i++)
-			{
-                                if (tensor[(l*ldt)+(k*ldz)+(j*ldy)+i] != factor * val)
-				{
-					*err = 1;
-					return;
-				}
-				else
-				{
-					tensor[(l*ldt)+(k*ldz)+(j*ldy)+i] *= -1;
-					val++;
-				}
-			}
-		}
-	    }
-	}
+	if (tensor[i] != factor * val)
+		*err = 1;
+	else
+		tensor[i] *= -1;
 }

+ 8 - 0
tests/energy/energy_efficiency.c

@@ -320,11 +320,19 @@ int main(int argc, char *argv[])
 
 	unsigned N, k, m, n, iter, NITER;
 	if (argc < 2)
+#ifdef STARPU_QUICK_CHECK
+		N = 10;
+#else
 		N = 40;
+#endif
 	else
 		N = atoi(argv[1]);
 	if (argc < 3)
+#ifdef STARPU_QUICK_CHECK
+		NITER = 3;
+#else
 		NITER = 10;
+#endif
 	else
 		NITER = atoi(argv[2]);
 	if (N == 0)

+ 1 - 0
tests/overlap/overlap.c

@@ -135,6 +135,7 @@ int main(int argc, char **argv)
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 
+	starpu_do_schedule();
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	if (!finished)
 		STARPU_PTHREAD_COND_WAIT(&cond, &mutex);

+ 95 - 65
tools/gdbinit

@@ -39,33 +39,6 @@ define starpu-print-task
   set $task = (struct starpu_task *)$arg0
   set $job = (struct _starpu_job *)$task->starpu_private
   set $status=0
-  if $task->status == 0
-    set $status="STARPU_TASK_INIT"
-  end
-  if $task->status == 1
-    set $status="STARPU_TASK_BLOCKED"
-  end
-  if $task->status == 2
-    set $status="STARPU_TASK_READY"
-  end
-  if $task->status == 3
-    set $status="STARPU_TASK_RUNNING"
-  end
-  if $task->status == 4
-    set $status="STARPU_TASK_FINISHED"
-  end
-  if $task->status == 5
-    set $status="STARPU_TASK_BLOCKED_ON_TAG"
-  end
-  if $task->status == 6
-    set $status="STARPU_TASK_BLOCKED_ON_TASK"
-  end
-  if $task->status == 7
-    set $status="STARPU_TASK_BLOCKED_ON_DATA"
-  end
-  if $task->status == 8
-    set $status="STARPU_TASK_STOPPED"
-  end
 
   printf "StarPU Task (%p)\n", $task
   if $task->name
@@ -81,13 +54,42 @@ define starpu-print-task
   end
   printf "\tnbuffers:\t\t\t<%d>\n", $nbuffers
   printf "\tcallback:\t\t\t<%p>\n", $task->callback_func
+  printf "\tcl_arg:\t\t\t<%p>\n", $task->cl_arg
   printf "\tsynchronous:\t\t\t<%d>\n", $task->synchronous
   printf "\texecute_on_a_specific_worker:\t<%d>\n", $task->execute_on_a_specific_worker
   printf "\tworkerid:\t\t\t<%d>\n", $task->workerid
   printf "\tdetach:\t\t\t\t<%d>\n", $task->detach
   printf "\tdestroy:\t\t\t<%d>\n", $task->destroy
   printf "\tregenerate:\t\t\t<%d>\n", $task->regenerate
-  printf "\tstatus:\t\t\t\t<%s>\n", $status
+  printf "\tstatus:\t\t\t\t"
+  if $task->status == 0
+    printf "STARPU_TASK_INIT"
+  end
+  if $task->status == 1
+    printf "STARPU_TASK_BLOCKED"
+  end
+  if $task->status == 2
+    printf "STARPU_TASK_READY"
+  end
+  if $task->status == 3
+    printf "STARPU_TASK_RUNNING"
+  end
+  if $task->status == 4
+    printf "STARPU_TASK_FINISHED"
+  end
+  if $task->status == 5
+    printf "STARPU_TASK_BLOCKED_ON_TAG"
+  end
+  if $task->status == 6
+    printf "STARPU_TASK_BLOCKED_ON_TASK"
+  end
+  if $task->status == 7
+    printf "STARPU_TASK_BLOCKED_ON_DATA"
+  end
+  if $task->status == 8
+    printf "STARPU_TASK_STOPPED"
+  end
+  printf "\n"
   printf "\tjob:\t\t\t\t<%p>\n", $job
   printf "\ttag_id:\t\t\t\t<%d>\n", $task->tag_id
   printf "\tndeps:\t\t\t\t<%u>\n", $job->job_successors->ndeps
@@ -169,35 +171,36 @@ define starpu-workers
   printf "[Id] Name                                     Arch Mask Devid Bindid Workerid Isrunning Isinitialized Status\n"
   while $num<_starpu_config->topology->nworkers
     set $worker=&_starpu_config->workers[$num]
+    printf "[%2d] %-40s %-4d %-4d %-5d %-6d %-8d %-9d %-13d ", $num, $worker->name, $worker->arch, $worker->worker_mask, \
+          $worker->devid, $worker->bindid, $worker->workerid, $worker->worker_is_running, $worker->worker_is_initialized
     if $worker->status == STATUS_INVALID
-      set $status="INVALID"
+      printf "INVALID"
     end
     if $worker->status == STATUS_UNKNOWN
-      set $status="UNKNOWN"
+      printf "UNKNOWN"
     end
     if $worker->status == STATUS_INITIALIZING
-      set $status="INITIALIZING"
+      printf "INITIALIZING"
     end
     if $worker->status == STATUS_EXECUTING
-      set $status="EXECUTING"
+      printf "EXECUTING"
     end
     if $worker->status == STATUS_CALLBACK
-      set $status="CALLBACK"
+      printf "CALLBACK"
     end
     if $worker->status == STATUS_SCHEDULING
-      set $status="SCHEDULING"
+      printf "SCHEDULING"
     end
     if $worker->status == STATUS_WAITING
-      set $status="WAITING"
+      printf "WAITING"
     end
     if $worker->status == STATUS_SLEEPING_SCHEDULING
-      set $status="SLEEPING_SCHEDULING"
+      printf "SLEEPING_SCHEDULING"
     end
     if $worker->status == STATUS_SLEEPING
-      set $status="SLEEPING"
+      printf "SLEEPING"
     end
-    printf "[%2d] %-40s %-4d %-4d %-5d %-6d %-8d %-9d %-13d %s\n", $num, $worker->name, $worker->arch, $worker->worker_mask, \
-          $worker->devid, $worker->bindid, $worker->workerid, $worker->worker_is_running, $worker->worker_is_initialized, $status
+    printf "\n"
     set $num = $num + 1
   end
 end
@@ -205,23 +208,24 @@ end
 define starpu-print-tag
   set language c
   set $tag_struct = (struct _starpu_tag *)_gettag_struct($arg0)
+  printf "tag %d ((struct _starpu_tag *) %p)\n", $arg0, $tag_struct
+  printf "\tstate "
   if $tag_struct->state == STARPU_INVALID_STATE
-     set $status="STARPU_INVALID_STATE"
+     printf "STARPU_INVALID_STATE"
   end
   if $tag_struct->state == STARPU_ASSOCIATED
-     set $status="STARPU_ASSOCIATED"
+     printf "STARPU_ASSOCIATED"
   end
   if $tag_struct->state == STARPU_BLOCKED
-     set $status="STARPU_BLOCKED"
+     printf "STARPU_BLOCKED"
   end
   if $tag_struct->state == STARPU_READY
-     set $status="STARPU_READY"
+     printf "STARPU_READY"
   end
   if $tag_struct->state == STARPU_DONE
-     set $status="STARPU_DONE"
+     printf "STARPU_DONE"
   end
-  printf "tag %d ((struct _starpu_tag *) %p)\n", $arg0, $tag_struct
-  printf "\tstate %s\n", $status
+  printf "\n"
   printf "\tdeps %d\n", $tag_struct->tag_successors.ndeps
   printf "\tcompleted %d\n", $tag_struct->tag_successors.ndeps_completed
   printf "\tndeps_remaining:\t\t<%u>\n", $tag_struct->tag_successors->ndeps - $tag_struct->tag_successors->ndeps_completed
@@ -317,7 +321,11 @@ define starpu-all-tasks
     while $l != &all_jobs_list
       set $j = (struct _starpu_job*) (((unsigned long) $l) - ((unsigned long) &((struct _starpu_job *)0)->all_submitted))
       set $task = $j->task
-      printf "task %p %s\n", $task, $task->name ? $task->name : ""
+      if $task->name
+        printf "task %p %s\n", $task, $task->name
+      else
+        printf "task %p\n", $task
+      end
       set $l = $l->next
     end
   end
@@ -915,9 +923,9 @@ end
 define starpu-sched-print-component
     set $c = (struct starpu_sched_component *) $arg1
     starpu-print-spaces $arg0
-    printf "%s %s %s (struct starpu_sched_component *) %p\n", $c->name, $c->properties & STARPU_SCHED_COMPONENT_HOMOGENEOUS ? "homogeneous":"heterogeneous", $c->properties & STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE ? "single-node" : "multi-node", $c
+    printf "%s %c %c (struct starpu_sched_component *) %p\n", $c->name, $c->properties & STARPU_SCHED_COMPONENT_HOMOGENEOUS ? 'o':'e', $c->properties & STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE ? 's' : 'm', $c
     if $c->push_task == fifo_push_task
-      set $f = ((struct _starpu_fifo_data *) $c->data)->fifo
+      set $f = &((struct _starpu_fifo_data *) $c->data)->fifo
       starpu-print-spaces $arg0
       printf "%d tasks start %f len %f end %f processed %d\n", $f->ntasks, $f->exp_start, $f->exp_len, $f->exp_end, $f->nprocessed
     end
@@ -951,29 +959,29 @@ end
 
 define starpu-mpi-print-request
     set $request = (struct _starpu_mpi_req *)$arg0
-    set $request_type = "unknown_type"
+    printf "Request (struct _starpu_mpi_req *) %p data %p tag %d to MPI node %d type ", $request, $request->data_handle, $request->data_handle && $request->data_handle->mpi_data ? ((struct _starpu_mpi_node_tag *) ($request->data_handle->mpi_data))->data_tag : -1, $request->node_tag.node.rank,
     if $request->request_type == SEND_REQ
-       set $request_type = "SEND_REQ"
+       printf "SEND_REQ"
     end
     if $request->request_type == RECV_REQ
-       set $request_type = "RECV_REQ"
+       printf "RECV_REQ"
     end
     if $request->request_type == WAIT_REQ
-       set $request_type = "WAIT_REQ"
+       printf "WAIT_REQ"
     end
     if $request->request_type == TEST_REQ
-       set $request_type = "TEST_REQ"
+       printf "TEST_REQ"
     end
     if $request->request_type == BARRIER_REQ
-       set $request_type = "BARRIER_REQ"
+       printf "BARRIER_REQ"
     end
     if $request->request_type == PROBE_REQ
-       set $request_type = "PROBE_REQ"
+       printf "PROBE_REQ"
     end
     if $request->request_type == UNKNOWN_REQ
-       set $request_type = "UNKNOWN_REQ"
+       printf "UNKNOWN_REQ"
     end
-    printf "Request (struct _starpu_mpi_req *) %p data %p tag %d to MPI node %d type %s submitted %d completed %d posted %d detached %d is_internal_req %d\n", $request, $request->data_handle, $request->data_handle ? ((struct _starpu_mpi_node_tag *) ($request->data_handle->mpi_data))->data_tag : -1, $request->node_tag.rank, $request_type, $request->submitted, $request->completed, $request->posted, $request->detached, $request->is_internal_req
+    printf " submitted %d completed %d posted %d detached %d\n", $request->submitted, $request->completed, $request->posted, $request->detached
 end
 
 define starpu-mpi-print-ready-recv-requests
@@ -989,17 +997,39 @@ define starpu-mpi-print-ready-recv-requests
     end
 end
 
+define starpu-mpi-print-requests-list
+  set $list = $arg0
+  set $request = $list->_head
+  while $request
+    starpu-mpi-print-request $request
+    set $request = $request->_next
+  end
+end
+
+define starpu-mpi-print-requests-tree
+  if $arg0
+    starpu-mpi-print-requests-tree $arg0->children[0]
+    set $stage = (struct _starpu_mpi_req_prio_list_stage *) $arg0
+    starpu-mpi-print-requests-list (&($stage->list))
+    starpu-mpi-print-requests-tree $arg0->children[1]
+  end
+end
+
 define starpu-mpi-print-ready-send-requests
-    set $list = (struct _starpu_mpi_req_prio_list) ready_send_requests
-    if $list
-	set $request = $list.list._head
-        while $request
-            starpu-mpi-print-request $request
-	    set $request = $request->_next
-	end
+  set $prio_list = (struct _starpu_mpi_req_prio_list) ready_send_requests
+  if _starpu_debug
+    if $prio_list
+        starpu-mpi-print-requests-list &$prio_list.list
+    else
+	printf "No ready send requests\n"
+    end
+  else
+    if $prio_list.empty == 0
+        starpu-mpi-print-requests-tree $prio_list.tree.root
     else
 	printf "No ready send requests\n"
     end
+  end
 end
 
 define starpu-mpi-print-detached-requests