Explorar o código

Merge from trunk @11526:11613

Marc Sergent %!s(int64=11) %!d(string=hai) anos
pai
achega
5bbe09ce59
Modificáronse 86 ficheiros con 12075 adicións e 425 borrados
  1. 5 3
      ChangeLog
  2. 2 7
      configure.ac
  3. 12 0
      doc/doxygen/Makefile.am
  4. 1 1
      doc/doxygen/chapters/advanced_examples.doxy
  5. 3 0
      doc/doxygen/chapters/api/codelet_and_tasks.doxy
  6. BIN=BIN
      doc/doxygen/chapters/data_trace.pdf
  7. 4283 0
      doc/doxygen/chapters/distrib_data.eps
  8. BIN=BIN
      doc/doxygen/chapters/distrib_data.pdf
  9. BIN=BIN
      doc/doxygen/chapters/distrib_data.png
  10. 1613 0
      doc/doxygen/chapters/distrib_data_histo.eps
  11. BIN=BIN
      doc/doxygen/chapters/distrib_data_histo.pdf
  12. BIN=BIN
      doc/doxygen/chapters/distrib_data_histo.png
  13. 13 1
      doc/doxygen/chapters/performance_feedback.doxy
  14. 1036 0
      doc/doxygen/chapters/starpu_chol_model_11_type.eps
  15. BIN=BIN
      doc/doxygen/chapters/starpu_chol_model_11_type.pdf
  16. BIN=BIN
      doc/doxygen/chapters/starpu_chol_model_11_type.png
  17. 4460 0
      doc/doxygen/chapters/starpu_non_linear_memset_regression_based_2.eps
  18. BIN=BIN
      doc/doxygen/chapters/starpu_non_linear_memset_regression_based_2.pdf
  19. BIN=BIN
      doc/doxygen/chapters/starpu_non_linear_memset_regression_based_2.png
  20. 4 4
      examples/interface/complex.c
  21. 1 1
      examples/interface/complex_interface.c
  22. 34 1
      examples/stencil/Makefile.am
  23. 3 13
      examples/stencil/life_opencl.c
  24. 4 14
      examples/stencil/shadow_opencl.c
  25. 52 3
      examples/stencil/stencil-blocks.c
  26. 16 13
      examples/stencil/stencil-kernels.c
  27. 28 23
      examples/stencil/stencil-tasks.c
  28. 36 19
      examples/stencil/stencil.c
  29. 4 0
      examples/stencil/stencil.h
  30. 4 1
      include/starpu_task.h
  31. 3 3
      mpi/examples/complex/mpi_complex.c
  32. 3 3
      mpi/src/starpu_mpi.c
  33. 4 4
      mpi/tests/datatypes.c
  34. 2 2
      mpi/tests/gather.c
  35. 2 2
      mpi/tests/gather2.c
  36. 5 2
      mpi/tests/insert_task_owner2.c
  37. 9 5
      mpi/tests/insert_task_owner_data.c
  38. 1 1
      mpi/tests/user_defined_datatype_value.h
  39. 2 3
      socl/src/Makefile.am
  40. 39 1
      src/common/fxt.h
  41. 3 6
      src/core/dependencies/implicit_data_deps.c
  42. 1 5
      src/core/jobs.h
  43. 7 6
      src/core/perfmodel/perfmodel_history.c
  44. 19 0
      src/core/perfmodel/perfmodel_nan.c
  45. 32 10
      src/core/sched_ctx.c
  46. 2 1
      src/core/sched_policy.c
  47. 10 7
      src/core/task.c
  48. 4 4
      src/core/workers.c
  49. 2 0
      src/core/workers.h
  50. 3 0
      src/datawizard/copy_driver.c
  51. 6 1
      src/datawizard/datastats.c
  52. 2 0
      src/datawizard/filters.c
  53. 8 8
      src/datawizard/interfaces/bcsr_interface.c
  54. 6 6
      src/datawizard/interfaces/block_interface.c
  55. 3 3
      src/datawizard/interfaces/coo_interface.c
  56. 4 4
      src/datawizard/interfaces/csr_interface.c
  57. 2 2
      src/datawizard/interfaces/data_interface.c
  58. 5 5
      src/datawizard/interfaces/matrix_interface.c
  59. 3 3
      src/datawizard/interfaces/multiformat_interface.c
  60. 3 3
      src/datawizard/interfaces/variable_interface.c
  61. 4 4
      src/datawizard/interfaces/vector_interface.c
  62. 6 2
      src/datawizard/malloc.c
  63. 9 1
      src/datawizard/memalloc.c
  64. 3 0
      src/datawizard/reduction.c
  65. 6 16
      src/datawizard/user_interactions.c
  66. 65 14
      src/debug/traces/starpu_fxt.c
  67. 6 2
      src/debug/traces/starpu_paje.c
  68. 1 0
      src/sched_policies/fifo_queues.c
  69. 2 8
      src/util/execute_on_all.c
  70. 4 4
      src/util/misc.c
  71. 2 1
      src/util/starpu_create_sync_task.c
  72. 1 0
      src/util/starpu_data_cpy.c
  73. 2 0
      src/util/starpu_task_insert.c
  74. 1 0
      tests/Makefile.am
  75. 3 11
      tests/main/driver_api/run_driver.c
  76. 1 1
      tests/microbenchs/prefetch_data_on_node.c
  77. 110 0
      tests/parallel_tasks/cuda_only.c
  78. 9 8
      tests/perfmodels/value_nan.c
  79. 14 6
      tools/gdbinit
  80. 0 75
      tools/model.sh
  81. 1 1
      tools/starpu_codelet_histo_profile.in
  82. 1 1
      tools/starpu_fxt_stats.c
  83. 2 2
      tools/starpu_perfmodel_display.c
  84. 5 4
      tools/starpu_perfmodel_plot.c
  85. 0 60
      tools/valgrind/fscanf.suppr
  86. 8 0
      tools/valgrind/pthread.suppr

+ 5 - 3
ChangeLog

@@ -76,12 +76,14 @@ Small features:
     renamed in starpu_task_insert and starpu_mpi_task_insert. Old
     names are kept to avoid breaking old codes.
   * New configure option --enable-calibration-heuristic which allows
-    the user to set the maximum authorized deviation of the 
-    history-based calibrator. 
+    the user to set the maximum authorized deviation of the
+    history-based calibrator.
+  * Tasks can now have a name (via the field const char *name of
+    struct starpu_task)
 
 Changes:
   * Fix of the livelock issue discovered while executing applications
-    on a CPU+GPU cluster of machines by adding a maximum trylock 
+    on a CPU+GPU cluster of machines by adding a maximum trylock
     threshold before a blocking lock.
   * Data interfaces (variable, vector, matrix and block) now define
     pack und unpack functions

+ 2 - 7
configure.ac

@@ -1606,13 +1606,8 @@ AC_MSG_CHECKING(calibration heuristic of history-based StarPU calibrator)
 AC_ARG_ENABLE(calibration-heuristic, [AS_HELP_STRING([--enable-calibration-heuristic=<number>],
 			[Define the maximum authorized deviation of StarPU history-based calibrator.])],
 			calibration_heuristic=$enableval, calibration_heuristic=10)
-if test $calibration_heuristic -gt 100; then
-	AC_MSG_RESULT(uncorrect parameter $calibration_heuristic  set default parameter 10)
-	AC_DEFINE_UNQUOTED(STARPU_HISTORYMAXERROR, [$calibration_heuristic], [calibration heuristic value])
-else
-	AC_MSG_RESULT($calibration_heuristic)
-	AC_DEFINE_UNQUOTED(STARPU_HISTORYMAXERROR, [$calibration_heuristic], [calibration heuristic value])
-fi
+AC_MSG_RESULT($calibration_heuristic)
+AC_DEFINE_UNQUOTED(STARPU_HISTORYMAXERROR, [$calibration_heuristic], [calibration heuristic value])
 
 
 ###############################################################################

+ 12 - 0
doc/doxygen/Makefile.am

@@ -145,9 +145,21 @@ EXTRA_DIST	= 					\
 	chapters/data_trace.eps				\
 	chapters/data_trace.pdf				\
 	chapters/data_trace.png				\
+	chapters/distrib_data.png	\
+	chapters/distrib_data.eps	\
+	chapters/distrib_data.pdf	\
+	chapters/distrib_data_histo.png	\
+	chapters/distrib_data_histo.eps	\
+	chapters/distrib_data_histo.pdf	\
+	chapters/starpu_chol_model_11_type.png	\
+	chapters/starpu_chol_model_11_type.eps	\
+	chapters/starpu_chol_model_11_type.pdf	\
 	chapters/starpu_non_linear_memset_regression_based.png	\
 	chapters/starpu_non_linear_memset_regression_based.eps	\
 	chapters/starpu_non_linear_memset_regression_based.pdf	\
+	chapters/starpu_non_linear_memset_regression_based_2.png	\
+	chapters/starpu_non_linear_memset_regression_based_2.eps	\
+	chapters/starpu_non_linear_memset_regression_based_2.pdf	\
 	chapters/starpu_starpu_slu_lu_model_11.png	\
 	chapters/starpu_starpu_slu_lu_model_11.eps	\
 	chapters/starpu_starpu_slu_lu_model_11.pdf	\

+ 1 - 1
doc/doxygen/chapters/advanced_examples.doxy

@@ -1103,7 +1103,7 @@ complex interface from a StarPU data handle.
 double *starpu_complex_get_real(starpu_data_handle_t handle)
 {
         struct starpu_complex_interface *complex_interface =
-          (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, 0);
+          (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
         return complex_interface->real;
 }
 

+ 3 - 0
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -331,6 +331,9 @@ starpu_task_create(), or declared statically. In the latter case, the
 programmer has to zero the structure starpu_task and to fill the
 different fields properly. The indicated default values correspond to
 the configuration of a task allocated with starpu_task_create().
+\var starpu_task::name
+Optional name of the task. This can be useful for debugging
+purposes.
 \var starpu_task::cl
 Is a pointer to the corresponding structure starpu_codelet. This
 describes where the kernel should be executed, and supplies the

BIN=BIN
doc/doxygen/chapters/data_trace.pdf


A diferenza do arquivo foi suprimida porque é demasiado grande
+ 4283 - 0
doc/doxygen/chapters/distrib_data.eps


BIN=BIN
doc/doxygen/chapters/distrib_data.pdf


BIN=BIN
doc/doxygen/chapters/distrib_data.png


A diferenza do arquivo foi suprimida porque é demasiado grande
+ 1613 - 0
doc/doxygen/chapters/distrib_data_histo.eps


BIN=BIN
doc/doxygen/chapters/distrib_data_histo.pdf


BIN=BIN
doc/doxygen/chapters/distrib_data_histo.png


+ 13 - 1
doc/doxygen/chapters/performance_feedback.doxy

@@ -430,12 +430,15 @@ When the field starpu_task::flops is set, <c>starpu_perfmodel_plot</c> can
 directly draw a GFlops curve, by simply adding the <c>-f</c> option:
 
 \verbatim
-$ starpu_perfmodel_display -f -s chol_model_11
+$ starpu_perfmodel_plot -f -s chol_model_11
 \endverbatim
 
 This will however disable displaying the regression model, for which we can not
 compute GFlops.
 
+\image html starpu_chol_model_11_type.png
+\image latex starpu_chol_model_11_type.eps "" width=\textwidth
+
 When the FxT trace file <c>filename</c> has been generated, it is possible to
 get a profiling of each codelet by calling:
 
@@ -448,6 +451,9 @@ This will create profiling data files, and a <c>.gp</c> file in the current
 directory, which draws the distribution of codelet time over the application
 execution, according to data input size.
 
+\image html distrib_data.png
+\image latex distrib_data.eps "" width=\textwidth
+
 This is also available in the tool <c>starpu_perfmodel_plot</c>, by passing it
 the fxt trace:
 
@@ -458,6 +464,9 @@ $ starpu_perfmodel_plot -s non_linear_memset_regression_based -i /tmp/prof_file_
 It will produce a <c>.gp</c> file which contains both the performance model
 curves, and the profiling measurements.
 
+\image html starpu_non_linear_memset_regression_based_2.png
+\image latex starpu_non_linear_memset_regression_based_2.eps "" width=\textwidth
+
 If you have the statistical tool <c>R</c> installed, you can additionally use
 
 \verbatim
@@ -467,6 +476,9 @@ $ starpu_codelet_histo_profile distrib.data
 Which will create one <c>.pdf</c> file per codelet and per input size, showing a
 histogram of the codelet execution time distribution.
 
+\image html distrib_data_histo.png
+\image latex distrib_data_histo.eps "" width=\textwidth
+
 \section TheoreticalLowerBoundOnExecutionTime Theoretical Lower Bound On Execution Time
 
 StarPU can record a trace of what tasks are needed to complete the

A diferenza do arquivo foi suprimida porque é demasiado grande
+ 1036 - 0
doc/doxygen/chapters/starpu_chol_model_11_type.eps


BIN=BIN
doc/doxygen/chapters/starpu_chol_model_11_type.pdf


BIN=BIN
doc/doxygen/chapters/starpu_chol_model_11_type.png


A diferenza do arquivo foi suprimida porque é demasiado grande
+ 4460 - 0
doc/doxygen/chapters/starpu_non_linear_memset_regression_based_2.eps


BIN=BIN
doc/doxygen/chapters/starpu_non_linear_memset_regression_based_2.pdf


BIN=BIN
doc/doxygen/chapters/starpu_non_linear_memset_regression_based_2.png


+ 4 - 4
examples/interface/complex.c

@@ -93,11 +93,11 @@ int main(int argc, char **argv)
 	starpu_complex_data_register(&handle1, STARPU_MAIN_RAM, &real, &imaginary, 1);
 	starpu_complex_data_register(&handle2, STARPU_MAIN_RAM, &copy_real, &copy_imaginary, 1);
 
-	ret = starpu_task_insert(&cl_display, STARPU_VALUE, "handle1", strlen("handle1"), STARPU_R, handle1, 0);
+	ret = starpu_task_insert(&cl_display, STARPU_VALUE, "handle1", strlen("handle1")+1, STARPU_R, handle1, 0);
 	if (ret == -ENODEV) goto end;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
-	ret = starpu_task_insert(&cl_display, STARPU_VALUE, "handle2", strlen("handle2"), STARPU_R, handle2, 0);
+	ret = starpu_task_insert(&cl_display, STARPU_VALUE, "handle2", strlen("handle2")+1, STARPU_R, handle2, 0);
 	if (ret == -ENODEV) goto end;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
@@ -122,11 +122,11 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) goto end;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
-	ret = starpu_task_insert(&cl_display, STARPU_VALUE, "handle1", strlen("handle1"), STARPU_R, handle1, 0);
+	ret = starpu_task_insert(&cl_display, STARPU_VALUE, "handle1", strlen("handle1")+1, STARPU_R, handle1, 0);
 	if (ret == -ENODEV) goto end;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
-	ret = starpu_task_insert(&cl_display, STARPU_VALUE, "handle2", strlen("handle2"), STARPU_R, handle2, 0);
+	ret = starpu_task_insert(&cl_display, STARPU_VALUE, "handle2", strlen("handle2")+1, STARPU_R, handle2, 0);
 	if (ret == -ENODEV) goto end;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 

+ 1 - 1
examples/interface/complex_interface.c

@@ -97,7 +97,7 @@ static void complex_free_data_on_node(void *data_interface, unsigned node)
 static size_t complex_get_size(starpu_data_handle_t handle)
 {
 	size_t size;
-	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, 0);
+	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	size = complex_interface->nx * 2 * sizeof(double);
 	return size;

+ 34 - 1
examples/stencil/Makefile.am

@@ -40,11 +40,44 @@ NVCCFLAGS += $(HWLOC_CFLAGS)
 
 endif
 
+#####################################
+# What to install and what to check #
+#####################################
+
+if STARPU_HAVE_WINDOWS
+check_PROGRAMS	=	$(STARPU_EXAMPLES)
+else
+check_PROGRAMS	=	$(LOADER) $(STARPU_EXAMPLES)
+endif
+
+TESTS		=	$(STARPU_EXAMPLES)
+
+if !STARPU_HAVE_WINDOWS
+## test loader program
+if !STARPU_CROSS_COMPILING
+LOADER			=	loader
+loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+LOADER_BIN		=	./$(LOADER)
+loader_SOURCES		=	../../tests/loader.c
+else
+LOADER			=
+LOADER_BIN		=	$(top_builddir)/tests/loader-cross.sh
+endif
+
+if STARPU_HAVE_AM111
+TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
+LOG_COMPILER		=	$(LOADER_BIN)
+else
+TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(LOADER_BIN)
+endif
+
+endif
+
 ###################
 # stencil example #
 ###################
 
-check_PROGRAMS =				\
+STARPU_EXAMPLES =				\
 	stencil
 
 examplebindir = $(libdir)/starpu/examples/stencil

+ 3 - 13
examples/stencil/life_opencl.c

@@ -84,20 +84,10 @@ void opencl_life_free(void)
 void
 opencl_life_update_host(int bz, const TYPE *old, TYPE *newp, int nx, int ny, int nz, int ldy, int ldz, int iter)
 {
-	unsigned max_parallelism = 512;
-	unsigned threads_per_dim_x = max_parallelism;
-	while (threads_per_dim_x / 2 >= nx)
-		threads_per_dim_x /= 2;
-	unsigned threads_per_dim_y = max_parallelism / threads_per_dim_x;
-	while (threads_per_dim_y / 2 >= ny)
-		threads_per_dim_y /= 2;
 #if 0
-	unsigned threads_per_dim_z = 4;
-	size_t dimBlock[] = {threads_per_dim_x, threads_per_dim_y, threads_per_dim_z};
-	size_t dimGrid[] = {nx / threads_per_dim_x, ny / threads_per_dim_y, nz / threads_per_dim_z};
+	size_t dim[] = {nx, ny, nz};
 #else
-	size_t dimBlock[] = {threads_per_dim_x, threads_per_dim_y, 1};
-	size_t dimGrid[] = {((nx + threads_per_dim_x-1) / threads_per_dim_x)*threads_per_dim_x, ((ny + threads_per_dim_y-1) / threads_per_dim_y)*threads_per_dim_y, 1};
+	size_t dim[] = {nx, ny, 1};
 #endif
 
   int devid,id;
@@ -119,7 +109,7 @@ opencl_life_update_host(int bz, const TYPE *old, TYPE *newp, int nx, int ny, int
   clSetKernelArg(kernel, 8, sizeof(iter), &iter);
 
   cl_event ev;
-  clEnqueueNDRangeKernel(cq, kernel, 3, NULL, dimGrid, dimBlock, 0, NULL, &ev);
+  clEnqueueNDRangeKernel(cq, kernel, 3, NULL, dim, NULL, 0, NULL, &ev);
   clWaitForEvents(1, &ev);
   starpu_opencl_collect_stats(ev);
   clReleaseEvent(ev);

+ 4 - 14
examples/stencil/shadow_opencl.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -81,20 +81,10 @@ void opencl_shadow_free(void)
 void
 opencl_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int ldy, int ldz, int i)
 {
-	unsigned max_parallelism = 512;
-	unsigned threads_per_dim_x = max_parallelism;
-	while (threads_per_dim_x / 2 >= nx)
-		threads_per_dim_x /= 2;
-	unsigned threads_per_dim_y = max_parallelism / threads_per_dim_x;
-	while (threads_per_dim_y / 2 >= ny)
-		threads_per_dim_y /= 2;
 #if 0
-	unsigned threads_per_dim_z = 4;
-	size_t dimBlock[] = {threads_per_dim_x, threads_per_dim_y, threads_per_dim_z};
-	size_t dimGrid[] = {nx / threads_per_dim_x, ny / threads_per_dim_y, nz / threads_per_dim_z};
+	size_t dim[] = {nx, ny, nz};
 #else
-	size_t dimBlock[] = {threads_per_dim_x, threads_per_dim_y, 1};
-	size_t dimGrid[] = {((nx + threads_per_dim_x-1) / threads_per_dim_x)*threads_per_dim_x, ((ny + threads_per_dim_y-1) / threads_per_dim_y)*threads_per_dim_y, 1};
+	size_t dim[] = {nx, ny, 1};
 #endif
 
         int devid,id;
@@ -115,7 +105,7 @@ opencl_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int ldy, int ldz,
         clSetKernelArg(kernel, 7, sizeof(i), &i);
 
         cl_event ev;
-        cl_int err = clEnqueueNDRangeKernel(cq, kernel, 3, NULL, dimGrid, dimBlock, 0, NULL, &ev);
+        cl_int err = clEnqueueNDRangeKernel(cq, kernel, 3, NULL, dim, NULL, 0, NULL, &ev);
         if (err != CL_SUCCESS)
                 STARPU_OPENCL_REPORT_ERROR(err);
         clWaitForEvents(1, &ev);

+ 52 - 3
examples/stencil/stencil-blocks.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2013  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -157,6 +157,12 @@ void create_blocks_array(unsigned _sizex, unsigned _sizey, unsigned _sizez, unsi
 	}
 }
 
+void free_blocks_array()
+{
+	free(blocks);
+	free(block_sizes_z);
+}
+
 /*
  *	Initialization of the blocks
  */
@@ -265,9 +271,16 @@ static void allocate_block_on_node(starpu_data_handle_t *handleptr, TYPE **ptr,
 	starpu_block_data_register(handleptr, STARPU_MAIN_RAM, (uintptr_t)*ptr, nx, nx*ny, nx, ny, nz, sizeof(TYPE));
 }
 
+static void free_block_on_node(starpu_data_handle_t handleptr)
+{
+	void *ptr = (void *) starpu_block_get_local_ptr(handleptr);
+	starpu_data_unregister(handleptr);
+	starpu_free(ptr);
+}
+
 void display_memory_consumption(int rank)
 {
-	fprintf(stderr, "%lu MB of memory were allocated on node %d\n", allocated/(1024*1024), rank);
+	FPRINTF(stderr, "%lu B of memory were allocated on node %d\n", allocated, rank);
 }
 
 void allocate_memory_on_node(int rank)
@@ -280,7 +293,7 @@ void allocate_memory_on_node(int rank)
 		int node = block->mpi_node;
 
 		unsigned size_bz = block_sizes_z[bz];
-	
+
 		/* Main blocks */
 		if (node == rank)
 		{
@@ -322,6 +335,42 @@ void allocate_memory_on_node(int rank)
 	}
 }
 
+void free_memory_on_node(int rank)
+{
+	unsigned bz;
+	for (bz = 0; bz < nbz; bz++)
+	{
+		struct block_description *block = get_block_description(bz);
+
+		int node = block->mpi_node;
+
+		unsigned size_bz = block_sizes_z[bz];
+
+		/* Main blocks */
+		if (node == rank)
+		{
+			free_block_on_node(block->layers_handle[0]);
+			free_block_on_node(block->layers_handle[1]);
+		}
+
+		/* Boundary blocks : Top */
+		int top_node = block->boundary_blocks[T]->mpi_node;
+		if ((node == rank) || (top_node == rank))
+		{
+			free_block_on_node(block->boundaries_handle[T][0]);
+			free_block_on_node(block->boundaries_handle[T][1]);
+		}
+
+		/* Boundary blocks : Bottom */
+		int bottom_node = block->boundary_blocks[B]->mpi_node;
+		if ((node == rank) || (bottom_node == rank))
+		{
+			free_block_on_node(block->boundaries_handle[B][0]);
+			free_block_on_node(block->boundaries_handle[B][1]);
+		}
+	}
+}
+
 /* check how many cells are alive */
 void check(int rank)
 {

+ 16 - 13
examples/stencil/stencil-kernels.c

@@ -18,8 +18,6 @@
 #include "stencil.h"
 #include <sys/time.h>
 
-#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
-
 #ifndef timersub
 #define	timersub(x, y, res) \
 	do \
@@ -216,9 +214,9 @@ static void update_func_cuda(void *descr[], void *arg)
 	int workerid = starpu_worker_get_id();
 	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
 	if (block->bz == 0)
-fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid);
+		FPRINTF(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid);
 	else
-	DEBUG( "!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid);
+		DEBUG( "!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid);
 #ifdef STARPU_USE_MPI
 	int rank = 0;
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
@@ -289,8 +287,8 @@ static void load_subblock_from_buffer_opencl(struct starpu_block_interface *bloc
 	size_t boundary_size = K*block->ldz*block->elemsize;
 
 	unsigned offset = firstz*block->ldz;
-	cl_mem block_data = (cl_mem)block->ptr;
-	cl_mem boundary_data = (cl_mem)boundary->ptr;
+	cl_mem block_data = (cl_mem)block->dev_handle;
+	cl_mem boundary_data = (cl_mem)boundary->dev_handle;
 	cl_event event;
 
         cl_command_queue cq;
@@ -311,9 +309,9 @@ static void update_func_opencl(void *descr[], void *arg)
 	int workerid = starpu_worker_get_id();
 	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
 	if (block->bz == 0)
-fprintf(stderr,"!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, workerid);
+		FPRINTF(stderr,"!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, workerid);
 	else
-	DEBUG( "!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, workerid);
+		DEBUG( "!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, workerid);
 #ifdef STARPU_USE_MPI
 	int rank = 0;
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
@@ -351,7 +349,7 @@ fprintf(stderr,"!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, worke
 	for (i=1; i<=K; i++)
 	{
 		struct starpu_block_interface *oldb = descr[i%2], *newb = descr[(i+1)%2];
-		TYPE *old = (void*) oldb->ptr, *newer = (void*) newb->ptr;
+		TYPE *old = (void*) oldb->dev_handle, *newer = (void*) newb->dev_handle;
 
 		/* Shadow data */
 		opencl_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
@@ -361,15 +359,19 @@ fprintf(stderr,"!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, worke
 		opencl_life_update_host(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 #else
 		cl_event event;
-                clEnqueueCopyBuffer(cq, old, newer, 0, 0, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), 0, NULL, &event);
+                cl_int ret = clEnqueueCopyBuffer(cq, old, newer, 0, 0, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), 0, NULL, &event);
+		if (ret != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(ret);
+
 		clWaitForEvents(1, &event);
 		clReleaseEvent(event);
 #endif /* LIFE */
 	}
 
+#ifndef LIFE
 	cl_int err;
 	if ((err = clFinish(cq)))
 		STARPU_OPENCL_REPORT_ERROR(err);
+#endif
 
 	if (block->bz == 0)
 		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
@@ -526,14 +528,15 @@ static void load_subblock_into_buffer_opencl(struct starpu_block_interface *bloc
 	size_t boundary_size = K*block->ldz*block->elemsize;
 
 	unsigned offset = firstz*block->ldz;
-	cl_mem block_data = (cl_mem)block->ptr;
-	cl_mem boundary_data = (cl_mem)boundary->ptr;
+	cl_mem block_data = (cl_mem)block->dev_handle;
+	cl_mem boundary_data = (cl_mem)boundary->dev_handle;
 
         cl_command_queue cq;
         starpu_opencl_get_current_queue(&cq);
 	cl_event event;
 
-        clEnqueueCopyBuffer(cq, block_data, boundary_data, offset, 0, boundary_size, 0, NULL, &event);
+        cl_int ret = clEnqueueCopyBuffer(cq, block_data, boundary_data, offset, 0, boundary_size, 0, NULL, &event);
+	if (ret != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(ret);
 
 	clWaitForEvents(1, &event);
 	clReleaseEvent(event);

+ 28 - 23
examples/stencil/stencil-tasks.c

@@ -32,7 +32,7 @@
 #if 0
 # define DEBUG(fmt, ...) fprintf(stderr,fmt,##__VA_ARGS__)
 #else
-# define DEBUG(fmt, ...) 
+# define DEBUG(fmt, ...)
 #endif
 
 /*
@@ -65,7 +65,7 @@ static void create_task_save_local(unsigned iter, unsigned z, int dir, int local
 	int ret = starpu_task_submit(save_task);
 	if (ret)
 	{
-		fprintf(stderr, "Could not submit task save: %d\n", ret);
+		FPRINTF(stderr, "Could not submit task save: %d\n", ret);
 		STARPU_ABORT();
 	}
 }
@@ -179,7 +179,6 @@ void create_task_update(unsigned iter, unsigned z, int local_rank)
 	/* We are going to synchronize with the last tasks */
 	if (iter == niter)
 	{
-		task->detach = 0;
 		task->use_tag = 1;
 		task->tag_id = TAG_FINISH(z);
 	}
@@ -207,7 +206,7 @@ void create_task_update(unsigned iter, unsigned z, int local_rank)
 	int ret = starpu_task_submit(task);
 	if (ret)
 	{
-		fprintf(stderr, "Could not submit task update block: %d\n", ret);
+		FPRINTF(stderr, "Could not submit task update block: %d\n", ret);
 		STARPU_ABORT();
 	}
 }
@@ -243,7 +242,7 @@ void create_start_task(int z, int dir)
 	int ret = starpu_task_submit(wait_init);
 	if (ret)
 	{
-		fprintf(stderr, "Could not submit task initial wait: %d\n", ret);
+		FPRINTF(stderr, "Could not submit task initial wait: %d\n", ret);
 		STARPU_ABORT();
 	}
 }
@@ -269,23 +268,23 @@ void create_tasks(int rank)
 
 	for (iter = 0; iter <= niter; iter++)
 	{
-	for (bz = 0; bz < nbz; bz++)
-	{
-		if ((iter > 0) && (get_block_mpi_node(bz) == rank))
-			create_task_update(iter, bz, rank);
-
-	}
-	for (bz = 0; bz < nbz; bz++)
-	{
-		if (iter != niter)
-		{
-			if ((get_block_mpi_node(bz) == rank) || (get_block_mpi_node(bz+1) == rank))
-				create_task_save(iter, bz, +1, rank);
-	
-			if ((get_block_mpi_node(bz) == rank) || (get_block_mpi_node(bz-1) == rank))
-				create_task_save(iter, bz, -1, rank);
-		}
-	}
+	     for (bz = 0; bz < nbz; bz++)
+	     {
+		  if ((iter > 0) && (get_block_mpi_node(bz) == rank))
+			  create_task_update(iter, bz, rank);
+
+	     }
+	     for (bz = 0; bz < nbz; bz++)
+	     {
+		     if (iter != niter)
+		     {
+			     if ((get_block_mpi_node(bz) == rank) || (get_block_mpi_node(bz+1) == rank))
+				     create_task_save(iter, bz, +1, rank);
+
+			     if ((get_block_mpi_node(bz) == rank) || (get_block_mpi_node(bz-1) == rank))
+				     create_task_save(iter, bz, -1, rank);
+		     }
+	     }
 	}
 }
 
@@ -308,7 +307,13 @@ void wait_end_tasks(int rank)
 			struct block_description *block = get_block_description(bz);
 			starpu_data_acquire(block->layers_handle[0], STARPU_R);
 			starpu_data_acquire(block->layers_handle[1], STARPU_R);
+			/* the data_acquire here is done to make sure
+			 * the data is sent back to the ram memory, we
+			 * can safely do a data_release, to avoid the
+			 * data_unregister to block later on
+			 */
+			starpu_data_release(block->layers_handle[0]);
+			starpu_data_release(block->layers_handle[1]);
 		}
 	}
 }
-

+ 36 - 19
examples/stencil/stencil.c

@@ -23,10 +23,15 @@
 /* default parameter values */
 static unsigned  bind_tasks = 0;
 
-static unsigned niter = 32;
 static unsigned ticks = 1000;
 
+#ifdef STARPU_QUICK_CHECK
+static unsigned niter = 4;
+#define SIZE 16
+#else
+static unsigned niter = 32;
 #define SIZE 128
+#endif
 
 /* Problem size */
 static unsigned sizex = SIZE;
@@ -147,6 +152,15 @@ static void init_problem(int argc, char **argv, int rank, int world_size)
 	last_tick = (struct timeval *) calloc(nbz, sizeof(*last_tick));
 }
 
+static void free_problem(int rank)
+{
+     	free_memory_on_node(rank);
+	free_blocks_array();
+	free(who_runs_what);
+	free(who_runs_what_index);
+	free(last_tick);
+}
+
 /*
  *	Main body
  */
@@ -168,7 +182,7 @@ void f(unsigned task_per_worker[STARPU_NMAXWORKERS])
 		{
 			char name[32];
 			starpu_worker_get_name(worker, name, sizeof(name));
-			fprintf(stderr,"\t%s -> %d (%2.2f%%)\n", name, task_per_worker[worker], (100.0*task_per_worker[worker])/total);
+			FPRINTF(stderr,"\t%s -> %d (%2.2f%%)\n", name, task_per_worker[worker], (100.0*task_per_worker[worker])/total);
 		}
 	}
 }
@@ -196,12 +210,12 @@ int main(int argc, char **argv)
 	int thread_support;
 	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support))
 	{
-		fprintf(stderr, "MPI_Init_thread failed\n");
+		FPRINTF(stderr, "MPI_Init_thread failed\n");
 	}
 	if (thread_support == MPI_THREAD_FUNNELED)
-		fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
+		FPRINTF(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
 	if (thread_support < MPI_THREAD_FUNNELED)
-		fprintf(stderr,"Warning: MPI does not have thread support!\n");
+		FPRINTF(stderr,"Warning: MPI does not have thread support!\n");
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 #else
@@ -211,7 +225,7 @@ int main(int argc, char **argv)
 
 	if (rank == 0)
 	{
-		fprintf(stderr, "Running on %d nodes\n", world_size);
+		FPRINTF(stderr, "Running on %d nodes\n", world_size);
 		fflush(stderr);
 	}
 
@@ -238,7 +252,7 @@ int main(int argc, char **argv)
 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
 #endif
 	if (rank == 0)
-		fprintf(stderr, "GO !\n");
+		FPRINTF(stderr, "GO !\n");
 
 	gettimeofday(&start, NULL);
 
@@ -290,6 +304,7 @@ int main(int argc, char **argv)
 	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
 
 	memcpy(who_runs_what, who_runs_what_tmp, nbz * who_runs_what_len * sizeof(*who_runs_what));
+	free(who_runs_what_tmp);
 
 	/* XXX we should do a gather instead, here we assume that non initialized values are still 0 */
 	int *who_runs_what_index_tmp = malloc(nbz * sizeof(*who_runs_what_index));
@@ -297,16 +312,17 @@ int main(int argc, char **argv)
 	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
 
 	memcpy(who_runs_what_index, who_runs_what_index_tmp, nbz * sizeof(*who_runs_what_index));
+	free(who_runs_what_index_tmp);
 #endif
 
 	if (rank == 0)
 	{
 #if 1 
-		fprintf(stderr, "update:\n");
+		FPRINTF(stderr, "update:\n");
 		f(update_per_worker);
-		fprintf(stderr, "top:\n");
+		FPRINTF(stderr, "top:\n");
 		f(top_per_worker);
-		fprintf(stderr, "bottom:\n");
+		FPRINTF(stderr, "bottom:\n");
 		f(bottom_per_worker);
 #endif
 #if 1
@@ -320,20 +336,20 @@ int main(int argc, char **argv)
 			for (bz = 0; bz < nbz; bz++)
 			{
 				if ((bz % nzblocks_per_process) == 0)
-					fprintf(stderr, "| ");
+					FPRINTF(stderr, "| ");
 
 				if (who_runs_what_index[bz] <= iter)
-					fprintf(stderr,"_ ");
+					FPRINTF(stderr,"_ ");
 				else
 				{
 					last = 0;
 					if (who_runs_what[bz + iter * nbz] == -1)
-						fprintf(stderr,"* ");
+						FPRINTF(stderr,"* ");
 					else
-						fprintf(stderr, "%d ", who_runs_what[bz + iter * nbz]);
+						FPRINTF(stderr, "%d ", who_runs_what[bz + iter * nbz]);
 				}
 			}
-			fprintf(stderr, "\n");
+			FPRINTF(stderr, "\n");
 
 			if (last)
 				break;
@@ -342,12 +358,13 @@ int main(int argc, char **argv)
 
 		fflush(stderr);
 
-		fprintf(stdout, "Computation took: %f ms on %d MPI processes\n", max_timing/1000, world_size);
-		fprintf(stdout, "\tMIN : %f ms\n", min_timing/1000);
-		fprintf(stdout, "\tMAX : %f ms\n", max_timing/1000);
-		fprintf(stdout, "\tAVG : %f ms\n", sum_timing/(world_size*1000));
+		FPRINTF(stdout, "Computation took: %f ms on %d MPI processes\n", max_timing/1000, world_size);
+		FPRINTF(stdout, "\tMIN : %f ms\n", min_timing/1000);
+		FPRINTF(stdout, "\tMAX : %f ms\n", max_timing/1000);
+		FPRINTF(stdout, "\tAVG : %f ms\n", sum_timing/(world_size*1000));
 	}
 
+	free_problem(rank);
 	starpu_shutdown();
 
 #ifdef STARPU_USE_MPI

+ 4 - 0
examples/stencil/stencil.h

@@ -29,6 +29,8 @@
 #endif
 #endif
 
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
 #define LIFE
 
 #ifdef LIFE
@@ -91,6 +93,7 @@ int MPI_TAG1(int z, int iter, int dir);
 #define MIN(a,b)	((a)<(b)?(a):(b))
 
 void create_blocks_array(unsigned sizex, unsigned sizey, unsigned sizez, unsigned nbz);
+void free_blocks_array();
 struct block_description *get_block_description(int z);
 void assign_blocks_to_mpi_nodes(int world_size);
 void allocate_memory_on_node(int rank);
@@ -98,6 +101,7 @@ void assign_blocks_to_workers(int rank);
 void create_tasks(int rank);
 void wait_end_tasks(int rank);
 void check(int rank);
+void free_memory_on_node(int rank);
 
 void display_memory_consumption(int rank);
 

+ 4 - 1
include/starpu_task.h

@@ -164,6 +164,8 @@ struct starpu_task
 	void *starpu_private;
 	int magic;
 
+	const char *name;
+
 	unsigned sched_ctx;
 	int hypervisor_tag;
 	double flops;
@@ -197,7 +199,8 @@ struct starpu_task
 	.flops = 0.0,					\
 	.scheduled = 0,					\
 	.dyn_handles = NULL,				\
-	.dyn_interfaces = NULL				\
+	.dyn_interfaces = NULL,				\
+	.name = NULL                        		\
 }
 
 #define STARPU_TASK_GET_HANDLE(task, i) ((task->dyn_handles) ? task->dyn_handles[i] : task->handles[i])

+ 3 - 3
mpi/examples/complex/mpi_complex.c

@@ -77,17 +77,17 @@ int main(int argc, char **argv)
 		{
 			int *compare_ptr = &compare;
 
-			starpu_task_insert(&cl_display, STARPU_VALUE, "node0 initial value", strlen("node0 initial value"), STARPU_R, handle, 0);
+			starpu_task_insert(&cl_display, STARPU_VALUE, "node0 initial value", strlen("node0 initial value")+1, STARPU_R, handle, 0);
 			starpu_mpi_isend_detached(handle, 1, 10, MPI_COMM_WORLD, NULL, NULL);
 			starpu_mpi_irecv_detached(handle2, 1, 20, MPI_COMM_WORLD, NULL, NULL);
 
-			starpu_task_insert(&cl_display, STARPU_VALUE, "node0 received value", strlen("node0 received value"), STARPU_R, handle2, 0);
+			starpu_task_insert(&cl_display, STARPU_VALUE, "node0 received value", strlen("node0 received value")+1, STARPU_R, handle2, 0);
 			starpu_task_insert(&cl_compare, STARPU_R, handle, STARPU_R, handle2, STARPU_VALUE, &compare_ptr, sizeof(compare_ptr), 0);
 		}
 		else if (rank == 1)
 		{
 			starpu_mpi_irecv_detached(handle, 0, 10, MPI_COMM_WORLD, NULL, NULL);
-			starpu_task_insert(&cl_display, STARPU_VALUE, "node1 received value", strlen("node1 received value"), STARPU_R, handle, 0);
+			starpu_task_insert(&cl_display, STARPU_VALUE, "node1 received value", strlen("node1 received value")+1, STARPU_R, handle, 0);
 			starpu_mpi_isend_detached(handle, 0, 20, MPI_COMM_WORLD, NULL, NULL);
 		}
 

+ 3 - 3
mpi/src/starpu_mpi.c

@@ -941,8 +941,8 @@ static void _starpu_mpi_copy_cb(void* arg)
 	else
 	{
 		struct starpu_data_interface_ops *itf = starpu_data_get_interface_ops(args->copy_handle);
-		void* itf_src = starpu_data_get_interface_on_node(args->copy_handle,0);
-		void* itf_dst = starpu_data_get_interface_on_node(args->data_handle,0);
+		void* itf_src = starpu_data_get_interface_on_node(args->copy_handle, STARPU_MAIN_RAM);
+		void* itf_dst = starpu_data_get_interface_on_node(args->data_handle, STARPU_MAIN_RAM);
 
 		if (!itf->copy_methods->ram_to_ram)
 		{
@@ -1380,7 +1380,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 						 */
 						_STARPU_MPI_DEBUG(20, "Posting a receive for a data of size %d which has not yet been registered\n", (int)chandle->env->size);
 						chandle->buffer = malloc(chandle->env->size);
-						starpu_vector_data_register(&chandle->handle, 0, (uintptr_t) chandle->buffer, chandle->env->size, 1);
+						starpu_vector_data_register(&chandle->handle, STARPU_MAIN_RAM, (uintptr_t) chandle->buffer, chandle->env->size, 1);
 						add_chandle(chandle);
 					}
 

+ 4 - 4
mpi/tests/datatypes.c

@@ -195,7 +195,7 @@ int main(int argc, char **argv)
 		{
 			float v = 42.12;
 			starpu_data_handle_t variable_handle[2];
-			starpu_variable_data_register(&variable_handle[0], 0, (uintptr_t)&v, sizeof(v));
+			starpu_variable_data_register(&variable_handle[0], STARPU_MAIN_RAM, (uintptr_t)&v, sizeof(v));
 			starpu_variable_data_register(&variable_handle[1], -1, (uintptr_t)NULL, sizeof(v));
 
 			send_recv_and_check(rank, 1, variable_handle[0], 0x42, variable_handle[1], 0x1337, &error, check_variable);
@@ -208,7 +208,7 @@ int main(int argc, char **argv)
 			int vector[4] = {1, 2, 3, 4};
 			starpu_data_handle_t vector_handle[2];
 
-			starpu_vector_data_register(&vector_handle[0], 0, (uintptr_t)vector, 4, sizeof(vector[0]));
+			starpu_vector_data_register(&vector_handle[0], STARPU_MAIN_RAM, (uintptr_t)vector, 4, sizeof(vector[0]));
 			starpu_vector_data_register(&vector_handle[1], -1, (uintptr_t)NULL, 4, sizeof(vector[0]));
 
 			send_recv_and_check(rank, 1, vector_handle[0], 0x43, vector_handle[1], 0x2337, &error, check_vector);
@@ -232,7 +232,7 @@ int main(int argc, char **argv)
 				}
 			}
 
-			starpu_matrix_data_register(&matrix_handle[0], 0, (uintptr_t)matrix, nx, nx, ny, sizeof(char));
+			starpu_matrix_data_register(&matrix_handle[0], STARPU_MAIN_RAM, (uintptr_t)matrix, nx, nx, ny, sizeof(char));
 			starpu_matrix_data_register(&matrix_handle[1], -1, (uintptr_t)NULL, nx, nx, ny, sizeof(char));
 
 			send_recv_and_check(rank, 1, matrix_handle[0], 0x75, matrix_handle[1], 0x8555, &error, check_matrix);
@@ -260,7 +260,7 @@ int main(int argc, char **argv)
 				}
 			}
 
-			starpu_block_data_register(&block_handle[0], 0, (uintptr_t)block, nx, nx*ny, nx, ny, nz, sizeof(float));
+			starpu_block_data_register(&block_handle[0], STARPU_MAIN_RAM, (uintptr_t)block, nx, nx*ny, nx, ny, nz, sizeof(float));
 			starpu_block_data_register(&block_handle[1], -1, (uintptr_t)NULL, nx, nx*ny, nx, ny, nz, sizeof(float));
 
 			send_recv_and_check(rank, 1, block_handle[0], 0x73, block_handle[1], 0x8337, &error, check_block);

+ 2 - 2
mpi/tests/gather.c

@@ -49,7 +49,7 @@ int main(int argc, char **argv)
 			MPI_Status status;
 
 			FPRINTF_MPI("receiving from node %d\n", n);
-			starpu_variable_data_register(&handle, 0, (uintptr_t)&var, sizeof(var));
+			starpu_variable_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)&var, sizeof(var));
 			starpu_mpi_recv(handle, n, 42, MPI_COMM_WORLD, &status);
 			starpu_data_acquire(handle, STARPU_R);
 			STARPU_ASSERT_MSG(var == n, "Received incorrect value <%d> from node <%d>\n", var, n);
@@ -62,7 +62,7 @@ int main(int argc, char **argv)
 	{
 		FPRINTF_MPI("sending to node %d\n", 0);
 		var = rank;
-		starpu_variable_data_register(&handle, 0, (uintptr_t)&var, sizeof(var));
+		starpu_variable_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)&var, sizeof(var));
 		starpu_mpi_send(handle, 0, 42, MPI_COMM_WORLD);
 		starpu_data_unregister(handle);
 	}

+ 2 - 2
mpi/tests/gather2.c

@@ -50,7 +50,7 @@ int main(int argc, char **argv)
 
 			FPRINTF_MPI("receiving from node %d\n", n);
 			for(i=0 ; i<2 ; i++)
-				starpu_variable_data_register(&handle[i], 0, (uintptr_t)&var[i], sizeof(var[i]));
+				starpu_variable_data_register(&handle[i], STARPU_MAIN_RAM, (uintptr_t)&var[i], sizeof(var[i]));
 
 			starpu_mpi_recv(handle[0], n, 42, MPI_COMM_WORLD, &status[0]);
 			starpu_data_acquire(handle[0], STARPU_R);
@@ -81,7 +81,7 @@ int main(int argc, char **argv)
 		var[1] = var[0] * 2;
 		var[2] = var[0] * 4;
 		for(i=0 ; i<3 ; i++)
-			starpu_variable_data_register(&handle[i], 0, (uintptr_t)&var[i], sizeof(var[i]));
+			starpu_variable_data_register(&handle[i], STARPU_MAIN_RAM, (uintptr_t)&var[i], sizeof(var[i]));
 		starpu_mpi_send(handle[0], 0, 42, MPI_COMM_WORLD);
 		starpu_mpi_send(handle[1], 0, 42, MPI_COMM_WORLD);
 		starpu_mpi_send(handle[2], 0, 44, MPI_COMM_WORLD);

+ 5 - 2
mpi/tests/insert_task_owner2.c

@@ -29,7 +29,7 @@ void func_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 	//*x2 = 45;
 	//*y = 144;
 
-	FPRINTF(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
+	FPRINTF(stderr, "-------> CODELET VALUES: %d %d (x2) %d\n", *x0, *x1, *y);
 	*y = (*x0 + *x1) * 100;
 	*x1 = 12;
 	*x2 = 24;
@@ -113,7 +113,10 @@ int main(int argc, char **argv)
 		}
 		starpu_data_unregister(data_handles[i]);
 	}
-        FPRINTF(stderr, "[%d][local ptr] VALUES: %d %d %d %d\n", rank, values[0], values[1], values[2], values[3]);
+	if (rank == 0)
+	{
+		FPRINTF(stderr, "[%d][local ptr] VALUES: %d %d %d %d\n", rank, values[0], values[1], values[2], values[3]);
+	}
         FPRINTF(stderr, "[%d][end] VALUES: %d %d %d %d\n", rank, x[0], x[1], x[2], y);
 
 	free(values);

+ 9 - 5
mpi/tests/insert_task_owner_data.c

@@ -86,13 +86,17 @@ int main(int argc, char **argv)
 		{
 			starpu_data_acquire(data_handles[i], STARPU_R);
 			values[i] = *((int *)starpu_data_get_local_ptr(data_handles[i]));
-			starpu_data_release(data_handles[i]);
-		}
+			starpu_data_release(data_handles[i]);		}
 	}
-	FPRINTF(stderr, "[%d][local ptr] VALUES: %d %d\n", rank, values[0], values[1]);
 	ret = 0;
-	if (rank == 0 && (values[0] != 12 || values[1] != 144))
-		ret = EXIT_FAILURE;
+	if (rank == 0)
+	{
+		FPRINTF(stderr, "[%d][local ptr] VALUES: %d %d\n", rank, values[0], values[1]);
+		if (values[0] != 12 || values[1] != 144)
+		{
+			ret = EXIT_FAILURE;
+		}
+	}
 
 	starpu_data_unregister(data_handles[0]);
 	starpu_data_unregister(data_handles[1]);

+ 1 - 1
mpi/tests/user_defined_datatype_value.h

@@ -26,7 +26,7 @@ struct starpu_value_interface
 int *starpu_value_get(starpu_data_handle_t handle)
 {
 	struct starpu_value_interface *value_interface =
-		(struct starpu_value_interface *) starpu_data_get_interface_on_node(handle, 0);
+		(struct starpu_value_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 	return value_interface->value;
 }
 

+ 2 - 3
socl/src/Makefile.am

@@ -132,6 +132,5 @@ libsocl_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 						\
   cl_getextensionfunctionaddress.c \
   cl_icdgetplatformidskhr.c
 
-
-
-
+showcheck:
+	-cat /dev/null

+ 39 - 1
src/common/fxt.h

@@ -140,6 +140,12 @@
 #define _STARPU_FUT_START_UNPARTITION 0x5154
 #define _STARPU_FUT_END_UNPARTITION 0x5155
 
+#define	_STARPU_FUT_START_FREE		0x5156
+#define	_STARPU_FUT_END_FREE		0x5157
+
+#define	_STARPU_FUT_START_WRITEBACK	0x5158
+#define	_STARPU_FUT_END_WRITEBACK	0x5159
+
 #ifdef STARPU_USE_FXT
 #include <fxt/fxt.h>
 #include <fxt/fut.h>
@@ -249,6 +255,22 @@ do {									\
     }									\
 } while (0);
 
+#ifndef FUT_RAW_PROBE7
+#define FUT_RAW_PROBE7(CODE,P1,P2,P3,P4,P5,P6,P7) do {		\
+		if(fut_active) {					\
+			unsigned long *args __attribute__((unused))=	\
+				fut_getstampedbuffer(CODE,		\
+						     FUT_SIZE(7)); \
+			*(args++)=(unsigned long)(P1);*(args++)=(unsigned long)(P2);*(args++)=(unsigned long)(P3);*(args++)=(unsigned long)(P4);*(args++)=(unsigned long)(P5);*(args++)=(unsigned long)(P6);*(args++)=(unsigned long)(P7);				\
+				}					\
+	} while (0)
+#endif
+
+#ifndef FUT_DO_PROBE7
+#define FUT_DO_PROBE7(CODE,P1,P2,P3,P4,P5,P6,P7) do { \
+        FUT_RAW_PROBE7(FUT_CODE(CODE, 7),P1,P2,P3,P4,P5,P6,P7); \
+} while (0)
+#endif
 
 
 /* workerkind = _STARPU_FUT_CPU_KEY for instance */
@@ -278,7 +300,7 @@ do {									\
 do {									\
 	const size_t job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));	\
 	const uint32_t job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));\
-	FUT_DO_PROBE5(_STARPU_FUT_END_CODELET_BODY, (job), (job_size), (job_hash), (archtype), _starpu_gettid());	\
+	FUT_DO_PROBE7(_STARPU_FUT_END_CODELET_BODY, (job), (job_size), (job_hash), (archtype)->type, (archtype)->devid, (archtype)->ncore, _starpu_gettid());	\
 } while(0);
 
 #define _STARPU_TRACE_START_CALLBACK(job)	\
@@ -394,6 +416,18 @@ do {										\
 #define _STARPU_TRACE_END_ALLOC_REUSE(memnode)		\
 	FUT_DO_PROBE2(_STARPU_FUT_END_ALLOC_REUSE, memnode, _starpu_gettid());
 	
+#define _STARPU_TRACE_START_FREE(memnode, size)		\
+	FUT_DO_PROBE3(_STARPU_FUT_START_FREE, memnode, _starpu_gettid(), size);
+	
+#define _STARPU_TRACE_END_FREE(memnode)		\
+	FUT_DO_PROBE2(_STARPU_FUT_END_FREE, memnode, _starpu_gettid());
+
+#define _STARPU_TRACE_START_WRITEBACK(memnode)		\
+	FUT_DO_PROBE2(_STARPU_FUT_START_WRITEBACK, memnode, _starpu_gettid());
+	
+#define _STARPU_TRACE_END_WRITEBACK(memnode)		\
+	FUT_DO_PROBE2(_STARPU_FUT_END_WRITEBACK, memnode, _starpu_gettid());
+
 #define _STARPU_TRACE_START_MEMRECLAIM(memnode,is_prefetch)		\
 	FUT_DO_PROBE3(_STARPU_FUT_START_MEMRECLAIM, memnode, is_prefetch, _starpu_gettid());
 	
@@ -612,6 +646,10 @@ do {										\
 #define _STARPU_TRACE_END_ALLOC(memnode)		do {} while(0)
 #define _STARPU_TRACE_START_ALLOC_REUSE(a, size)	do {} while(0)
 #define _STARPU_TRACE_END_ALLOC_REUSE(a)		do {} while(0)
+#define _STARPU_TRACE_START_FREE(memnode, size)	do {} while(0)
+#define _STARPU_TRACE_END_FREE(memnode)		do {} while(0)
+#define _STARPU_TRACE_START_WRITEBACK(memnode)	do {} while(0)
+#define _STARPU_TRACE_END_WRITEBACK(memnode)		do {} while(0)
 #define _STARPU_TRACE_START_MEMRECLAIM(memnode,is_prefetch)	do {} while(0)
 #define _STARPU_TRACE_END_MEMRECLAIM(memnode,is_prefetch)	do {} while(0)
 #define _STARPU_TRACE_START_PROGRESS(memnode)	do {} while(0)

+ 3 - 6
src/core/dependencies/implicit_data_deps.c

@@ -255,10 +255,9 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 					 * number of dependencies. */
 					struct starpu_task *sync_task = starpu_task_create();
 					STARPU_ASSERT(sync_task);
+					sync_task->name = "sync_task_redux";
 					sync_task->cl = NULL;
-#ifdef STARPU_USE_FXT
-					_starpu_get_job_associated_to_task(sync_task)->model_name = "sync_task_redux";
-#endif
+
 					/* Make this task wait for the previous ones */
 					_starpu_add_sync_task(handle, sync_task, sync_task);
 					/* And the requested task wait for this one */
@@ -528,11 +527,9 @@ int _starpu_data_wait_until_available(starpu_data_handle_t handle, enum starpu_d
 	{
 		struct starpu_task *sync_task, *new_task;
 		sync_task = starpu_task_create();
+		sync_task->name = "sync_task_seq_cons";
 		sync_task->detach = 0;
 		sync_task->destroy = 1;
-#ifdef STARPU_USE_FXT
-		_starpu_get_job_associated_to_task(sync_task)->model_name = "sync_task";
-#endif
 
 		/* It is not really a RW access, but we want to make sure that
 		 * all previous accesses are done */

+ 1 - 5
src/core/jobs.h

@@ -106,7 +106,7 @@ LIST_TYPE(_starpu_job,
 	 * with dot) */
         unsigned exclude_from_dag;
 
-	/* Is that task internal to StarPU ? */
+	/* Is that task internal to StarPU? */
 	unsigned internal;
 
 	/* Each job is attributed a unique id. */
@@ -122,10 +122,6 @@ LIST_TYPE(_starpu_job,
 	 * local variable */
 	struct timespec cl_start;
 
-	/* A symbol name may be associated to the job directly for debug
-	 * purposes (for instance if the codelet is NULL). */
-        const char *model_name;
-
 	struct bound_task *bound_task;
 
 	/* Number of workers executing that task (>1 if the task is parallel)

+ 7 - 6
src/core/perfmodel/perfmodel_history.c

@@ -35,7 +35,6 @@
 #include <windows.h>
 #endif
 
-#define HISTORYMAXERROR	(STARPU_HISTORYMAXERROR > 100 ? 10 : STARPU_HISTORYMAXERROR)
 #define HASH_ADD_UINT32_T(head,field,add) HASH_ADD(hh,head,field,sizeof(uint32_t),add)
 #define HASH_FIND_UINT32_T(head,find,out) HASH_FIND(hh,head,find,sizeof(uint32_t),out)
 
@@ -262,6 +261,7 @@ static void parse_per_arch_model_file(FILE *f, struct starpu_perfmodel_per_arch
 			 * good-enough estimation */
 			STARPU_HG_DISABLE_CHECKING(entry->nsample);
 			STARPU_HG_DISABLE_CHECKING(entry->mean);
+			entry->nerror = 0;
 		}
 
 		scan_history_entry(f, entry);
@@ -625,7 +625,7 @@ static void initialize_model_with_file(FILE*f, struct starpu_perfmodel *model)
 		STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
 
 		if(ndevice != 0)
-			maxncore = malloc(sizeof((*maxncore)*ndevice));
+			maxncore = malloc(sizeof(*maxncore)*ndevice);
 		else
 			maxncore = NULL;
 
@@ -1067,7 +1067,6 @@ int starpu_perfmodel_list(FILE *output)
 int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *model)
 {
 	model->symbol = strdup(symbol);
-	starpu_perfmodel_init(model);
 
 	/* where is the file if it exists ? */
 	char path[256];
@@ -1142,7 +1141,7 @@ char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype)
 
 void starpu_perfmodel_get_arch_name(struct starpu_perfmodel_arch* arch, char *archname, size_t maxlen,unsigned nimpl)
 {
-	snprintf(archname, maxlen, "%s_%dncore_%dimpl_%u",
+	snprintf(archname, maxlen, "%s%d_ncore%d_impl%u",
 			starpu_perfmodel_get_archtype_name(arch->type),
 			arch->devid,
 			arch->ncore,
@@ -1320,9 +1319,11 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 			{
 				/* There is already an entry with the same footprint */
 
-				double local_deviation = (measured/entry->mean)*100;
+				double local_deviation = measured/entry->mean;
 				
-				if (entry->nsample && (local_deviation < (100 - HISTORYMAXERROR) || local_deviation > (100 + HISTORYMAXERROR)))
+				if (entry->nsample &&
+					(100 * local_deviation > (100 + STARPU_HISTORYMAXERROR)
+					 || (100 / local_deviation > (100 + STARPU_HISTORYMAXERROR))))
 				{
 					entry->nerror++;
 

+ 19 - 0
src/core/perfmodel/perfmodel_nan.c

@@ -23,6 +23,24 @@
 #include <string.h>
 #include <config.h>
 #include <core/perfmodel/perfmodel.h>
+#include <ctype.h>
+
+#ifdef STARPU_HAVE_WINDOWS
+static
+void _starpu_read_spaces(FILE *f)
+{
+	int c = getc(f);
+	if (isspace(c))
+	{
+		while (isspace(c)) c = getc(f);
+		ungetc(c, f);
+	}
+	else
+	{
+		ungetc(c, f);
+	}
+}
+#endif /* STARPU_HAVE_WINDOWS */
 
 int _starpu_read_double(FILE *f, char *format, double *val)
 {
@@ -36,6 +54,7 @@ int _starpu_read_double(FILE *f, char *format, double *val)
 	     int x3 = getc(f);
 	     if (x2 == 'a' && x3 == 'n')
 	     {
+		     _starpu_read_spaces(f);
 		     *val = NAN;
 		     return 1;
 	     }

+ 32 - 10
src/core/sched_ctx.c

@@ -847,18 +847,40 @@ void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 			return;
 		}
 		STARPU_PTHREAD_MUTEX_UNLOCK(&finished_submit_mutex);
-		/* FIXME: */
-		/* We also need to check for config->submitting = 0 (i.e. the
-		 * user calle starpu_drivers_request_termination()), in which
-		 * case we need to set config->running to 0 and wake workers,
-		 * so they can terminate, just like
-		 * starpu_drivers_request_termination() does.
-		 *
-		 * Set FIXME to 1 in tests/main/driver_api/run_driver.c to
-		 * check it is actually fixed.
-		 */
 	}
 
+	/* We also need to check for config->submitting = 0 (i.e. the
+	 * user calle starpu_drivers_request_termination()), in which
+	 * case we need to set config->running to 0 and wake workers,
+	 * so they can terminate, just like
+	 * starpu_drivers_request_termination() does.
+	 */
+
+	STARPU_PTHREAD_MUTEX_LOCK(&config->submitted_mutex);
+	if(config->submitting == 0)
+	{
+		STARPU_PTHREAD_RWLOCK_RDLOCK(&changing_ctx_mutex[sched_ctx_id]);
+		if(sched_ctx->id != STARPU_NMAX_SCHED_CTXS)
+		{
+			if(sched_ctx->close_callback)
+				sched_ctx->close_callback(sched_ctx->id, sched_ctx->close_args);
+		}
+		STARPU_PTHREAD_RWLOCK_UNLOCK(&changing_ctx_mutex[sched_ctx_id]);
+
+		ANNOTATE_HAPPENS_AFTER(&config->running);
+		config->running = 0;
+		ANNOTATE_HAPPENS_BEFORE(&config->running);
+		int s;
+		for(s = 0; s < STARPU_NMAX_SCHED_CTXS; s++)
+		{
+			if(config->sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
+			{
+				_starpu_check_nsubmitted_tasks_of_sched_ctx(config->sched_ctxs[s].id);
+			}
+		}
+	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&config->submitted_mutex);
+
 	return;
 }
 

+ 2 - 1
src/core/sched_policy.c

@@ -473,12 +473,13 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 #endif
 
 	conversion_task = starpu_task_create();
+	conversion_task->name = "conversion_task";
 	conversion_task->synchronous = 0;
 	STARPU_TASK_SET_HANDLE(conversion_task, handle, 0);
 
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_USE_CUDA) || defined(STARPU_USE_MIC) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID)
 	/* The node does not really matter here */
-	format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, 0);
+	format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 #endif
 
 	_starpu_spin_lock(&handle->header_lock);

+ 10 - 7
src/core/task.c

@@ -41,7 +41,6 @@
  * sure that no task remains !) */
 /* TODO we could make this hierarchical to avoid contention ? */
 //static starpu_pthread_cond_t submitted_cond = STARPU_PTHREAD_COND_INITIALIZER;
-static starpu_pthread_mutex_t submitted_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 
 /* This key stores the task currently handled by the thread, note that we
  * cannot use the worker structure to store that information because it is
@@ -84,6 +83,7 @@ void starpu_task_init(struct starpu_task *task)
 
 	task->dyn_handles = NULL;
 	task->dyn_interfaces = NULL;
+	task->name = NULL;
 }
 
 /* Free all the ressources allocated for a task, without deallocating the task
@@ -804,7 +804,7 @@ starpu_drivers_request_termination(void)
 {
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
 
-	STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
+	STARPU_PTHREAD_MUTEX_LOCK(&config->submitted_mutex);
 	int nsubmitted = starpu_task_nsubmitted();
 	config->submitting = 0;
 	if (nsubmitted == 0)
@@ -812,6 +812,7 @@ starpu_drivers_request_termination(void)
 		ANNOTATE_HAPPENS_AFTER(&config->running);
 		config->running = 0;
 		ANNOTATE_HAPPENS_BEFORE(&config->running);
+		STARPU_WMB();
 		int s;
 		for(s = 0; s < STARPU_NMAX_SCHED_CTXS; s++)
 		{
@@ -822,7 +823,7 @@ starpu_drivers_request_termination(void)
 		}
 	}
 
-	STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&config->submitted_mutex);
 }
 
 int starpu_task_nsubmitted(void)
@@ -1018,16 +1019,16 @@ static void *watchdog_func(void *foo STARPU_ATTRIBUTE_UNUSED)
 	ts.tv_nsec = (timeout % 1000000) * 1000;
 	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
 	
-	STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
+	STARPU_PTHREAD_MUTEX_LOCK(&config->submitted_mutex);
 	while (_starpu_machine_is_running())
 	{
 		int last_nsubmitted = starpu_task_nsubmitted();
 		config->watchdog_ok = 0;
-		STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&config->submitted_mutex);
 
 		_starpu_sleep(ts);
 
-		STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
+		STARPU_PTHREAD_MUTEX_LOCK(&config->submitted_mutex);
 		if (!config->watchdog_ok && last_nsubmitted
 				&& last_nsubmitted == starpu_task_nsubmitted())
 		{
@@ -1041,12 +1042,14 @@ static void *watchdog_func(void *foo STARPU_ATTRIBUTE_UNUSED)
 				fprintf(stderr,"Set the STARPU_WATCHDOG_CRASH environment variable if you want to abort the process in such a case\n");
 		}
 	}
-	STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&config->submitted_mutex);
 	return NULL;
 }
 
 void _starpu_watchdog_init(void)
 {
+	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	STARPU_PTHREAD_MUTEX_INIT(&config->submitted_mutex, NULL);
 	STARPU_PTHREAD_CREATE(&watchdog_thread, NULL, watchdog_func, NULL);
 }
 

+ 4 - 4
src/core/workers.c

@@ -280,8 +280,6 @@ int starpu_worker_can_execute_task(unsigned workerid, struct starpu_task *task,
 int starpu_combined_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl)
 {
 	/* TODO: check that the task operand sizes will fit on that device */
-	/* TODO: call application-provided function for various cases like
-	 * double support, shared memory size limit, etc. */
 
 	struct starpu_codelet *cl = task->cl;
 	unsigned nworkers = config.topology.nworkers;
@@ -290,7 +288,8 @@ int starpu_combined_worker_can_execute_task(unsigned workerid, struct starpu_tas
 	if (workerid < nworkers)
 	{
 		return !!((task->cl->where & config.workers[workerid].worker_mask) &&
-				_starpu_can_use_nth_implementation(config.workers[workerid].arch, task->cl, nimpl));
+				_starpu_can_use_nth_implementation(config.workers[workerid].arch, task->cl, nimpl) &&
+				(!task->cl->can_execute || task->cl->can_execute(workerid, task, nimpl)));
 	}
 	else
 	{
@@ -311,7 +310,8 @@ int starpu_combined_worker_can_execute_task(unsigned workerid, struct starpu_tas
 			int worker_size = (int)config.combined_workers[workerid - nworkers].worker_size;
 			int worker0 = config.combined_workers[workerid - nworkers].combined_workerid[0];
 			return !!((worker_size <= task->cl->max_parallelism) &&
-				_starpu_can_use_nth_implementation(config.workers[worker0].arch, task->cl, nimpl));
+				_starpu_can_use_nth_implementation(config.workers[worker0].arch, task->cl, nimpl) &&
+				(!task->cl->can_execute || task->cl->can_execute(workerid, task, nimpl)));
 		}
 		else
 		{

+ 2 - 0
src/core/workers.h

@@ -314,6 +314,8 @@ struct _starpu_machine_config
 	unsigned submitting;
 
 	int watchdog_ok;
+
+	starpu_pthread_mutex_t submitted_mutex;
 };
 
 /* Three functions to manage argv, argc */

+ 3 - 0
src/datawizard/copy_driver.c

@@ -495,6 +495,9 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 
 		_STARPU_TRACE_START_DRIVER_COPY(src_node, dst_node, size, com_id);
 		ret_copy = copy_data_1_to_1_generic(handle, src_replicate, dst_replicate, req);
+		if (!req)
+			/* Synchronous, this is already finished */
+			_STARPU_TRACE_END_DRIVER_COPY(src_node, dst_node, size, com_id);
 
 		return ret_copy;
 	}

+ 6 - 1
src/datawizard/datastats.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -29,6 +29,7 @@ static unsigned miss_cnt[STARPU_MAXNODES];
 void _starpu_msi_cache_hit(unsigned node STARPU_ATTRIBUTE_UNUSED)
 {
 #ifdef STARPU_ENABLE_STATS
+	STARPU_HG_DISABLE_CHECKING(hit_cnt[node]);
 	hit_cnt[node]++;
 #endif
 }
@@ -36,6 +37,7 @@ void _starpu_msi_cache_hit(unsigned node STARPU_ATTRIBUTE_UNUSED)
 void _starpu_msi_cache_miss(unsigned node STARPU_ATTRIBUTE_UNUSED)
 {
 #ifdef STARPU_ENABLE_STATS
+	STARPU_HG_DISABLE_CHECKING(miss_cnt[node]);
 	miss_cnt[node]++;
 #endif
 }
@@ -81,6 +83,7 @@ static unsigned alloc_cache_hit_cnt[STARPU_MAXNODES];
 void _starpu_allocation_cache_hit(unsigned node STARPU_ATTRIBUTE_UNUSED)
 {
 #ifdef STARPU_ENABLE_STATS
+	STARPU_HG_DISABLE_CHECKING(alloc_cache_hit_cnt[node]);
 	alloc_cache_hit_cnt[node]++;
 #endif
 }
@@ -88,6 +91,7 @@ void _starpu_allocation_cache_hit(unsigned node STARPU_ATTRIBUTE_UNUSED)
 void _starpu_data_allocation_inc_stats(unsigned node STARPU_ATTRIBUTE_UNUSED)
 {
 #ifdef STARPU_ENABLE_STATS
+	STARPU_HG_DISABLE_CHECKING(alloc_cnt[node]);
 	alloc_cnt[node]++;
 #endif
 }
@@ -122,6 +126,7 @@ static size_t comm_amount[STARPU_MAXNODES][STARPU_MAXNODES];
 void _starpu_comm_amounts_inc(unsigned src  STARPU_ATTRIBUTE_UNUSED, unsigned dst  STARPU_ATTRIBUTE_UNUSED, size_t size  STARPU_ATTRIBUTE_UNUSED)
 {
 #ifdef STARPU_ENABLE_STATS
+	STARPU_HG_DISABLE_CHECKING(comm_amount[src][dst]);
 	comm_amount[src][dst] += size;
 #endif /* STARPU_ENABLE_STATS */
 }

+ 2 - 0
src/datawizard/filters.c

@@ -312,6 +312,8 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 				.nbuffers = 1
 			};
 			struct starpu_task *task = starpu_task_create();
+			task->name = "convert_data";
+
 			STARPU_TASK_SET_HANDLE(task, child_handle, 0);
 			task->cl = &cl;
 			task->synchronous = 1;

+ 8 - 8
src/datawizard/interfaces/bcsr_interface.c

@@ -142,7 +142,7 @@ static int bcsr_compare(void *data_interface_a, void *data_interface_b)
 uint32_t starpu_bcsr_get_nnz(starpu_data_handle_t handle)
 {
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return data_interface->nnz;
 }
@@ -150,7 +150,7 @@ uint32_t starpu_bcsr_get_nnz(starpu_data_handle_t handle)
 uint32_t starpu_bcsr_get_nrow(starpu_data_handle_t handle)
 {
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return data_interface->nrow;
 }
@@ -158,7 +158,7 @@ uint32_t starpu_bcsr_get_nrow(starpu_data_handle_t handle)
 uint32_t starpu_bcsr_get_firstentry(starpu_data_handle_t handle)
 {
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return data_interface->firstentry;
 }
@@ -166,7 +166,7 @@ uint32_t starpu_bcsr_get_firstentry(starpu_data_handle_t handle)
 uint32_t starpu_bcsr_get_r(starpu_data_handle_t handle)
 {
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return data_interface->r;
 }
@@ -174,7 +174,7 @@ uint32_t starpu_bcsr_get_r(starpu_data_handle_t handle)
 uint32_t starpu_bcsr_get_c(starpu_data_handle_t handle)
 {
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return data_interface->c;
 }
@@ -182,7 +182,7 @@ uint32_t starpu_bcsr_get_c(starpu_data_handle_t handle)
 size_t starpu_bcsr_get_elemsize(starpu_data_handle_t handle)
 {
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return data_interface->elemsize;
 }
@@ -204,7 +204,7 @@ uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle_t handle)
 {
 	/* XXX 0 */
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return data_interface->colind;
 }
@@ -213,7 +213,7 @@ uint32_t *starpu_bcsr_get_local_rowptr(starpu_data_handle_t handle)
 {
 	/* XXX 0 */
 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return data_interface->rowptr;
 }

+ 6 - 6
src/datawizard/interfaces/block_interface.c

@@ -194,7 +194,7 @@ static void display_block_interface(starpu_data_handle_t handle, FILE *f)
 {
 	struct starpu_block_interface *block_interface;
 
-	block_interface = (struct starpu_block_interface *) starpu_data_get_interface_on_node(handle, 0);
+	block_interface = (struct starpu_block_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	fprintf(f, "%u\t%u\t%u\t", block_interface->nx, block_interface->ny, block_interface->nz);
 }
@@ -265,7 +265,7 @@ static size_t block_interface_get_size(starpu_data_handle_t handle)
 	size_t size;
 	struct starpu_block_interface *block_interface;
 
-	block_interface = (struct starpu_block_interface *) starpu_data_get_interface_on_node(handle, 0);
+	block_interface = (struct starpu_block_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	size = block_interface->nx*block_interface->ny*block_interface->nz*block_interface->elemsize;
 
@@ -276,7 +276,7 @@ static size_t block_interface_get_size(starpu_data_handle_t handle)
 uint32_t starpu_block_get_nx(starpu_data_handle_t handle)
 {
 	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return block_interface->nx;
 }
@@ -284,7 +284,7 @@ uint32_t starpu_block_get_nx(starpu_data_handle_t handle)
 uint32_t starpu_block_get_ny(starpu_data_handle_t handle)
 {
 	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return block_interface->ny;
 }
@@ -292,7 +292,7 @@ uint32_t starpu_block_get_ny(starpu_data_handle_t handle)
 uint32_t starpu_block_get_nz(starpu_data_handle_t handle)
 {
 	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return block_interface->nz;
 }
@@ -339,7 +339,7 @@ uintptr_t starpu_block_get_local_ptr(starpu_data_handle_t handle)
 size_t starpu_block_get_elemsize(starpu_data_handle_t handle)
 {
 	struct starpu_block_interface *block_interface = (struct starpu_block_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return block_interface->elemsize;
 }

+ 3 - 3
src/datawizard/interfaces/coo_interface.c

@@ -151,7 +151,7 @@ coo_interface_get_size(starpu_data_handle_t handle)
 {
 	struct starpu_coo_interface *coo_interface;
 	coo_interface = (struct starpu_coo_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return coo_interface->nx * coo_interface->ny * coo_interface->elemsize;
 }
@@ -161,7 +161,7 @@ coo_interface_footprint(starpu_data_handle_t handle)
 {
 	struct starpu_coo_interface *coo_interface;
 	coo_interface = (struct starpu_coo_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return starpu_hash_crc32c_be(coo_interface->nx * coo_interface->ny, 0);
 }
@@ -185,7 +185,7 @@ display_coo_interface(starpu_data_handle_t handle, FILE *f)
 {
 	struct starpu_coo_interface *coo_interface =
 	coo_interface = (struct starpu_coo_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	fprintf(f, "%u\t%u", coo_interface->nx, coo_interface->ny);
 }

+ 4 - 4
src/datawizard/interfaces/csr_interface.c

@@ -127,7 +127,7 @@ static int csr_compare(void *data_interface_a, void *data_interface_b)
 uint32_t starpu_csr_get_nnz(starpu_data_handle_t handle)
 {
 	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return csr_interface->nnz;
 }
@@ -135,7 +135,7 @@ uint32_t starpu_csr_get_nnz(starpu_data_handle_t handle)
 uint32_t starpu_csr_get_nrow(starpu_data_handle_t handle)
 {
 	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return csr_interface->nrow;
 }
@@ -143,7 +143,7 @@ uint32_t starpu_csr_get_nrow(starpu_data_handle_t handle)
 uint32_t starpu_csr_get_firstentry(starpu_data_handle_t handle)
 {
 	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return csr_interface->firstentry;
 }
@@ -151,7 +151,7 @@ uint32_t starpu_csr_get_firstentry(starpu_data_handle_t handle)
 size_t starpu_csr_get_elemsize(starpu_data_handle_t handle)
 {
 	struct starpu_csr_interface *csr_interface = (struct starpu_csr_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return csr_interface->elemsize;
 }

+ 2 - 2
src/datawizard/interfaces/data_interface.c

@@ -365,7 +365,7 @@ void starpu_data_register(starpu_data_handle_t *handleptr, unsigned home_node,
 
 void starpu_data_register_same(starpu_data_handle_t *handledst, starpu_data_handle_t handlesrc)
 {
-	void *local_interface = starpu_data_get_interface_on_node(handlesrc, 0);
+	void *local_interface = starpu_data_get_interface_on_node(handlesrc, STARPU_MAIN_RAM);
 	starpu_data_register(handledst, -1, local_interface, handlesrc->ops);
 }
 
@@ -620,7 +620,7 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 			_STARPU_DEBUG("Conversion needed\n");
 			void *buffers[1];
 			struct starpu_multiformat_interface *format_interface;
-			format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, 0);
+			format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 			struct starpu_codelet *cl = NULL;
 			enum starpu_node_kind node_kind = starpu_node_get_kind(handle->mf_node);
 

+ 5 - 5
src/datawizard/interfaces/matrix_interface.c

@@ -195,7 +195,7 @@ static int matrix_compare(void *data_interface_a, void *data_interface_b)
 static void display_matrix_interface(starpu_data_handle_t handle, FILE *f)
 {
 	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	fprintf(f, "%u\t%u\t", matrix_interface->nx, matrix_interface->ny);
 }
@@ -253,7 +253,7 @@ static int unpack_matrix_handle(starpu_data_handle_t handle, unsigned node, void
 static size_t matrix_interface_get_size(starpu_data_handle_t handle)
 {
 	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	size_t size;
 	size = (size_t)matrix_interface->nx*matrix_interface->ny*matrix_interface->elemsize;
@@ -265,7 +265,7 @@ static size_t matrix_interface_get_size(starpu_data_handle_t handle)
 uint32_t starpu_matrix_get_nx(starpu_data_handle_t handle)
 {
 	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return matrix_interface->nx;
 }
@@ -273,7 +273,7 @@ uint32_t starpu_matrix_get_nx(starpu_data_handle_t handle)
 uint32_t starpu_matrix_get_ny(starpu_data_handle_t handle)
 {
 	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return matrix_interface->ny;
 }
@@ -307,7 +307,7 @@ uintptr_t starpu_matrix_get_local_ptr(starpu_data_handle_t handle)
 size_t starpu_matrix_get_elemsize(starpu_data_handle_t handle)
 {
 	struct starpu_matrix_interface *matrix_interface = (struct starpu_matrix_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return matrix_interface->elemsize;
 }

+ 3 - 3
src/datawizard/interfaces/multiformat_interface.c

@@ -242,7 +242,7 @@ static void display_multiformat_interface(starpu_data_handle_t handle, FILE *f)
 {
 	struct starpu_multiformat_interface *multiformat_interface;
 	multiformat_interface = (struct starpu_multiformat_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	fprintf(f, "%u\t", multiformat_interface->nx);
 }
@@ -252,7 +252,7 @@ static size_t multiformat_interface_get_size(starpu_data_handle_t handle)
 {
 	size_t size;
 	struct starpu_multiformat_interface *multiformat_interface;
-	multiformat_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, 0);
+	multiformat_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 	size = multiformat_interface->nx * multiformat_interface->ops->cpu_elemsize;
 	return size;
 }
@@ -260,7 +260,7 @@ static size_t multiformat_interface_get_size(starpu_data_handle_t handle)
 uint32_t starpu_multiformat_get_nx(starpu_data_handle_t handle)
 {
 	struct starpu_multiformat_interface *multiformat_interface;
-	multiformat_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, 0);
+	multiformat_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 	return multiformat_interface->nx;
 }
 

+ 3 - 3
src/datawizard/interfaces/variable_interface.c

@@ -135,7 +135,7 @@ static int variable_compare(void *data_interface_a, void *data_interface_b)
 static void display_variable_interface(starpu_data_handle_t handle, FILE *f)
 {
 	struct starpu_variable_interface *variable_interface = (struct starpu_variable_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	fprintf(f, "%ld\t", (long)variable_interface->elemsize);
 }
@@ -174,7 +174,7 @@ static int unpack_variable_handle(starpu_data_handle_t handle, unsigned node, vo
 static size_t variable_interface_get_size(starpu_data_handle_t handle)
 {
 	struct starpu_variable_interface *variable_interface = (struct starpu_variable_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return variable_interface->elemsize;
 }
@@ -191,7 +191,7 @@ uintptr_t starpu_variable_get_local_ptr(starpu_data_handle_t handle)
 
 size_t starpu_variable_get_elemsize(starpu_data_handle_t handle)
 {
-	return STARPU_VARIABLE_GET_ELEMSIZE(starpu_data_get_interface_on_node(handle, 0));
+	return STARPU_VARIABLE_GET_ELEMSIZE(starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM));
 }
 
 /* memory allocation/deallocation primitives for the variable interface */

+ 4 - 4
src/datawizard/interfaces/vector_interface.c

@@ -141,7 +141,7 @@ static int vector_compare(void *data_interface_a, void *data_interface_b)
 static void display_vector_interface(starpu_data_handle_t handle, FILE *f)
 {
 	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	fprintf(f, "%u\t", vector_interface->nx);
 }
@@ -181,7 +181,7 @@ static size_t vector_interface_get_size(starpu_data_handle_t handle)
 {
 	size_t size;
 	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	size = vector_interface->nx*vector_interface->elemsize;
 
@@ -192,7 +192,7 @@ static size_t vector_interface_get_size(starpu_data_handle_t handle)
 uint32_t starpu_vector_get_nx(starpu_data_handle_t handle)
 {
 	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return vector_interface->nx;
 }
@@ -213,7 +213,7 @@ uintptr_t starpu_vector_get_local_ptr(starpu_data_handle_t handle)
 size_t starpu_vector_get_elemsize(starpu_data_handle_t handle)
 {
 	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return vector_interface->elemsize;
 }

+ 6 - 2
src/datawizard/malloc.c

@@ -141,6 +141,7 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 
 			malloc_pinned_cl.where = STARPU_CUDA;
 			struct starpu_task *task = starpu_task_create();
+			task->name = "cuda_malloc_pinned";
 			task->callback_func = NULL;
 			task->cl = &malloc_pinned_cl;
 			task->cl_arg = &s;
@@ -171,6 +172,7 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 //
 //			malloc_pinned_cl.where = STARPU_OPENCL;
 //			struct starpu_task *task = starpu_task_create();
+//		        task->name = "opencl_malloc_pinned";
 //			task->callback_func = NULL;
 //			task->cl = &malloc_pinned_cl;
 //			task->cl_arg = &s;
@@ -295,6 +297,7 @@ int starpu_free_flags(void *A, size_t dim, int flags)
 
 				free_pinned_cl.where = STARPU_CUDA;
 				struct starpu_task *task = starpu_task_create();
+				task->name = "cuda_free_pinned";
 				task->callback_func = NULL;
 				task->cl = &free_pinned_cl;
 				task->cl_arg = A;
@@ -319,6 +322,7 @@ int starpu_free_flags(void *A, size_t dim, int flags)
 //
 //                free_pinned_cl.where = STARPU_OPENCL;
 //		struct starpu_task *task = starpu_task_create();
+//              task->name = "opencl_free_pinned";
 //		task->callback_func = NULL;
 //		task->cl = &free_pinned_cl;
 //		task->cl_arg = A;
@@ -501,7 +505,7 @@ _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 #ifdef STARPU_SIMGRID
 			STARPU_PTHREAD_MUTEX_LOCK(&cuda_alloc_mutex);
 			/* Sleep for the free */
-			MSG_process_sleep(0.000125);
+			MSG_process_sleep(0.000750);
 			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
 #else
 			cudaError_t err;
@@ -518,7 +522,7 @@ _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 #ifdef STARPU_SIMGRID
 			STARPU_PTHREAD_MUTEX_LOCK(&opencl_alloc_mutex);
 			/* Sleep for the free */
-			MSG_process_sleep(0.000125);
+			MSG_process_sleep(0.000750);
 			STARPU_PTHREAD_MUTEX_UNLOCK(&opencl_alloc_mutex);
 #else
 			cl_int err;

+ 9 - 1
src/datawizard/memalloc.c

@@ -273,7 +273,9 @@ static size_t free_memory_on_node(struct _starpu_mem_chunk *mc, unsigned node)
 		}
 #endif
 
+		_STARPU_TRACE_START_FREE(node, mc->size);
 		mc->ops->free_data_on_node(mc->chunk_interface, node);
+		_STARPU_TRACE_END_FREE(node);
 
 		if (handle)
 			notify_handle_children(handle, replicate, node);
@@ -381,7 +383,9 @@ static size_t try_to_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
 				if (handle->per_node[node].state == STARPU_OWNER)
 					_starpu_memory_handle_stats_invalidated(handle, node);
 #endif
+				_STARPU_TRACE_START_WRITEBACK(node);
 				transfer_subtree_to_node(handle, node, target);
+				_STARPU_TRACE_END_WRITEBACK(node);
 #ifdef STARPU_MEMORY_STATS
 				_starpu_memory_handle_stats_loaded_owner(handle, target);
 #endif
@@ -453,7 +457,9 @@ static unsigned try_to_reuse_mem_chunk(struct _starpu_mem_chunk *mc, unsigned no
 
 			/* in case there was nobody using that buffer, throw it
 			 * away after writing it back to main memory */
+			_STARPU_TRACE_START_WRITEBACK(node);
 			transfer_subtree_to_node(old_data, node, 0);
+			_STARPU_TRACE_END_WRITEBACK(node);
 
 			/* now replace the previous data */
 			reuse_mem_chunk(node, replicate, mc, is_already_in_mc_list);
@@ -670,7 +676,7 @@ size_t _starpu_memory_reclaim_generic(unsigned node, unsigned force, size_t recl
 	{
 		static int warned;
 		if (!warned) {
-			_STARPU_DISP("Not enough memory left on node %u. Trying to purge %lu bytes out\n", node, (unsigned long) reclaim);
+			_STARPU_DISP("Not enough memory left on node %u. Trying to purge %lu bytes out. This message will not be printed again for further purges\n", node, (unsigned long) reclaim);
 			warned = 1;
 		}
 	}
@@ -914,7 +920,9 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 	if (replicate->allocated)
 	{
 		/* Argl, somebody allocated it in between already, drop this one */
+		_STARPU_TRACE_START_FREE(dst_node, data_size);
 		handle->ops->free_data_on_node(data_interface, dst_node);
+		_STARPU_TRACE_END_FREE(dst_node);
 		allocated_memory = 0;
 	}
 	else

+ 3 - 0
src/datawizard/reduction.c

@@ -233,6 +233,7 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 					/* Perform the reduction between replicates i
 					 * and i+step and put the result in replicate i */
 					struct starpu_task *redux_task = starpu_task_create();
+					redux_task->name = "redux_task_between_replicates";
 
 					/* Mark these tasks so that StarPU does not block them
 					 * when they try to access the handle (normal tasks are
@@ -294,6 +295,7 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 		if (empty)
 		{
 			struct starpu_task *redux_task = starpu_task_create();
+			redux_task->name = "redux_task_empty";
 
 			/* Mark these tasks so that StarPU does not block them
 			 * when they try to access the handle (normal tasks are
@@ -320,6 +322,7 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 		for (replicate = 0; replicate < replicate_count; replicate++)
 		{
 			struct starpu_task *redux_task = starpu_task_create();
+			redux_task->name = "redux_task_reduction";
 
 			/* Mark these tasks so that StarPU does not block them
 			 * when they try to access the handle (normal tasks are

+ 6 - 16
src/datawizard/user_interactions.c

@@ -143,20 +143,15 @@ int starpu_data_acquire_on_node_cb_sequential_consistency(starpu_data_handle_t h
 	{
 		struct starpu_task *new_task;
 		wrapper->pre_sync_task = starpu_task_create();
+		wrapper->pre_sync_task->name = "acquire_cb_pre";
 		wrapper->pre_sync_task->detach = 1;
 		wrapper->pre_sync_task->callback_func = starpu_data_acquire_cb_pre_sync_callback;
 		wrapper->pre_sync_task->callback_arg = wrapper;
 
 		wrapper->post_sync_task = starpu_task_create();
+		wrapper->post_sync_task->name = "acquire_cb_post";
 		wrapper->post_sync_task->detach = 1;
 
-#ifdef STARPU_USE_FXT
-                struct _starpu_job *job = _starpu_get_job_associated_to_task(wrapper->pre_sync_task);
-                job->model_name = "acquire_cb_pre";
-                job = _starpu_get_job_associated_to_task(wrapper->post_sync_task);
-                job->model_name = "acquire_cb_post";
-#endif
-
 		new_task = _starpu_detect_implicit_data_deps_with_handle(wrapper->pre_sync_task, wrapper->post_sync_task, handle, mode);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 
@@ -197,7 +192,7 @@ int starpu_data_acquire_cb(starpu_data_handle_t handle,
 int starpu_data_acquire_cb_sequential_consistency(starpu_data_handle_t handle,
 						  enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency)
 {
-	return starpu_data_acquire_on_node_cb_sequential_consistency(handle, 0, mode, callback, arg, sequential_consistency);
+	return starpu_data_acquire_on_node_cb_sequential_consistency(handle, STARPU_MAIN_RAM, mode, callback, arg, sequential_consistency);
 }
 
 /*
@@ -265,18 +260,13 @@ int starpu_data_acquire_on_node(starpu_data_handle_t handle, unsigned node, enum
 	{
 		struct starpu_task *new_task;
 		wrapper.pre_sync_task = starpu_task_create();
+		wrapper.pre_sync_task->name = "acquire_pre";
 		wrapper.pre_sync_task->detach = 0;
 
 		wrapper.post_sync_task = starpu_task_create();
+		wrapper.post_sync_task->name = "acquire_post";
 		wrapper.post_sync_task->detach = 1;
 
-#ifdef STARPU_USE_FXT
-                struct _starpu_job *job = _starpu_get_job_associated_to_task(wrapper.pre_sync_task);
-                job->model_name = "acquire_pre";
-                job = _starpu_get_job_associated_to_task(wrapper.post_sync_task);
-                job->model_name = "acquire_post";
-#endif
-
 		new_task = _starpu_detect_implicit_data_deps_with_handle(wrapper.pre_sync_task, wrapper.post_sync_task, handle, mode);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 		if (new_task)
@@ -327,7 +317,7 @@ int starpu_data_acquire_on_node(starpu_data_handle_t handle, unsigned node, enum
 
 int starpu_data_acquire(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 {
-	return starpu_data_acquire_on_node(handle, 0, mode);
+	return starpu_data_acquire_on_node(handle, STARPU_MAIN_RAM, mode);
 }
 
 /* This function must be called after starpu_data_acquire so that the

+ 65 - 14
src/debug/traces/starpu_fxt.c

@@ -101,7 +101,8 @@ static unsigned get_colour_symbol_blue(char *name)
 }
 
 static double last_codelet_start[STARPU_NMAXWORKERS];
-static char last_codelet_symbol[STARPU_NMAXWORKERS][128];
+/* _STARPU_FUT_DO_PROBE4STR records only 4 longs */
+static char last_codelet_symbol[STARPU_NMAXWORKERS][4*sizeof(unsigned long)];
 
 /* If more than a period of time has elapsed, we flush the profiling info,
  * otherwise they are accumulated everytime there is a new relevant event. */
@@ -109,6 +110,7 @@ static char last_codelet_symbol[STARPU_NMAXWORKERS][128];
 static double last_activity_flush_timestamp[STARPU_NMAXWORKERS];
 static double accumulated_sleep_time[STARPU_NMAXWORKERS];
 static double accumulated_exec_time[STARPU_NMAXWORKERS];
+static double reclaiming[STARPU_MAXNODES];
 
 LIST_TYPE(_starpu_symbol_name,
 	char *name;
@@ -514,9 +516,9 @@ static void create_paje_state_if_not_found(char *name, struct starpu_fxt_options
 	{
 #ifdef STARPU_HAVE_POTI
 		create_paje_state_color(name, "S", red, green, blue);
-		create_paje_state_color(name, "Ctx1", 255.0, 255.0, 0.0);
+		create_paje_state_color(name, "Ctx1", 255.0, 102.0, 255.0);
 		create_paje_state_color(name, "Ctx2", .0, 255.0, 0.0);
-		create_paje_state_color(name, "Ctx3", 75.0, .0, 130.0);
+		create_paje_state_color(name, "Ctx3", 255.0, 255.0, .0);
 		create_paje_state_color(name, "Ctx4", .0, 245.0, 255.0);
 		create_paje_state_color(name, "Ctx5", .0, .0, .0);
 		create_paje_state_color(name, "Ctx6", .0, .0, 128.0);
@@ -526,9 +528,9 @@ static void create_paje_state_if_not_found(char *name, struct starpu_fxt_options
 		create_paje_state_color(name, "Ctx10", 154.0, 205.0, 50.0);
 #else
 		fprintf(out_paje_file, "6	%s	S	%s	\"%f %f %f\" \n", name, name, red, green, blue);
-		fprintf(out_paje_file, "6	%s	Ctx1	%s	\"255.0 255.0 0.0\" \n", name, name);
+		fprintf(out_paje_file, "6	%s	Ctx1	%s	\"255.0 102.0 255.0\" \n", name, name);
 		fprintf(out_paje_file, "6	%s	Ctx2	%s	\".0 255.0 .0\" \n", name, name);
-		fprintf(out_paje_file, "6	%s	Ctx3	%s	\"75.0 .0 130.0\" \n", name, name);
+		fprintf(out_paje_file, "6	%s	Ctx3	%s	\"225.0 225.0 .0\" \n", name, name);
 		fprintf(out_paje_file, "6	%s	Ctx4	%s	\".0 245.0 255.0\" \n", name, name);
 		fprintf(out_paje_file, "6	%s	Ctx5	%s	\".0 .0 .0\" \n", name, name);
 		fprintf(out_paje_file, "6	%s	Ctx6	%s	\".0 .0 128.0\" \n", name, name);
@@ -587,7 +589,7 @@ static struct starpu_fxt_codelet_event *dumped_codelets;
 static void handle_end_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
 	int worker;
-	worker = find_worker_id(ev->param[4]);
+	worker = find_worker_id(ev->param[6]);
 	if (worker < 0) return;
 
 	char *prefix = options->file_prefix;
@@ -598,7 +600,7 @@ static void handle_end_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 	uint32_t codelet_hash = ev->param[2];
 
 	if (out_paje_file)
-		worker_set_state(end_codelet_time, prefix, ev->param[4], "B");
+		worker_set_state(end_codelet_time, prefix, ev->param[6], "B");
 
 	double codelet_length = (end_codelet_time - last_codelet_start[worker]);
 
@@ -610,14 +612,14 @@ static void handle_end_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 
 	if (options->dumped_codelets)
 	{
-		struct starpu_perfmodel_arch* arch = ev->param[3];
-
 		dumped_codelets_count++;
 		dumped_codelets = realloc(dumped_codelets, dumped_codelets_count*sizeof(struct starpu_fxt_codelet_event));
 
 		snprintf(dumped_codelets[dumped_codelets_count - 1].symbol, 256, "%s", last_codelet_symbol[worker]);
 		dumped_codelets[dumped_codelets_count - 1].workerid = worker;
-		dumped_codelets[dumped_codelets_count - 1].arch = *arch;
+		dumped_codelets[dumped_codelets_count - 1].arch.type = ev->param[3];
+		dumped_codelets[dumped_codelets_count - 1].arch.devid = ev->param[4];
+		dumped_codelets[dumped_codelets_count - 1].arch.ncore = ev->param[5];
 
 		dumped_codelets[dumped_codelets_count - 1].size = codelet_size;
 		dumped_codelets[dumped_codelets_count - 1].hash = codelet_hash;
@@ -1466,15 +1468,64 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 				handle_memnode_event(&ev, options, "Ar");
 				break;
 
+			case _STARPU_FUT_END_ALLOC:
+			case _STARPU_FUT_END_ALLOC_REUSE:
+				if (!options->no_bus)
+				handle_memnode_event(&ev, options, "No");
+				break;
+
+			case _STARPU_FUT_START_FREE:
+				if (!options->no_bus)
+				{
+					handle_memnode_event(&ev, options, "F");
+				}
+				break;
+
+			case _STARPU_FUT_END_FREE:
+				if (!options->no_bus)
+				{
+					unsigned memnode = ev.param[0];
+					if (reclaiming[memnode])
+						handle_memnode_event(&ev, options, "R");
+					else
+						handle_memnode_event(&ev, options, "No");
+				}
+				break;
+
+			case _STARPU_FUT_START_WRITEBACK:
+				if (!options->no_bus)
+				{
+					handle_memnode_event(&ev, options, "W");
+				}
+				break;
+
+			case _STARPU_FUT_END_WRITEBACK:
+				if (!options->no_bus)
+				{
+					unsigned memnode = ev.param[0];
+					if (reclaiming[memnode])
+						handle_memnode_event(&ev, options, "R");
+					else
+						handle_memnode_event(&ev, options, "No");
+				}
+				break;
+
 			case _STARPU_FUT_START_MEMRECLAIM:
-				handle_memnode_event(&ev, options, "R");
+				if (!options->no_bus)
+				{
+					unsigned memnode = ev.param[0];
+					reclaiming[memnode] = 1;
+					handle_memnode_event(&ev, options, "R");
+				}
 				break;
 
-			case _STARPU_FUT_END_ALLOC:
-			case _STARPU_FUT_END_ALLOC_REUSE:
 			case _STARPU_FUT_END_MEMRECLAIM:
 				if (!options->no_bus)
-				handle_memnode_event(&ev, options, "No");
+				{
+					unsigned memnode = ev.param[0];
+					reclaiming[memnode] = 0;
+					handle_memnode_event(&ev, options, "No");
+				}
 				break;
 
 			case _STARPU_FUT_USER_EVENT:

+ 6 - 2
src/debug/traces/starpu_paje.c

@@ -147,7 +147,9 @@ void _starpu_fxt_write_paje_header(FILE *file)
 	poti_DefineStateType("MS", "Mm", "Memory Node State");
 	poti_DefineEntityValue("A", "MS", "Allocating", ".4 .1 .0");
 	poti_DefineEntityValue("Ar", "MS", "AllocatingReuse", ".1 .1 .8");
-	poti_DefineEntityValue("R", "MS", "Reclaiming", ".0 .1 .4");
+	poti_DefineEntityValue("F", "MS", "Freeing", ".6 .3 .0");
+	poti_DefineEntityValue("W", "MS", "WritingBack", ".0 .0 .4");
+	poti_DefineEntityValue("R", "MS", "Reclaiming", ".0 .1 .6");
 	poti_DefineEntityValue("Co", "MS", "DriverCopy", ".3 .5 .1");
 	poti_DefineEntityValue("CoA", "MS", "DriverCopyAsync", ".1 .3 .1");
 	poti_DefineEntityValue("No", "MS", "Nothing", ".0 .0 .0");
@@ -256,7 +258,9 @@ void _starpu_fxt_write_paje_header(FILE *file)
 	fprintf(file, "\
 6       A       MS      Allocating         \".4 .1 .0\"		\n\
 6       Ar       MS      AllocatingReuse       \".1 .1 .8\"		\n\
-6       R       MS      Reclaiming         \".0 .1 .4\"		\n\
+6       F       MS      Freeing         \".6 .3 .0\"		\n\
+6       W       MS      WritingBack         \".0 .0 .4\"		\n\
+6       R       MS      Reclaiming         \".0 .1 .6\"		\n\
 6       Co       MS     DriverCopy         \".3 .5 .1\"		\n\
 6       CoA      MS     DriverCopyAsync         \".1 .3 .1\"		\n\
 6       No       MS     Nothing         \".0 .0 .0\"		\n\

+ 1 - 0
src/sched_policies/fifo_queues.c

@@ -49,6 +49,7 @@ struct _starpu_fifo_taskq *_starpu_create_fifo(void)
 	/* note that not all mechanisms (eg. the semaphore) have to be used */
 	starpu_task_list_init(&fifo->taskq);
 	fifo->ntasks = 0;
+	STARPU_HG_DISABLE_CHECKING(fifo->ntasks);
 	fifo->nprocessed = 0;
 
 	fifo->exp_start = starpu_timing_now();

+ 2 - 8
src/util/execute_on_all.c

@@ -64,6 +64,7 @@ void starpu_execute_on_specific_workers(void (*func)(void*), void * arg, unsigne
 	{
 		unsigned worker = workers[w];
 		tasks[w] = starpu_task_create();
+		tasks[w]->name = name;
 
 		tasks[w]->cl = &wrapper_cl;
 		tasks[w]->cl_arg = &args;
@@ -74,10 +75,6 @@ void starpu_execute_on_specific_workers(void (*func)(void*), void * arg, unsigne
 		tasks[w]->detach = 0;
 		tasks[w]->destroy = 0;
 
-#ifdef STARPU_USE_FXT
-		_starpu_get_job_associated_to_task(tasks[w])->model_name = name;
-#endif
-
 		_starpu_exclude_task_from_dag(tasks[w]);
 
 		ret = starpu_task_submit(tasks[w]);
@@ -134,6 +131,7 @@ void starpu_execute_on_each_worker_ex(void (*func)(void *), void *arg, uint32_t
 	for (worker = 0; worker < nworkers; worker++)
 	{
 		tasks[worker] = starpu_task_create();
+		tasks[worker]->name = wrapper_cl.name;
 
 		tasks[worker]->cl = &wrapper_cl;
 		tasks[worker]->cl_arg = &args;
@@ -144,10 +142,6 @@ void starpu_execute_on_each_worker_ex(void (*func)(void *), void *arg, uint32_t
 		tasks[worker]->detach = 0;
 		tasks[worker]->destroy = 0;
 
-#ifdef STARPU_USE_FXT
-		_starpu_get_job_associated_to_task(tasks[worker])->model_name = wrapper_cl.name;
-#endif
-
 		_starpu_exclude_task_from_dag(tasks[worker]);
 
 		ret = _starpu_task_submit_internally(tasks[worker]);

+ 4 - 4
src/util/misc.c

@@ -38,11 +38,11 @@ const char *_starpu_job_get_model_name(struct _starpu_job *j)
 
 	struct starpu_task *task = j->task;
 	if (task)
+	{
 		ret = _starpu_codelet_get_model_name(task->cl);
+		if (!ret)
+			ret = task->name;
+	}
 
-#ifdef STARPU_USE_FXT
-	if (!ret)
-		ret = j->model_name;
-#endif
 	return ret;
 }

+ 2 - 1
src/util/starpu_create_sync_task.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -30,6 +30,7 @@ void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t
 
 	/* We create an empty task */
 	struct starpu_task *sync_task = starpu_task_create();
+	sync_task->name = "create_sync_task";
 
 	sync_task->use_tag = 1;
 	sync_task->tag_id = sync_tag;

+ 1 - 0
src/util/starpu_data_cpy.c

@@ -141,6 +141,7 @@ int _starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_h
 
 	struct starpu_task *task = starpu_task_create();
 	STARPU_ASSERT(task);
+	task->name = "data_cpy";
 
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 	if (reduction)

+ 2 - 0
src/util/starpu_task_insert.c

@@ -86,6 +86,7 @@ int _starpu_task_insert_v(struct starpu_codelet *cl, va_list varg_list)
 	}
 
 	struct starpu_task *task = starpu_task_create();
+	task->name = "task_insert";
 	task->cl_arg_free = 1;
 
 	if (cl && cl->nbuffers > STARPU_NMAXBUFS)
@@ -146,6 +147,7 @@ struct starpu_task *starpu_task_build(struct starpu_codelet *cl, ...)
 	}
 
 	struct starpu_task *task = starpu_task_create();
+	task->name = "task_build";
 
 	if (cl && cl->nbuffers > STARPU_NMAXBUFS)
 	{

+ 1 - 0
tests/Makefile.am

@@ -222,6 +222,7 @@ noinst_PROGRAMS =				\
 	parallel_tasks/parallel_kernels		\
 	parallel_tasks/parallel_kernels_spmd	\
 	parallel_tasks/spmd_peager		\
+	parallel_tasks/cuda_only		\
 	perfmodels/regression_based		\
 	perfmodels/non_linear_regression_based	\
 	perfmodels/feed				\

+ 3 - 11
tests/main/driver_api/run_driver.c

@@ -32,11 +32,6 @@
  * - STARPU_TEST_SKIPPED (non-critical errors)
  */
 
-/* See FIXME in src/core/sched_ctx.c about starpu_drivers_request_termination.
- * This test should really use non-synchronous tasks, to properly cover all
- * needed cases. */
-#define FIXME 0
-
 #if defined(STARPU_USE_CPU) || defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 static void
 dummy(void *buffers[], void *args)
@@ -100,9 +95,8 @@ test_cpu(void)
 	cl.where = STARPU_CPU;
 	task->cl = &cl;
 	task->cl_arg = &var;
-#if !FIXME
 	task->synchronous = 1;
-#endif
+
 	ret = starpu_task_submit(task);
 	if (ret == -ENODEV)
 	{
@@ -159,9 +153,8 @@ test_cuda(void)
 	cl.where = STARPU_CUDA;
 	task->cl = &cl;
 	task->cl_arg = &var;
-#if !FIXME
 	task->synchronous = 1;
-#endif
+
 	ret = starpu_task_submit(task);
 	if (ret == -ENODEV)
 	{
@@ -244,9 +237,8 @@ test_opencl(void)
 	cl.where = STARPU_OPENCL;
 	task->cl = &cl;
 	task->cl_arg = &var;
-#if !FIXME
 	task->synchronous = 1;
-#endif
+
 	ret = starpu_task_submit(task);
 	if (ret == -ENODEV)
 	{

+ 1 - 1
tests/microbenchs/prefetch_data_on_node.c

@@ -113,7 +113,7 @@ int main(int argc, char **argv)
 		{
 			/* synchronous prefetch */
 			unsigned node = starpu_worker_get_memory_node(worker);
-			ret = starpu_data_prefetch_on_node(v_handle, node, STARPU_MAIN_RAM);
+			ret = starpu_data_prefetch_on_node(v_handle, node, 0);
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_prefetch_on_node");
 
 			/* execute a task */

+ 110 - 0
tests/parallel_tasks/cuda_only.c

@@ -0,0 +1,110 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2013  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <config.h>
+#include <starpu.h>
+#include <limits.h>
+#include <unistd.h>
+#include "../helper.h"
+
+void codelet_null(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+}
+
+struct starpu_perfmodel model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "test"
+};
+
+static struct starpu_codelet cl =
+{
+	.cuda_funcs = {codelet_null, NULL},
+	.model = &model,
+	.nbuffers = 1,
+	.modes = {STARPU_R}
+};
+
+struct starpu_perfmodel model2 =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "test2"
+};
+
+static struct starpu_codelet cl2 =
+{
+	.cuda_funcs = {codelet_null, NULL},
+	.model = &model2,
+	.nbuffers = 1,
+	.modes = {STARPU_W}
+};
+
+
+int main(int argc, char **argv)
+{
+	int ret;
+	starpu_data_handle_t handle;
+	unsigned data;
+
+        struct starpu_conf conf;
+	starpu_conf_init(&conf);
+	conf.sched_policy_name = "pheft";
+
+	ret = starpu_init(&conf);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_variable_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)&data, sizeof(data));
+
+	unsigned iter;
+	struct starpu_task *task;
+	for (iter = 0; iter < 100; iter++)
+	{
+		task = starpu_task_create();
+		task->cl = &cl;
+		task->handles[0] = handle;
+
+		ret = starpu_task_submit(task);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+		task = starpu_task_create();
+		task->cl = &cl2;
+		task->handles[0] = handle;
+
+		ret = starpu_task_submit(task);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+
+	starpu_data_unregister(handle);
+	starpu_shutdown();
+
+	STARPU_RETURN(EXIT_SUCCESS);
+
+enodev:
+	task->destroy = 0;
+	starpu_task_destroy(task);
+	starpu_data_unregister(handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	STARPU_RETURN(STARPU_TEST_SKIPPED);
+}

+ 9 - 8
tests/perfmodels/value_nan.c

@@ -31,21 +31,22 @@ int _check_number(double val, int checknan)
 {
 	char *tmp = "starpu_XXXXXX";
 	char filename[100];
-	int id;
 
 	strcpy(filename, tmp);
+
 #ifdef STARPU_HAVE_WINDOWS
         _mktemp(filename);
-        id = open(filename, _O_RDWR);
 #else
-	id = mkstemp(filename);
-
-#endif
-	/* fail */
-	if (id < 0)
 	{
-		return 1;
+	     int id = mkstemp(filename);
+	     /* fail */
+	     if (id < 0)
+	     {
+		  FPRINTF(stderr, "Error when creating temp file\n");
+		  return 1;
+	     }
 	}
+#endif
 
 	/* write the double value in the file followed by a predefined string */
 	FILE *f = fopen(filename, "w");

+ 14 - 6
tools/gdbinit

@@ -30,10 +30,8 @@ define starpu-print-job
     printf "\tsubmitted:\t\t\t<%d>\n", $job->submitted
     printf "\tterminated:\t\t\t<%d>\n", $job->terminated
     printf "\tjob_id:\t\t\t\t<%d>\n", $job->job_id
-    if _starpu_use_fxt == 1
-      if $job->model_name
-        printf "\tmodel_name:\t\t\t<%s>\n", $job->model_name
-      end
+    if $job->task
+        printf "\tname:\t\t\t\t<%s>\n", $job->task->name
     end
   end
 end
@@ -73,6 +71,7 @@ define starpu-print-task
   end
 
   printf "StarPU Task (%p)\n", $task
+  printf "\tname:\t\t\t\t<%s>\n", $task->name
   printf "\tcodelet:\t\t\t<%p>\n", $task->cl
   printf "\tcallback:\t\t\t<%p>\n", $task->callback_func
   printf "\tsynchronous:\t\t\t<%d>\n", $task->synchronous
@@ -137,8 +136,17 @@ define starpu-tags
 end
 
 define starpu-tasks
-  printf "%d submitted tasks\n", nsubmitted
-  printf "%d ready tasks\n", nready
+  set language c
+  set $num=0
+  set $nsubmitted=0
+  set $nready=0
+  while $num<config->topology->nsched_ctxs
+  	set $nsubmitted = $nsubmitted + config->sched_ctxs[$num]->tasks_barrier->barrier->reached_start 
+  	set $nready = $nready + config->sched_ctxs[$num]->ready_tasks_barrier->barrier->reached_start
+	set $num = $num + 1
+  end
+  printf "%d submitted tasks\n", $nsubmitted
+  printf "%d ready tasks\n", $nready
   printf "Tasks being run:\n"
   set $n = 0
   while $n < config.topology.nworkers

+ 0 - 75
tools/model.sh

@@ -1,75 +0,0 @@
-#!/bin/bash
-
-# StarPU --- Runtime system for heterogeneous multicore architectures.
-# 
-# Copyright (C) 2008, 2009, 2010  Université de Bordeaux 1
-# Copyright (C) 2010  Centre National de la Recherche Scientifique
-# 
-# StarPU is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Lesser General Public License as published by
-# the Free Software Foundation; either version 2.1 of the License, or (at
-# your option) any later version.
-# 
-# StarPU is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-# 
-# See the GNU Lesser General Public License in COPYING.LGPL for more details.
-
-
-trace_model()
-{
-	inputfile=$1
-	
-	cpuentries=`head -1 $inputfile`
-	gpuentries=`head -2 $inputfile|tail -1`
-	
-	cpumodel=`head -3 $inputfile|tail -1`
-	gpumodel=`head -4 $inputfile|tail -1`
-	
-	a_cpu=`cut -f 1 $inputfile| head -5|tail -1`
-	b_cpu=`cut -f 2 $inputfile| head -5|tail -1`
-	c_cpu=`cut -f 3 $inputfile| head -5|tail -1`
-	
-	a_gpu=`cut -f 1 $inputfile| head -6|tail -1`
-	b_gpu=`cut -f 2 $inputfile| head -6|tail -1`
-	c_gpu=`cut -f 3 $inputfile| head -6|tail -1`
-
-	alpha_cpu=`cut -f 5 $inputfile| head -3|tail -1` 
-	alpha_gpu=`cut -f 5 $inputfile| head -4|tail -1` 
-	
-	beta_cpu=`cut -f 6 $inputfile| head -3|tail -1` 
-	beta_gpu=`cut -f 6 $inputfile| head -4|tail -1` 
-	
-	tail -$(($gpuentries + $cpuentries)) $inputfile | head -$(($cpuentries)) |cut -f 2-4 > $inputfile.cpu
-	tail -$(($gpuentries)) $inputfile | cut -f 2-4> $inputfile.gpu
-	
-	echo "pouet $cpuentries gpu $gpuentries toot"
-	
-	echo "cpumodel $alpha_cpu * size ^ $beta_cpu"
-	echo "gpumodel $alpha_gpu * size ^ $beta_gpu"
-	
-	gpfile=$inputfile.gp
-	
-	echo "#!/usr/bin/gnuplot -persist" 		> $gpfile
-	echo "set term postscript eps enhanced color" 	>> $gpfile
-	echo "set logscale x"				>> $gpfile 
-	echo "set logscale y"				>> $gpfile 
-	echo "set key left top"				>> $gpfile 
-	echo "set title \"$inputfile\""			>> $gpfile 
-	echo "set output \"$inputfile.eps\""		>> $gpfile
-	
-	echo  "plot	$alpha_gpu*x**$beta_gpu title \"GPU regression\" ,\\" >> $gpfile
-	echo  "	\"$inputfile.gpu\" with errorbar title \"GPU measured\" ,\\" >> $gpfile
-	echo  "	$c_gpu + exp(log($a_gpu) + $b_gpu * log(x) ) title \"GPU regression (non linear)\" ,\\" >> $gpfile
-	echo  "	\"$inputfile.cpu\" with errorbar title \"CPU measured\" ,\\" >> $gpfile
-	echo  "	$alpha_cpu*x**$beta_cpu title \"CPU regression\" ,\\" >> $gpfile
-	echo  "	$c_cpu + exp(log($a_cpu) + $b_cpu * log(x) ) title \"CPU regression (non linear)\"" >> $gpfile
-	
-	gnuplot $gpfile
-}
-
-for file in $@
-do
-	trace_model "$file"
-done

+ 1 - 1
tools/starpu_codelet_histo_profile.in

@@ -64,7 +64,7 @@ size <- unique(mytable[,3])
 
 pdf(paste("$inputfile", codelet, arch, hash, size, "pdf", sep="."));
 
-h <- hist(val[val > quantile(val,0.01) & val<quantile(val,0.99)], col="red", breaks=50, density=10)
+try ( { h <- hist(val[val > quantile(val,0.01) & val<quantile(val,0.99)], col="red", breaks=50, density=10) } )
 
 dev.off()
 

+ 1 - 1
tools/starpu_fxt_stats.c

@@ -136,7 +136,7 @@ int main(int argc, char **argv)
 	else
 	{
 		fd_out = fopen(fout, "w");
-		if (fd_out < 0)
+		if (fd_out == NULL)
 		{
 			perror("open failed :");
 			exit(-1);

+ 2 - 2
tools/starpu_perfmodel_display.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011, 2013  Université de Bordeaux 1
  * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
@@ -158,7 +158,7 @@ int main(int argc, char **argv)
 	}
 	else
 	{
-		struct starpu_perfmodel model;
+		struct starpu_perfmodel model = {};
                 int ret = starpu_perfmodel_load_symbol(psymbol, &model);
                 if (ret == 1)
 		{

+ 5 - 4
tools/starpu_perfmodel_plot.c

@@ -450,15 +450,16 @@ static void init_archtype_is_found(struct starpu_perfmodel *model)
 	for(archtype = 0; archtype < STARPU_NARCH; archtype++)
 	{
 	
-		for(devid=0; model->per_arch[archtype][devid] != NULL; devid++);
+		for(devid=0; model->per_arch[archtype][devid] != NULL; devid++)
+			;
 		ndevice = devid;
 		if(ndevice != 0)
 		{
 			maxncore = malloc(sizeof(*maxncore)*ndevice);
-			for(devid=0; devid < ndevice; devid++);
+			for(devid=0; devid < ndevice; devid++)
 			{
-			
-				for(ncore=0; model->per_arch[archtype][devid][ncore] != NULL; ncore++);
+				for(ncore=0; model->per_arch[archtype][devid][ncore] != NULL; ncore++)
+					;
 				maxncore[devid] = ncore;
 			}
 		}

+ 0 - 60
tools/valgrind/fscanf.suppr

@@ -1,60 +0,0 @@
-{
-   fscanf error
-   Memcheck:Cond
-   fun:__GI___strncasecmp_l
-   fun:____strtod_l_internal
-   fun:_IO_vfscanf
-}
-
-{
-   fprintf cond error
-   Memcheck:Cond
-   ...
-   fun:vfprintf
-   fun:buffered_vfprintf
-   fun:vfprintf
-   fun:fprintf
-}
-
-{
-   fprintf value error
-   Memcheck:Value8
-   ...
-   fun:vfprintf
-   fun:buffered_vfprintf
-   fun:vfprintf
-   fun:fprintf
-}
-
-{
-   fprintf addr4 error
-   Memcheck:Addr4
-   ...
-   fun:vfprintf
-   fun:buffered_vfprintf
-   fun:vfprintf
-   fun:fprintf
-   fun:main
-}
-
-{
-   fprintf addr8 error
-   Memcheck:Addr8
-   ...
-   fun:vfprintf
-   fun:buffered_vfprintf
-   fun:vfprintf
-   fun:fprintf
-   fun:main
-}
-
-{
-   fprintf addr1 error
-   Memcheck:Addr1
-   ...
-   fun:vfprintf
-   fun:buffered_vfprintf
-   fun:vfprintf
-   fun:fprintf
-   fun:main
-}

+ 8 - 0
tools/valgrind/pthread.suppr

@@ -13,3 +13,11 @@
    fun:pthread_cancel_init
    ...
 }
+
+{
+   Helgrind 3.9 wrongly compares to-be-destroyed mutex with init value
+   Helgrind:Race
+   fun:my_memcmp
+   fun:pthread_mutex_destroy
+   ...
+}