Browse Source

merge trunk

Samuel Thibault 11 years ago
parent
commit
9de7c76db8
91 changed files with 2177 additions and 743 deletions
  1. 19 1
      ChangeLog
  2. 22 4
      configure.ac
  3. 2 0
      doc/doxygen/Makefile.am
  4. 9 0
      doc/doxygen/chapters/01building.doxy
  5. 4 4
      doc/doxygen/chapters/02basic_examples.doxy
  6. 1 1
      doc/doxygen/chapters/07data_management.doxy
  7. 4 0
      doc/doxygen/chapters/08scheduling.doxy
  8. 10 0
      doc/doxygen/chapters/13offline_performance_tools.doxy
  9. 5 5
      doc/doxygen/chapters/18mic_scc_support.doxy
  10. 9 0
      doc/doxygen/chapters/40environment_variables.doxy
  11. 63 7
      doc/doxygen/chapters/api/codelet_and_tasks.doxy
  12. 26 0
      doc/doxygen/chapters/api/data_interfaces.doxy
  13. 6 0
      doc/doxygen/chapters/api/data_management.doxy
  14. 0 4
      doc/doxygen/chapters/api/performance_model.doxy
  15. 5 1
      doc/doxygen/chapters/api/workers.doxy
  16. 1 1
      doc/doxygen/doxygen-config.cfg.in
  17. 1 49
      doc/doxygen/refman.tex
  18. 16 2
      doc/tutorial/Makefile
  19. 117 0
      doc/tutorial/vector_scal_task_insert.c
  20. 39 94
      examples/Makefile.am
  21. 0 1
      examples/callback/prologue.c
  22. 10 1
      examples/openmp/vector_scal_omp.c
  23. 5 1
      examples/scheduler/schedulers.sh
  24. 2 0
      include/starpu_config.h.in
  25. 8 2
      include/starpu_data_interfaces.h
  26. 0 2
      include/starpu_perfmodel.h
  27. 2 0
      include/starpu_sched_ctx.h
  28. 25 9
      include/starpu_task.h
  29. 3 1
      include/starpu_thread.h
  30. 6 0
      include/starpu_util.h
  31. 3 0
      include/starpu_worker.h
  32. 13 0
      m4/acinclude.m4
  33. 1 1
      mpi/examples/Makefile.am
  34. 1 1
      mpi/examples/mpi_lu/plu_outofcore_example.c
  35. 2 2
      mpi/examples/mpi_lu/pxlu_implicit.c
  36. 3 2
      mpi/examples/stencil/stencil5.c
  37. 1 1
      mpi/src/starpu_mpi_task_insert.c
  38. 36 8
      src/common/fxt.h
  39. 92 8
      src/common/thread.c
  40. 6 6
      src/core/dependencies/data_concurrency.c
  41. 67 80
      src/core/dependencies/implicit_data_deps.c
  42. 2 2
      src/core/dependencies/implicit_data_deps.h
  43. 25 14
      src/core/jobs.c
  44. 16 6
      src/core/jobs.h
  45. 14 30
      src/core/perfmodel/perfmodel.c
  46. 3 2
      src/core/perfmodel/perfmodel_history.c
  47. 5 5
      src/core/perfmodel/perfmodel_print.c
  48. 32 11
      src/core/sched_ctx.c
  49. 6 4
      src/core/sched_policy.c
  50. 25 33
      src/core/task.c
  51. 11 1
      src/core/workers.c
  52. 7 7
      src/datawizard/coherency.c
  53. 8 1
      src/datawizard/coherency.h
  54. 12 6
      src/datawizard/data_request.c
  55. 3 1
      src/datawizard/filters.c
  56. 2 1
      src/datawizard/footprint.c
  57. 16 3
      src/datawizard/interfaces/bcsr_interface.c
  58. 27 3
      src/datawizard/interfaces/block_interface.c
  59. 12 1
      src/datawizard/interfaces/coo_interface.c
  60. 13 2
      src/datawizard/interfaces/csr_interface.c
  61. 31 3
      src/datawizard/interfaces/data_interface.c
  62. 25 3
      src/datawizard/interfaces/matrix_interface.c
  63. 21 3
      src/datawizard/interfaces/variable_interface.c
  64. 22 3
      src/datawizard/interfaces/vector_interface.c
  65. 9 2
      src/datawizard/interfaces/void_interface.c
  66. 3 1
      src/datawizard/memalloc.c
  67. 1 0
      src/datawizard/memory_nodes.c
  68. 4 3
      src/datawizard/user_interactions.c
  69. 128 51
      src/debug/traces/starpu_fxt.c
  70. 10 1
      src/debug/traces/starpu_paje.c
  71. 2 0
      src/drivers/cpu/driver_cpu.c
  72. 21 2
      src/drivers/cuda/driver_cuda.c
  73. 11 10
      src/drivers/driver_common/driver_common.c
  74. 3 3
      src/drivers/gordon/driver_gordon.c
  75. 1 1
      src/drivers/mp_common/source_common.c
  76. 6 2
      src/drivers/opencl/driver_opencl.c
  77. 2 2
      src/sched_policies/deque_modeling_policy_data_aware.c
  78. 6 6
      src/sched_policies/locality_work_stealing_policy.c
  79. 2 4
      src/sched_policies/work_stealing_policy.c
  80. 10 5
      src/util/starpu_task_insert.c
  81. 17 6
      src/util/starpu_task_insert_utils.c
  82. 1 1
      src/util/starpu_task_insert_utils.h
  83. 2 2
      src/worker_collection/worker_list.c
  84. 14 2
      tests/Makefile.am
  85. 293 0
      tests/datawizard/gpu_ptr_register.c
  86. 230 0
      tests/datawizard/variable_parameters.c
  87. 22 14
      tests/loader.c
  88. 0 178
      tests/main/deprecated_buffer.c
  89. 3 3
      tests/main/restart.c
  90. 284 0
      tools/starpu_paje_summary.Rmd
  91. 109 0
      tools/starpu_paje_summary.in

+ 19 - 1
ChangeLog

@@ -18,7 +18,7 @@ StarPU 1.2.0 (svn revision xxxx)
 ==============================================
 
 New features:
-  * Xeon Phi support
+  * MIC Xeon Phi support
   * SCC support
   * New function starpu_sched_ctx_exec_parallel_code to execute a
     parallel code on the workers of the given scheduler context
@@ -52,6 +52,9 @@ New features:
   * Add CUDA concurrent kernel execution support through
     the STARPU_NWORKER_PER_CUDA environment variable.
   * New locality work stealing scheduler (lws).
+  * Add STARPU_VARIABLE_NBUFFERS to be set in cl.nbuffers, and nbuffers and
+    modes field to the task structure, which permit to define codelets taking a
+    variable number of data.
 
 Small features:
   * Tasks can now have a name (via the field const char *name of
@@ -75,6 +78,9 @@ Small features:
   * Add codelet size, footprint and tag id in the paje trace.
   * Add STARPU_TAG_ONLY, to specify a tag for traces without making StarPU
     manage the tag.
+  * On Linux x86, spinlocks now block after a hundred tries. This avoids
+    typical 10ms pauses when the application thread tries to submit tasks.
+  * New function char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type)
 
 Changes:
   * Data interfaces (variable, vector, matrix and block) now define
@@ -84,11 +90,21 @@ Changes:
     starpu_data_set_tag(), data are received as a raw memory)
   * StarPU-MPI: Fix for being able to receive data with the same tag
     from several nodes (see mpi/tests/gather.c)
+  * Remove the long-deprecated cost_model fields and task->buffers field.
+  * Fix complexity of implicit task/data dependency, from quadratic to linear.
 
 Small changes:
   * Rename function starpu_trace_user_event() as
     starpu_fxt_trace_user_event()
 
+StarPU 1.1.3 (svn revision xxx)
+==============================================
+The scheduling context release
+
+New features:
+  * One can register an existing on-GPU buffer to be used by a handle.
+  * Add the starpu_paje_summary statistics tool.
+
 StarPU 1.1.2 (svn revision xxx)
 ==============================================
 The scheduling context release
@@ -100,6 +116,8 @@ New features:
     scheduler.
   * Add STARPU_CALIBRATE_MINIMUM environment variable to specify the minimum
     number of calibration measurements.
+  * Add STARPU_TRACE_BUFFER_SIZE environment variable to specify the size of
+    the trace buffer.
 
 StarPU 1.1.1 (svn revision 12638)
 ==============================================

+ 22 - 4
configure.ac

@@ -88,11 +88,21 @@ AC_C_RESTRICT
 AC_CHECK_PROGS([BASH], [bash])
 
 # Check whether subversion is installed
+AC_PATH_PROG(svncommand, svn)
 AC_PATH_PROG(svnversioncommand, svnversion)
 
+# find out if we are are in a subversion directory
+svndir=0
+if test "$svncommand" != "" ; then
+   $svncommand info $srcdir >/dev/null 2>&1
+   if test $? -eq 0; then
+      svndir=1
+   fi
+fi
+
 # use svnversion to record the current repository revision only if
 # subversion is installed and we are in a working copy
-if test "$svnversioncommand" = "" || test `LC_ALL=C $svnversioncommand -n $srcdir` = "exported" ; then
+if test "$svnversioncommand" = "" || test "`LC_ALL=C $svnversioncommand -n $srcdir`" = "exported" ; then
    if test -f $srcdir/STARPU-REVISION ; then
       cp $srcdir/STARPU-REVISION .
    else
@@ -978,6 +988,7 @@ if test x$enable_simgrid = xyes ; then
    	if test -n "$SIMGRID_CFLAGS" ; then
 	   	CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
 	   	CXXFLAGS="$SIMGRID_CFLAGS $CXXFLAGS"
+	   	NVCCFLAGS="$SIMGRID_CFLAGS $NVCCFLAGS"
 	fi
 	if test -n "$SIMGRID_LIBS" ; then
 		LDFLAGS="$SIMGRID_LIBS $LDFLAGS"
@@ -985,11 +996,13 @@ if test x$enable_simgrid = xyes ; then
 	if test "$simgrid_dir" != "no" ; then
 	   	CFLAGS="-I$simgrid_dir/include $CFLAGS"
 	   	CXXFLAGS="-I$simgrid_dir/include $CXXFLAGS"
+	   	NVCCFLAGS="-I$simgrid_dir/include $NVCCFLAGS"
 	   	LDFLAGS="-L$simgrid_dir/lib $LDFLAGS"
 	fi
 	if test "$simgrid_include_dir" != "no" ; then
 	   	CFLAGS="-I$simgrid_include_dir $CFLAGS"
 	   	CXXFLAGS="-I$simgrid_include_dir $CXXFLAGS"
+	   	NVCCFLAGS="-I$simgrid_include_dir $NVCCFLAGS"
 	fi
 	if test "$simgrid_lib_dir" != "no" ; then
 	   	LDFLAGS="-L$simgrid_lib_dir $LDFLAGS"
@@ -1758,7 +1771,7 @@ AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
 # If the user specifically asks for it, or if we are in a developer checkout, we enable mpi check
 AC_ARG_ENABLE(mpi-check, AC_HELP_STRING([--enable-mpi-check], [Enable execution of MPI testcases]))
 running_mpi_check=no
-if test -d "$srcdir/.svn" -o -d "$srcdir/.git" ; then
+if test $svndir = 1 -o -d "$srcdir/.git" ; then
     running_mpi_check=yes
 fi
 if test x$enable_mpi_check = xyes ; then
@@ -2108,7 +2121,8 @@ AC_ARG_ENABLE(blas-lib,
  [  --enable-blas-lib[=blaslibname]:
                       none [default]: no BLAS lib is used
                       atlas: use ATLAS library
-                      goto: use GotoBLAS library],
+                      goto: use GotoBLAS library
+		      mkl: use MKL library (you may need to set specific CFLAGS and LDFLAGS with --with-mkl-cflags and --with-mkl-ldflags)],
  [
      if   test "x$enableval" = "xatlas" ; then
         blas_lib=atlas
@@ -2116,6 +2130,8 @@ AC_ARG_ENABLE(blas-lib,
         blas_lib=goto
      elif test "x$enableval" = "xnone" ; then
         blas_lib=none
+     elif test "x$enableval" = "xmkl" ; then
+        blas_lib=mkl
      elif test x$enableval = xno; then
 	blas_lib=none
      else
@@ -2173,7 +2189,7 @@ if test x$blas_lib = xmaybe -o x$blas_lib = xatlas; then
     fi
 fi
 
-if test x$blas_lib = xmaybe; then
+if test x$blas_lib = xmaybe -o x$blas_lib = xmkl; then
     # Should we use MKL ?
     AC_ARG_WITH(mkl-cflags, [AS_HELP_STRING([--with-mkl-cflags], [specify MKL compilation flags])],
 	[
@@ -2452,6 +2468,7 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   chmod +x tools/starpu_workers_activity
   chmod +x tools/starpu_paje_draw_histogram
   chmod +x tools/starpu_paje_state_stats
+  chmod +x tools/starpu_paje_summary
   chmod +x doc/doxygen/doxygen_filter.sh
 ])
 
@@ -2481,6 +2498,7 @@ AC_OUTPUT([
 	tools/starpu_workers_activity
 	tools/starpu_paje_draw_histogram
 	tools/starpu_paje_state_stats
+	tools/starpu_paje_summary
 	socl/Makefile
 	socl/src/Makefile
 	socl/examples/Makefile

+ 2 - 0
doc/doxygen/Makefile.am

@@ -237,6 +237,8 @@ $(DOX_TAG): $(dox_inputs)
 	@if test -f html/navtree.js ; then $(SED) -i 's/\[ "Files", "Files.html", null \]/\[ "", "Files.html", null \]/' html/navtree.js ; fi
 	@$(SED) -i 's/.*"Files.html".*//' html/pages.html
 	@if test -f latex/main.tex ; then mv latex/main.tex latex/index.tex ; fi
+	@$(SED) -i '/\\begin{titlepage}/,$$d' $(DOX_LATEX_DIR)/refman.tex
+	@cat $(top_srcdir)/doc/doxygen/refman.tex >> $(DOX_LATEX_DIR)/refman.tex
 
 dist_pdf_DATA = $(DOX_PDF)
 

+ 9 - 0
doc/doxygen/chapters/01building.doxy

@@ -108,6 +108,15 @@ $ cd build
 $ ../configure
 \endverbatim
 
+By default, StarPU will be installed in <c>/usr/local/bin</c>,
+<c>/usr/local/lib</c>, etc. You can specify an installation prefix
+other than <c>/usr/local</c> using the option <c>--prefix</c>, for
+instance:
+
+\verbatim
+$ ../configure --prefix=$HOME/starpu
+\endverbatim
+
 \subsection BuildingStarPU Building StarPU
 
 \verbatim

+ 4 - 4
doc/doxygen/chapters/02basic_examples.doxy

@@ -156,7 +156,7 @@ int main(int argc, char **argv)
 
 \verbatim
 $ make hello_world
-cc $(pkg-config --cflags starpu-1.2)  $(pkg-config --libs starpu-1.2) hello_world.c -o hello_world
+cc $(pkg-config --cflags starpu-1.2) hello_world.c -o hello_world $(pkg-config --libs starpu-1.2)
 $ ./hello_world
 Hello world
 \endverbatim
@@ -234,7 +234,7 @@ int main(int argc, char **argv)
 
 \verbatim
 $ make hello_world
-cc $(pkg-config --cflags starpu-1.2)  $(pkg-config --libs starpu-1.2) hello_world.c -o hello_world
+cc $(pkg-config --cflags starpu-1.2) hello_world.c -o hello_world $(pkg-config --libs starpu-1.2)
 $ ./hello_world
 Hello world (params = {1, 2.000000} )
 \endverbatim
@@ -285,7 +285,7 @@ int main(int argc, char **argv)
 
 \verbatim
 $ make hello_world
-cc $(pkg-config --cflags starpu-1.2)  $(pkg-config --libs starpu-1.2) hello_world.c -o hello_world
+cc $(pkg-config --cflags starpu-1.2) hello_world.c -o hello_world $(pkg-config --libs starpu-1.2) 
 $ ./hello_world
 Hello world
 Callback function (arg 42)
@@ -606,7 +606,7 @@ pointer.
 
 \verbatim
 $ make vector_scal
-cc $(pkg-config --cflags starpu-1.2)  $(pkg-config --libs starpu-1.2)  vector_scal.c   -o vector_scal
+cc $(pkg-config --cflags starpu-1.2) vector_scal.c -o vector_scal $(pkg-config --libs starpu-1.2)
 $ ./vector_scal
 0.000000 3.000000 6.000000 9.000000 12.000000
 \endverbatim

+ 1 - 1
doc/doxygen/chapters/07data_management.doxy

@@ -120,7 +120,7 @@ starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)vector,
                             NX, sizeof(vector[0]));
 
 /* Partition the vector in PARTS sub-vectors */
-starpu_data_filter f =
+struct starpu_data_filter f =
 {
     .filter_func = starpu_vector_filter_block,
     .nchildren = PARTS

+ 4 - 0
doc/doxygen/chapters/08scheduling.doxy

@@ -128,6 +128,10 @@ the user can for instance run a given task a thousand times, measure the global
 consumption for that series of tasks, divide it by a thousand, repeat for
 varying kinds of tasks and task sizes, and eventually feed StarPU
 with these manual measurements through starpu_perfmodel_update_history().
+For instance, for CUDA devices, <c>nvidia-smi -q -d POWER</c> can be used to get
+the current consumption in Watt. Multiplying that value by the average duration
+of a single task gives the consumption of the task in Joules, which can be given
+to starpu_perfmodel_update_history().
 
 \section StaticScheduling Static Scheduling
 

+ 10 - 0
doc/doxygen/chapters/13offline_performance_tools.doxy

@@ -337,6 +337,7 @@ $ R
 > install.packages("plyr")
 > install.packages("ggplot2")
 > install.packages("data.table")
+> install.packages("knitr")
 \endverbatim
 
 The pj_dump tool from pajeng is also needed (see
@@ -364,6 +365,15 @@ and see the resulting pdf file:
 \image html paje_draw_histogram.png
 \image latex paje_draw_histogram.eps "" width=\textwidth
 
+A quick statistical report can be generated by using:
+
+\verbatim
+$ starpu_paje_summary native.trace simgrid.trace
+\endverbatim
+
+it includes gantt charts, execution summaries, as well as state duration charts
+and time distribution histograms.
+
 \section TheoreticalLowerBoundOnExecutionTime Theoretical Lower Bound On Execution Time
 
 StarPU can record a trace of what tasks are needed to complete the

+ 5 - 5
doc/doxygen/chapters/18mic_scc_support.doxy

@@ -6,13 +6,13 @@
  * See the file version.doxy for copying conditions.
  */
 
-/*! \page MICSCCSupport MIC/SCC Support
+/*! \page MICSCCSupport MIC Xeon Phi / SCC Support
 
 \section Compilation Compilation
 
 SCC support just needs the presence of the RCCE library.
 
-MIC support actually needs two compilations of StarPU, one for the host and one for
+MIC Xeon Phi support actually needs two compilations of StarPU, one for the host and one for
 the device. The PATH environment variable has to include the path to the
 cross-compilation toolchain, for instance <c>/usr/linux-k1om-4.7/bin</c>
 The script <c>mic-configure</c> can then be used to achieve the two compilations: it basically
@@ -20,12 +20,12 @@ calls <c>configure</c> as appropriate from two new directories: <c>build_mic</c>
 <c>build_host</c>. <c>make</c> and <c>make install</c> can then be used as usual and will
 recurse into both directories.
 
-\section PortingApplicationsToMICSCC Porting Applications To MIC/SCC
+\section PortingApplicationsToMICSCC Porting Applications To MIC Xeon Phi / SCC
 
-The simplest way to port an application to MIC/SCC is to set the field
+The simplest way to port an application to MIC Xeon Phi or SCC is to set the field
 starpu_codelet::cpu_funcs_name, to provide StarPU with the function
 name of the CPU implementation. StarPU will thus simply use the
-existing CPU implementation (cross-rebuilt in the MIC case). The
+existing CPU implementation (cross-rebuilt in the MIC Xeon Phi case). The
 functions have to be globally-visible (i.e. not <c>static</c>) for
 StarPU to be able to look them up.
 

+ 9 - 0
doc/doxygen/chapters/40environment_variables.doxy

@@ -548,6 +548,15 @@ intended to be used for experimental purposes as it emulates devices
 that have a limited amount of memory.
 </dd>
 
+<dt>STARPU_TRACE_BUFFER_SIZE</dt>
+<dd>
+\anchor STARPU_TRACE_BUFFER_SIZE
+\addindex __env__STARPU_TRACE_BUFFER_SIZE
+This sets the buffer size for recording trace events in MiB. Setting it to a big
+size allows to avoid pauses in the trace while it is recorded on the disk. This
+however also consumes memory, of course. The default value is 64.
+</dd>
+
 <dt>STARPU_GENERATE_TRACE</dt>
 <dd>
 \anchor STARPU_GENERATE_TRACE

+ 63 - 7
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -115,6 +115,11 @@ Defines the maximum number of buffers that tasks will be able to take
 as parameters. The default value is 8, it can be changed by using the
 configure option \ref enable-maxbuffers "--enable-maxbuffers".
 
+\def STARPU_VARIABLE_NBUFFERS
+\ingroup API_Codelet_And_Tasks
+Value to set in starpu_codelet::nbuffers to specify that the codelet can accept
+a variable number of buffers, specified in starpu_task::nbuffers.
+
 \typedef starpu_cpu_func_t
 \ingroup API_Codelet_And_Tasks
 CPU implementation of a codelet.
@@ -270,11 +275,14 @@ starpu_codelet::cpu_funcs_name is non-NULL, in which case StarPU will
 simply make a symbol lookup to get the implementation.
 
 \var starpu_codelet::nbuffers
-Specify the number of arguments taken by the codelet. These arguments
-are managed by the DSM and are accessed from the <c>void *buffers[]</c>
-array. The constant argument passed with the field starpu_task::cl_arg
-is not counted in this number. This value should not be above
-\ref STARPU_NMAXBUFS.
+Specify the number of arguments taken by the codelet. These arguments are
+managed by the DSM and are accessed from the <c>void *buffers[]</c> array. The
+constant argument passed with the field starpu_task::cl_arg is not counted in
+this number. This value should not be above \ref STARPU_NMAXBUFS. It may be set
+to STARPU_VARIABLE_NBUFFERS to specify that the number of buffers and their
+access modes will be set in starpu_task::nbuffers and starpu_task::modes or
+starpu_task::dyn_modes, which thus permits to define codelets with a varying
+number of data.
 
 \var starpu_codelet::modes
 Is an array of ::starpu_data_access_mode. It describes the required
@@ -362,27 +370,33 @@ starpu_task_create(), or declared statically. In the latter case, the
 programmer has to zero the structure starpu_task and to fill the
 different fields properly. The indicated default values correspond to
 the configuration of a task allocated with starpu_task_create().
+
 \var starpu_task::name
 Optional name of the task. This can be useful for debugging
 purposes.
+
 \var starpu_task::cl
 Is a pointer to the corresponding structure starpu_codelet. This
 describes where the kernel should be executed, and supplies the
 appropriate implementations. When set to NULL, no code is executed
 during the tasks, such empty tasks can be useful for synchronization
 purposes.
-\var starpu_task::buffers
-\deprecated
 This field has been made deprecated. One should use instead the
 field starpu_task::handles to specify the data handles accessed
 by the task. The access modes are now defined in the field
 starpu_codelet::modes.
+
+\var starpu_task::nbuffers
+Specifies the number of buffers. This is only used when starpu_codelet::nbuffers
+is STARPU_VARIABLE_NBUFFERS.
+
 \var starpu_task::handles
 Is an array of ::starpu_data_handle_t. It specifies the handles to the
 different pieces of data accessed by the task. The number of entries
 in this array must be specified in the field starpu_codelet::nbuffers,
 and should not exceed \ref STARPU_NMAXBUFS. If unsufficient, this value can
 be set with the configure option \ref enable-maxbuffers "--enable-maxbuffers".
+
 \var starpu_task::dyn_handles
 Is an array of ::starpu_data_handle_t. It specifies the handles to the
 different pieces of data accessed by the task. The number of entries
@@ -401,6 +415,25 @@ The actual data pointers to the memory node where execution will
 happen, managed by the DSM. Is used when the field
 starpu_task::dyn_handles is defined.
 
+\var starpu_task::modes
+Is used only when starpu_codelet::nbuffers is STARPU_VARIABLE_NBUFFERS.
+It is an array of ::starpu_data_access_mode. It describes the required
+access modes to the data neeeded by the codelet (e.g. ::STARPU_RW). The
+number of entries in this array must be specified in the field
+starpu_task::nbuffers, and should not exceed \ref STARPU_NMAXBUFS. If
+unsufficient, this value can be set with the configure option
+\ref enable-maxbuffers "--enable-maxbuffers".
+
+\var starpu_task::dyn_modes
+Is used only when starpu_codelet::nbuffers is STARPU_VARIABLE_NBUFFERS.
+It is an array of ::starpu_data_access_mode. It describes the required
+access modes to the data needed by the codelet (e.g. ::STARPU_RW).
+The number of entries in this array must be specified in the field
+starpu_codelet::nbuffers. This field should be used for codelets having a
+number of datas greater than \ref STARPU_NMAXBUFS (see \ref
+SettingTheDataHandlesForATask). When defining a codelet, one
+should either define this field or the field starpu_task::modes defined above.
+
 \var starpu_task::cl_arg
 Optional pointer which is passed to the codelet through the second
 argument of the codelet implementation (e.g. starpu_codelet::cpu_func
@@ -612,6 +645,11 @@ It is possible to initialize statically allocated tasks with
 this value. This is equivalent to initializing a structure starpu_task
 with the function starpu_task_init() function.
 
+\def STARPU_TASK_GET_NBUFFERS(task)
+\ingroup API_Codelet_And_Tasks
+Return the number of buffers for this task, i.e. starpu_codelet::nbuffers, or
+starpu_task::nbuffers if the former is STARPU_VARIABLE_BUFFERS.
+
 \def STARPU_TASK_GET_HANDLE(task, i)
 \ingroup API_Codelet_And_Tasks
 Return the \p i th data handle of the given task. If the task
@@ -647,6 +685,24 @@ starpu_codelet::modes or the \p i th element of the field
 starpu_codelet::dyn_modes (see \ref
 SettingTheDataHandlesForATask)
 
+\def STARPU_TASK_GET_MODE(codelet, i)
+\ingroup API_Codelet_And_Tasks
+Return the access mode of the \p i th data handle of the given
+task. If the task is defined with a static or dynamic number of
+handles, will either return the \p i th element of the field
+starpu_task::modes or the \p i th element of the field
+starpu_task::dyn_modes (see \ref
+SettingTheDataHandlesForATask)
+
+\def STARPU_TASK_SET_MODE(task, mode, i)
+\ingroup API_Codelet_And_Tasks
+Set the access mode of the \p i th data handle of the given
+task. If the task is defined with a static or dynamic number of
+handles, will either set the \p i th element of the field
+starpu_task::modes or the \p i th element of the field
+starpu_task::dyn_modes (see \ref
+SettingTheDataHandlesForATask)
+
 \fn struct starpu_task *starpu_task_create(void)
 \ingroup API_Codelet_And_Tasks
 Allocate a task structure and initialize it with default

+ 26 - 0
doc/doxygen/chapters/api/data_interfaces.doxy

@@ -29,6 +29,8 @@ Return a 32bit footprint which characterizes the data size.
 Compare the data size of two interfaces.
 \var starpu_data_interface_ops::display
 Dump the sizes of a handle to a file.
+\var starpu_data_interface_ops::describe
+Describe the data into a string.
 \var starpu_data_interface_ops::interfaceid
 An identifier that is unique to each interface.
 \var starpu_data_interface_ops::interface_size
@@ -241,6 +243,12 @@ starpu_data_handle_t var_handle;
 starpu_variable_data_register(&var_handle, STARPU_MAIN_RAM, (uintptr_t)&var, sizeof(var));
 \endcode
 
+\fn void starpu_variable_ptr_register(starpu_data_handle_t handle, unsigned node, uintptr_t ptr, uintptr_t dev_handle, size_t offset)
+\ingroup API_Data_Interfaces
+Register into the \p handle that to store data on node \p node it should use the
+buffer located at \p ptr, or device handle \p dev_handle and offset \p offset
+(for OpenCL, notably)
+
 \fn void starpu_vector_data_register(starpu_data_handle_t *handle, unsigned home_node, uintptr_t ptr, uint32_t nx, size_t elemsize)
 \ingroup API_Data_Interfaces
 Register the \p nx elemsize-byte elements pointed to by \p ptr and initialize \p handle to represent it.
@@ -252,6 +260,12 @@ starpu_data_handle_t vector_handle;
 starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
 \endcode
 
+\fn void starpu_vector_ptr_register(starpu_data_handle_t handle, unsigned node, uintptr_t ptr, uintptr_t dev_handle, size_t offset)
+\ingroup API_Data_Interfaces
+Register into the \p handle that to store data on node \p node it should use the
+buffer located at \p ptr, or device handle \p dev_handle and offset \p offset
+(for OpenCL, notably)
+
 \fn void starpu_matrix_data_register(starpu_data_handle_t *handle, unsigned home_node, uintptr_t ptr, uint32_t ld, uint32_t nx, uint32_t ny, size_t elemsize)
 \ingroup API_Data_Interfaces
 Register the \p nx x \p  ny 2D matrix of \p elemsize-byte elements pointed
@@ -267,6 +281,12 @@ matrix = (float*)malloc(width * height * sizeof(float));
 starpu_matrix_data_register(&matrix_handle, STARPU_MAIN_RAM, (uintptr_t)matrix, width, width, height, sizeof(float));
 \endcode
 
+\fn void starpu_matrix_ptr_register(starpu_data_handle_t handle, unsigned node, uintptr_t ptr, uintptr_t dev_handle, size_t offset, uint32_t ld)
+\ingroup API_Data_Interfaces
+Register into the \p handle that to store data on node \p node it should use the
+buffer located at \p ptr, or device handle \p dev_handle and offset \p offset
+(for OpenCL, notably), with \p ld elements between rows.
+
 \fn void starpu_block_data_register(starpu_data_handle_t *handle, unsigned home_node, uintptr_t ptr, uint32_t ldy, uint32_t ldz, uint32_t nx, uint32_t ny, uint32_t nz, size_t elemsize)
 \ingroup API_Data_Interfaces
 Register the \p nx x \p ny x \p nz 3D matrix of \p elemsize byte elements
@@ -281,6 +301,12 @@ block = (float*)malloc(nx*ny*nz*sizeof(float));
 starpu_block_data_register(&block_handle, STARPU_MAIN_RAM, (uintptr_t)block, nx, nx*ny, nx, ny, nz, sizeof(float));
 \endcode
 
+\fn void starpu_block_ptr_register(starpu_data_handle_t handle, unsigned node, uintptr_t ptr, uintptr_t dev_handle, size_t offset, uint32_t ldy, uint32_t ldz)
+\ingroup API_Data_Interfaces
+Register into the \p handle that to store data on node \p node it should use the
+buffer located at \p ptr, or device handle \p dev_handle and offset \p offset
+(for OpenCL, notably), with \p ldy elements between rows and \ldz elements between z planes.
+
 \fn void starpu_bcsr_data_register(starpu_data_handle_t *handle, unsigned home_node, uint32_t nnz, uint32_t nrow, uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, uint32_t r, uint32_t c, size_t elemsize)
 \ingroup API_Data_Interfaces
 This variant of starpu_data_register() uses the BCSR (Blocked

+ 6 - 0
doc/doxygen/chapters/api/data_management.doxy

@@ -109,6 +109,12 @@ vector or matrix) which can be registered by the means of helper
 functions (e.g. starpu_vector_data_register() or
 starpu_matrix_data_register()).
 
+\fn void starpu_data_ptr_register(starpu_data_handle_t handle, unsigned node)
+\ingroup API_Data_Management
+Register that a buffer for \p handle on \p node will be set. This is typically
+used by starpu_*_ptr_register helpers before setting the interface pointers for
+this node, to tell the core that that is now allocated.
+
 \fn void starpu_data_register_same(starpu_data_handle_t *handledst, starpu_data_handle_t handlesrc)
 \ingroup API_Data_Management
 Register a new piece of data into the handle \p handledst with the

+ 0 - 4
doc/doxygen/chapters/api/performance_model.doxy

@@ -94,8 +94,6 @@ arch-specific factor.
 is the symbol name for the performance model, which will be used as
 file name to store the model. It must be set otherwise the model will
 be ignored.
-\var starpu_perfmodel::cost_model
-\deprecated
 This field is deprecated. Use instead the field starpu_perfmodel::cost_function field.
 \var starpu_perfmodel::cost_function
 Used by ::STARPU_COMMON: takes a task and implementation number, and
@@ -158,8 +156,6 @@ number of sample values for non-linear regression
 contains information about the performance model of a given
 arch.
 \ingroup API_Performance_Model
-\var starpu_perfmodel_per_arch::cost_model
-\deprecated
 This field is deprecated. Use instead the field
 starpu_perfmodel_per_arch::cost_function.
 \var starpu_perfmodel_per_arch::cost_function

+ 5 - 1
doc/doxygen/chapters/api/workers.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  */
@@ -222,4 +222,8 @@ Returns the type of the given node as defined by
 this function should be used in the allocation function to determine
 on which device the memory needs to be allocated.
 
+\fn char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type)
+\ingroup API_Workers_Properties
+Returns the given worker type as a string.
+
 */

+ 1 - 1
doc/doxygen/doxygen-config.cfg.in

@@ -66,6 +66,6 @@ EXAMPLE_PATH           = @top_srcdir@/doc/doxygen \
 
 INPUT_FILTER           = @top_builddir@/doc/doxygen/doxygen_filter.sh
 
-LATEX_HEADER           = @top_srcdir@/doc/doxygen/refman.tex
+#LATEX_HEADER           = @top_srcdir@/doc/doxygen/refman.tex
 
 IMAGE_PATH             = @top_srcdir@/doc/doxygen/chapters

+ 1 - 49
doc/doxygen/refman.tex

@@ -1,52 +1,4 @@
-\documentclass{book}
-\usepackage[a4paper,top=2.5cm,bottom=2.5cm,left=2.5cm,right=2.5cm]{geometry}
-\usepackage{makeidx}
-\usepackage{natbib}
-\usepackage{graphicx}
-\usepackage{multicol}
-\usepackage{float}
-\usepackage{listings}
-\usepackage{color}
-\usepackage{ifthen}
-\usepackage[table]{xcolor}
-\usepackage{textcomp}
-\usepackage{alltt}
-\usepackage{ifpdf}
-\usepackage{./version}
-\ifpdf
-\usepackage[pdftex,
-            pagebackref=true,
-            colorlinks=true,
-            linkcolor=blue,
-            unicode
-           ]{hyperref}
-\else
-\usepackage[ps2pdf,
-            pagebackref=true,
-            colorlinks=true,
-            linkcolor=blue,
-            unicode
-           ]{hyperref}
-\usepackage{pspicture}
-\fi
-\usepackage[utf8]{inputenc}
-\usepackage{mathptmx}
-\usepackage[scaled=.90]{helvet}
-\usepackage{courier}
-\usepackage{sectsty}
-\usepackage{amssymb}
-\usepackage[titles]{tocloft}
-\usepackage{doxygen}
-\lstset{language=C++,inputencoding=utf8,basicstyle=\footnotesize,breaklines=true,breakatwhitespace=true,tabsize=8,numbers=left }
-\makeindex
-\setcounter{tocdepth}{3}
-\renewcommand{\familydefault}{\sfdefault}
-\hfuzz=15pt
-\setlength{\emergencystretch}{15pt}
-\hbadness=750
-\tolerance=750
-\begin{document}
-\hypersetup{pageanchor=false,citecolor=blue}
+\input{./version.sty}
 \begin{titlepage}
 \vspace*{4cm}
 {\Huge \textbf{StarPU Handbook}}\\

+ 16 - 2
doc/tutorial/Makefile

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2009-2011  Université de Bordeaux 1
-# Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
+# Copyright (C) 2010-2014  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -24,7 +24,7 @@ HAS_OPENCL	=	$(shell pkg-config --libs starpu-1.1 |grep -i opencl)
 %.o: %.cu
 	nvcc $(CFLAGS) $< -c
 
-TARGETS = hello_world vector_scal hello_world_plugin vector_scal_plugin
+TARGETS = hello_world vector_scal hello_world_plugin vector_scal_plugin vector_scal_task_insert
 
 all: $(TARGETS)
 
@@ -42,6 +42,20 @@ endif
 vector_scal: $(VECTOR_SCAL_PREREQUISITES)
 	$(VECTOR_SCAL_COMPILER) $(LDFLAGS) $^ -o $@
 
+VECTOR_SCAL_TASK_INSERT_PREREQUISITES		=	vector_scal_task_insert.o vector_scal_cpu.o
+ifneq ($(strip $(HAS_CUDA)),)
+VECTOR_SCAL_TASK_INSERT_PREREQUISITES		+=	vector_scal_cuda.o
+VECTOR_SCAL_TASK_INSERT_COMPILER		=	$(NVCC)
+else
+VECTOR_SCAL_TASK_INSERT_COMPILER		=	$(CC)
+endif
+ifneq ($(strip $(HAS_OPENCL)),)
+VECTOR_SCAL_TASK_INSERT_PREREQUISITES += vector_scal_opencl.o
+endif
+
+vector_scal_task_insert: $(VECTOR_SCAL_TASK_INSERT_PREREQUISITES)
+	$(VECTOR_SCAL_TASK_INSERT_COMPILER) $(LDFLAGS) $^ -o $@
+
 hello_world_plugin: hello_world_plugin.c
 	$(CC) $(CFLAGS) -fplugin=`pkg-config starpu-1.1 --variable=gccplugin` $(LDFLAGS) $^ -o $@
 

+ 117 - 0
doc/tutorial/vector_scal_task_insert.c

@@ -0,0 +1,117 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This example demonstrates how to use StarPU to scale an array by a factor.
+ * It shows how to manipulate data with StarPU's data management library.
+ *  1- how to declare a piece of data to StarPU (starpu_vector_data_register)
+ *  2- how to submit a task to StarPU
+ *  3- how a kernel can manipulate the data (buffers[0].vector.ptr)
+ */
+#include <starpu.h>
+
+#define    NX    2048
+
+extern void vector_scal_cpu(void *buffers[], void *_args);
+extern void vector_scal_cuda(void *buffers[], void *_args);
+extern void vector_scal_opencl(void *buffers[], void *_args);
+
+static struct starpu_codelet cl = {
+	/* CPU implementation of the codelet */
+	.cpu_funcs = {vector_scal_cpu, NULL},
+#ifdef STARPU_USE_CUDA
+	/* CUDA implementation of the codelet */
+	.cuda_funcs = {vector_scal_cuda, NULL},
+#endif
+#ifdef STARPU_USE_OPENCL
+	/* OpenCL implementation of the codelet */
+	.opencl_funcs = {vector_scal_opencl, NULL},
+#endif
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+#ifdef STARPU_USE_OPENCL
+struct starpu_opencl_program programs;
+#endif
+
+int main(int argc, char **argv)
+{
+	/* We consider a vector of float that is initialized just as any of C
+	 * data */
+	float vector[NX];
+	unsigned i;
+	for (i = 0; i < NX; i++)
+		vector[i] = 1.0f;
+
+	fprintf(stderr, "BEFORE : First element was %f\n", vector[0]);
+
+	/* Initialize StarPU with default configuration */
+	int ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_OPENCL
+	starpu_opencl_load_opencl_from_file("vector_scal_opencl_kernel.cl", &programs, NULL);
+#endif
+
+	/* Tell StaPU to associate the "vector" vector with the "vector_handle"
+	 * identifier. When a task needs to access a piece of data, it should
+	 * refer to the handle that is associated to it.
+	 * In the case of the "vector" data interface:
+	 *  - the first argument of the registration method is a pointer to the
+	 *    handle that should describe the data
+	 *  - the second argument is the memory node where the data (ie. "vector")
+	 *    resides initially: STARPU_MAIN_RAM stands for an address in main memory, as
+	 *    opposed to an adress on a GPU for instance.
+	 *  - the third argument is the adress of the vector in RAM
+	 *  - the fourth argument is the number of elements in the vector
+	 *  - the fifth argument is the size of each element.
+	 */
+	starpu_data_handle_t vector_handle;
+	starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector,
+				    NX, sizeof(vector[0]));
+
+	float factor = 3.14;
+
+	ret = starpu_task_insert(&cl,
+				 /* an argument is passed to the codelet, beware that this is a
+				  * READ-ONLY buffer and that the codelet may be given a pointer to a
+				  * COPY of the argument */
+				 STARPU_VALUE, &factor, sizeof(factor),
+				 /* the codelet manipulates one buffer in RW mode */
+				 STARPU_RW, vector_handle,
+				 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+
+	/* Wait for tasks completion */
+	starpu_task_wait_for_all();
+
+	/* StarPU does not need to manipulate the array anymore so we can stop
+	 * monitoring it */
+	starpu_data_unregister(vector_handle);
+
+#ifdef STARPU_USE_OPENCL
+	starpu_opencl_unload_opencl(&programs);
+#endif
+
+	/* terminate StarPU, no task can be submitted after */
+	starpu_shutdown();
+
+	fprintf(stderr, "AFTER First element is %f\n", vector[0]);
+
+	return 0;
+}

+ 39 - 94
examples/Makefile.am

@@ -123,22 +123,22 @@ noinst_HEADERS = 				\
 # What to install and what to check #
 #####################################
 
-STARPU_EXAMPLES	=
-TESTS		=	$(STARPU_EXAMPLES)
-TESTS		+=	scheduler/schedulers.sh
+examplebin_PROGRAMS 	+=	$(STARPU_EXAMPLES)
 
+TESTS			=	$(STARPU_EXAMPLES)
+TESTS			+=	scheduler/schedulers.sh
 
 if STARPU_HAVE_WINDOWS
-check_PROGRAMS	=	$(STARPU_EXAMPLES)
+check_PROGRAMS		=	$(STARPU_EXAMPLES)
 else
-check_PROGRAMS	=	$(LOADER) $(STARPU_EXAMPLES)
+check_PROGRAMS		=	$(LOADER) $(STARPU_EXAMPLES)
 endif
 
 if !STARPU_HAVE_WINDOWS
 ## test loader program
 if !STARPU_CROSS_COMPILING
 LOADER			=	loader
-loader_CPPFLAGS =  $(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+loader_CPPFLAGS 	=	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
 LOADER_BIN		=	$(abs_top_builddir)/examples/$(LOADER)
 loader_SOURCES		=	../tests/loader.c
 else
@@ -155,22 +155,21 @@ endif
 
 endif
 
-examplebin_PROGRAMS +=				\
+# STARPU_EXAMPLES list all applications which have to be compiled and checked
+# Applications which should only be compiled are added directly in examplebin_PROGRAMS
+# see for instance mandelbrot/mandelbrot
+
+STARPU_EXAMPLES =				\
 	basic_examples/hello_world		\
 	basic_examples/vector_scal		\
 	basic_examples/mult			\
 	basic_examples/block			\
 	basic_examples/variable			\
 	basic_examples/multiformat              \
-	basic_examples/dynamic_handles		\
 	cpp/incrementer_cpp			\
-	filters/custom_mf/custom_mf_filter      \
 	filters/fvector				\
 	filters/fblock				\
 	filters/fmatrix				\
-	filters/shadow				\
-	filters/shadow2d			\
-	filters/shadow3d			\
 	tag_example/tag_example			\
 	tag_example/tag_example2		\
 	tag_example/tag_example3		\
@@ -185,33 +184,28 @@ examplebin_PROGRAMS +=				\
 	interface/complex			\
 	matvecmult/matvecmult			\
 	profiling/profiling			\
+	scheduler/dummy_sched			\
 	sched_ctx/sched_ctx			\
-	sched_ctx/parallel_code			\
-	sched_ctx/dummy_sched_with_ctx		\
 	sched_ctx/prio				\
-	sched_ctx/sched_ctx_without_sched_policy\
-	sched_ctx/nested_sched_ctxs		\
+	sched_ctx/dummy_sched_with_ctx		\
 	worker_collections/worker_tree_example  \
 	worker_collections/worker_list_example  \
 	reductions/dot_product			\
-	reductions/minmax_reduction		\
-	mandelbrot/mandelbrot			\
-	ppm_downscaler/ppm_downscaler		\
-	ppm_downscaler/yuv_downscaler
+	reductions/minmax_reduction
 
 if !STARPU_SIMGRID
-examplebin_PROGRAMS +=				\
+STARPU_EXAMPLES +=				\
 	scheduler/dummy_sched
 
 if STARPU_HAVE_F77_H
-examplebin_PROGRAMS +=				\
+STARPU_EXAMPLES +=				\
 	basic_examples/vector_scal_fortran	\
 	fortran/hello
 endif
 endif
 
 if !NO_BLAS_LIB
-examplebin_PROGRAMS +=				\
+STARPU_EXAMPLES +=				\
 	axpy/axpy				\
 	mult/sgemm 				\
 	mult/dgemm				\
@@ -229,7 +223,7 @@ examplebin_PROGRAMS +=				\
 endif
 
 if MKL_BLAS_LIB
-examplebin_PROGRAMS +=				\
+STARPU_EXAMPLES +=				\
 	lu/lu_example_complex_float		\
 	lu/lu_example_complex_double		\
 	lu/lu_implicit_example_complex_float	\
@@ -237,46 +231,10 @@ examplebin_PROGRAMS +=				\
 endif
 
 if ATLAS_BLAS_LIB
-examplebin_PROGRAMS +=				\
+STARPU_EXAMPLES +=				\
 	spmv/dw_block_spmv
 endif
 
-STARPU_EXAMPLES +=				\
-	basic_examples/hello_world		\
-	basic_examples/vector_scal		\
-	basic_examples/mult			\
-	basic_examples/block			\
-	basic_examples/variable			\
-	basic_examples/multiformat              \
-	cpp/incrementer_cpp			\
-	filters/fvector				\
-	filters/fblock				\
-	filters/fmatrix				\
-	tag_example/tag_example			\
-	tag_example/tag_example2		\
-	tag_example/tag_example3		\
-	tag_example/tag_example4		\
-	tag_example/tag_restartable		\
-	spmd/vector_scal_spmd			\
-	spmv/spmv				\
-	callback/callback			\
-	callback/prologue			\
-	incrementer/incrementer			\
-	binary/binary				\
-	interface/complex			\
-	matvecmult/matvecmult			\
-	profiling/profiling			\
-	scheduler/dummy_sched			\
-	sched_ctx/sched_ctx			\
-	sched_ctx/prio				\
-	sched_ctx/dummy_sched_with_ctx		\
-	sched_ctx/sched_ctx_without_sched_policy\
-	sched_ctx/nested_sched_ctxs		\
-	worker_collections/worker_tree_example  \
-	worker_collections/worker_list_example  \
-	reductions/dot_product			\
-	reductions/minmax_reduction
-
 if STARPU_LONG_CHECK
 STARPU_EXAMPLES +=				\
 	sched_ctx/parallel_code
@@ -288,35 +246,17 @@ STARPU_EXAMPLES +=				\
 	fortran/hello
 endif
 
-if !NO_BLAS_LIB
+if !STARPU_HAVE_WINDOWS
 STARPU_EXAMPLES +=				\
-	axpy/axpy				\
-	mult/sgemm 				\
-	mult/dgemm				\
-	cholesky/cholesky_tag			\
-	cholesky/cholesky_tile_tag		\
-	cholesky/cholesky_grain_tag		\
-	cholesky/cholesky_implicit		\
-	lu/lu_example_float			\
-	lu/lu_example_double			\
-	lu/lu_implicit_example_float		\
-	lu/lu_implicit_example_double		\
-	heat/heat				\
-	cg/cg					\
-	pipeline/pipeline
-endif
+	openmp/vector_scal_omp			\
+	sched_ctx/sched_ctx_without_sched_policy\
+	sched_ctx/nested_sched_ctxs		\
+	sched_ctx/sched_ctx_without_sched_policy
 
-if MKL_BLAS_LIB
+if STARPU_LONG_CHECK
 STARPU_EXAMPLES +=				\
-	lu/lu_example_complex_float		\
-	lu/lu_example_complex_double		\
-	lu/lu_implicit_example_complex_float	\
-	lu/lu_implicit_example_complex_double
+	sched_ctx/parallel_code
 endif
-
-if ATLAS_BLAS_LIB
-STARPU_EXAMPLES +=				\
-	spmv/dw_block_spmv
 endif
 
 ##################
@@ -428,7 +368,6 @@ nobase_STARPU_OPENCL_DATA_DATA += 		\
 	basic_examples/variable_kernels_opencl_kernel.cl
 endif
 
-
 ###########
 # Filters #
 ###########
@@ -448,10 +387,16 @@ nobase_STARPU_OPENCL_DATA_DATA += \
 	filters/fblock_opencl_kernel.cl
 endif
 
-
 #############################
 # Custom multiformat filter #
 #############################
+
+#TODO: see why the application is failing
+#lt-custom_mf_filter: .../src/datawizard/malloc.c:784: starpu_free_on_node: Assertion `chunk != _starpu_chunk_list_end(chunks[dst_node])' failed.
+
+examplebin_PROGRAMS +=				\
+	filters/custom_mf/custom_mf_filter
+
 filters_custom_mf_custom_mf_filter_SOURCES=\
 	filters/custom_mf/custom_mf_filter.c \
 	filters/custom_mf/custom_interface.c   \
@@ -826,6 +771,9 @@ endif
 # Mandelbrot Set #
 ##################
 
+examplebin_PROGRAMS +=				\
+	mandelbrot/mandelbrot
+
 mandelbrot_mandelbrot_CPPFLAGS = $(AM_CFLAGS) $(AM_CPPFLAGS)
 if HAVE_X11
 mandelbrot_mandelbrot_CPPFLAGS += $(X_CFLAGS)
@@ -846,9 +794,7 @@ top_hello_world_top_SOURCES =			\
 # Pi #
 ######
 
-check_PROGRAMS +=				\
-	pi/pi					\
-	pi/pi_redux
+if !STARPU_HAVE_WINDOWS
 
 examplebin_PROGRAMS +=				\
 	pi/pi					\
@@ -874,6 +820,7 @@ pi_pi_redux_SOURCES +=				\
 pi_pi_redux_LDADD =				\
 	$(STARPU_CURAND_LDFLAGS)
 endif
+endif
 
 ###########################
 # OpenGL interoperability #
@@ -915,9 +862,6 @@ endif
 ##################
 
 if !STARPU_HAVE_WINDOWS
-examplebin_PROGRAMS +=		\
-	openmp/vector_scal_omp
-
 openmp_vector_scal_omp_CFLAGS = \
 	$(AM_CFLAGS) -fopenmp
 
@@ -937,3 +881,4 @@ showcheck:
 	for i in $(SUBDIRS) ; do \
 		make -C $$i showcheck ; \
 	done
+

+ 0 - 1
examples/callback/prologue.c

@@ -17,7 +17,6 @@
 
 #include <starpu.h>
 #include <sys/time.h>
-#include <omp.h>
 
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
 

+ 10 - 1
examples/openmp/vector_scal_omp.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2010-2013  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -93,6 +93,7 @@ int main(int argc, char **argv)
 	conf.sched_policy_name = "pheft";
 
 	ret = starpu_init(&conf);
+	if (ret == -ENODEV) return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	starpu_data_handle_t vector_handle;
@@ -111,6 +112,7 @@ int main(int argc, char **argv)
 		task->cl_arg_size = sizeof(factor);
 
 		ret = starpu_task_submit(task);
+		if (ret == -ENODEV) goto enodev;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 
@@ -122,5 +124,12 @@ int main(int argc, char **argv)
 	FPRINTF(stderr, "AFTER: First element is %f\n", vector[0]);
 	FPRINTF(stderr, "AFTER: Last element is %f\n", vector[NX-1]);
 
+	free(vector);
 	return 0;
+
+enodev:
+	starpu_data_unregister(vector_handle);
+	free(vector);
+	starpu_shutdown();
+	return 77;
 }

+ 5 - 1
examples/scheduler/schedulers.sh

@@ -23,7 +23,11 @@ check_success()
     fi
 }
 
-[ -x ./cholesky/cholesky_tag ] || exit 77
+if test ! -x ./cholesky/cholesky_tag
+then
+    echo "Application ./cholesky/cholesky_tag unavailable"
+    exit 77
+fi
 
 SCHEDULERS=`STARPU_SCHED="help" ./basic_examples/hello_world 2>&1 | awk '/\t->/ {print $1}'`
 

+ 2 - 0
include/starpu_config.h.in

@@ -55,6 +55,7 @@
 #undef STARPU_HAVE_MALLOC_H
 
 #undef STARPU_HAVE_SYNC_BOOL_COMPARE_AND_SWAP
+#undef STARPU_HAVE_SYNC_VAL_COMPARE_AND_SWAP
 #undef STARPU_HAVE_SYNC_FETCH_AND_ADD
 #undef STARPU_HAVE_SYNC_FETCH_AND_OR
 #undef STARPU_HAVE_SYNC_LOCK_TEST_AND_SET
@@ -87,6 +88,7 @@
 #undef STARPU_HAVE_LIBNUMA
 
 #undef STARPU_HAVE_WINDOWS
+#undef STARPU_LINUX_SYS
 #undef STARPU_HAVE_UNSETENV
 
 #ifdef _MSC_VER

+ 8 - 2
include/starpu_data_interfaces.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
- * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011-2012  Institut National de Recherche en Informatique et Automatique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -111,6 +111,7 @@ struct starpu_data_interface_ops
 	uint32_t 	 (*footprint)			(starpu_data_handle_t handle);
 	int 		 (*compare)			(void *data_interface_a, void *data_interface_b);
 	void 		 (*display)			(starpu_data_handle_t handle, FILE *f);
+	ssize_t		 (*describe)			(void *data_interface, char *buf, size_t size);
 	enum starpu_data_interface_id interfaceid;
 	size_t interface_size;
 
@@ -124,6 +125,7 @@ struct starpu_data_interface_ops
 int starpu_data_interface_get_next_id(void);
 
 void starpu_data_register(starpu_data_handle_t *handleptr, unsigned home_node, void *data_interface, struct starpu_data_interface_ops *ops);
+void starpu_data_ptr_register(starpu_data_handle_t handle, unsigned node);
 void starpu_data_register_same(starpu_data_handle_t *handledst, starpu_data_handle_t handlesrc);
 
 void *starpu_data_handle_to_pointer(starpu_data_handle_t handle, unsigned node);
@@ -147,6 +149,7 @@ struct starpu_matrix_interface
 };
 
 void starpu_matrix_data_register(starpu_data_handle_t *handle, unsigned home_node, uintptr_t ptr, uint32_t ld, uint32_t nx, uint32_t ny, size_t elemsize);
+void starpu_matrix_ptr_register(starpu_data_handle_t handle, unsigned node, uintptr_t ptr, uintptr_t dev_handle, size_t offset, uint32_t ld);
 uint32_t starpu_matrix_get_nx(starpu_data_handle_t handle);
 uint32_t starpu_matrix_get_ny(starpu_data_handle_t handle);
 uint32_t starpu_matrix_get_local_ld(starpu_data_handle_t handle);
@@ -215,6 +218,7 @@ struct starpu_block_interface
 };
 
 void starpu_block_data_register(starpu_data_handle_t *handle, unsigned home_node, uintptr_t ptr, uint32_t ldy, uint32_t ldz, uint32_t nx, uint32_t ny, uint32_t nz, size_t elemsize);
+void starpu_block_ptr_register(starpu_data_handle_t handle, unsigned node, uintptr_t ptr, uintptr_t dev_handle, size_t offset, uint32_t ldy, uint32_t ldz);
 uint32_t starpu_block_get_nx(starpu_data_handle_t handle);
 uint32_t starpu_block_get_ny(starpu_data_handle_t handle);
 uint32_t starpu_block_get_nz(starpu_data_handle_t handle);
@@ -245,6 +249,7 @@ struct starpu_vector_interface
 };
 
 void starpu_vector_data_register(starpu_data_handle_t *handle, unsigned home_node, uintptr_t ptr, uint32_t nx, size_t elemsize);
+void starpu_vector_ptr_register(starpu_data_handle_t handle, unsigned node, uintptr_t ptr, uintptr_t dev_handle, size_t offset);
 uint32_t starpu_vector_get_nx(starpu_data_handle_t handle);
 size_t starpu_vector_get_elemsize(starpu_data_handle_t handle);
 uintptr_t starpu_vector_get_local_ptr(starpu_data_handle_t handle);
@@ -266,6 +271,7 @@ struct starpu_variable_interface
 };
 
 void starpu_variable_data_register(starpu_data_handle_t *handle, unsigned home_node, uintptr_t ptr, size_t size);
+void starpu_variable_ptr_register(starpu_data_handle_t handle, unsigned node, uintptr_t ptr, uintptr_t dev_handle, size_t offset);
 size_t starpu_variable_get_elemsize(starpu_data_handle_t handle);
 uintptr_t starpu_variable_get_local_ptr(starpu_data_handle_t handle);
 

+ 0 - 2
include/starpu_perfmodel.h

@@ -93,7 +93,6 @@ struct starpu_perfmodel_history_table;
 
 struct starpu_perfmodel_per_arch
 {
-	double (*cost_model)(struct starpu_data_descr *t) STARPU_DEPRECATED;
 	double (*cost_function)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 	size_t (*size_base)(struct starpu_task *, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 
@@ -119,7 +118,6 @@ struct starpu_perfmodel
 {
 	enum starpu_perfmodel_type type;
 
-	double (*cost_model)(struct starpu_data_descr *) STARPU_DEPRECATED;
 	double (*cost_function)(struct starpu_task *, unsigned nimpl);
 
 	size_t (*size_base)(struct starpu_task *, unsigned nimpl);

+ 2 - 0
include/starpu_sched_ctx.h

@@ -135,6 +135,8 @@ void starpu_sched_ctx_revert_task_counters(unsigned sched_ctx_id, double flops);
 
 void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_ctx);
 
+int starpu_sched_ctx_get_worker_rank(unsigned sched_ctx_id);
+
 #ifdef STARPU_USE_SC_HYPERVISOR
 void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
 #endif /* STARPU_USE_SC_HYPERVISOR */

+ 25 - 9
include/starpu_task.h

@@ -79,6 +79,8 @@ typedef starpu_scc_kernel_t (*starpu_scc_func_t)(void);
 #define STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS   ((starpu_cuda_func_t) -1)
 #define STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS ((starpu_opencl_func_t) -1)
 
+#define STARPU_VARIABLE_NBUFFERS (-1)
+
 struct starpu_task;
 struct starpu_codelet
 {
@@ -101,7 +103,7 @@ struct starpu_codelet
 
 	char *cpu_funcs_name[STARPU_MAXIMPLEMENTATIONS];
 
-	unsigned nbuffers;
+	int nbuffers;
 	enum starpu_data_access_mode modes[STARPU_NMAXBUFS];
 	enum starpu_data_access_mode *dyn_modes;
 
@@ -123,14 +125,15 @@ struct starpu_task
 
 	struct starpu_codelet *cl;
 
-	/* TODO: remove someday, this is costly */
-	struct starpu_data_descr buffers[STARPU_NMAXBUFS] STARPU_DEPRECATED;
+	int nbuffers;
 
 	starpu_data_handle_t handles[STARPU_NMAXBUFS];
 	void *interfaces[STARPU_NMAXBUFS];
+	enum starpu_data_access_mode modes[STARPU_NMAXBUFS];
 
 	starpu_data_handle_t *dyn_handles;
 	void **dyn_interfaces;
+	enum starpu_data_access_mode *dyn_modes;
 
 	void *cl_arg;
 	size_t cl_arg_size;
@@ -220,17 +223,30 @@ struct starpu_task
 	.prefetched = 0,					\
 	.dyn_handles = NULL,				\
 	.dyn_interfaces = NULL,				\
+	.dyn_modes = NULL,				\
 	.name = NULL                        		\
 }
 
-#define STARPU_TASK_GET_HANDLE(task, i) ((task->dyn_handles) ? task->dyn_handles[i] : task->handles[i])
-#define STARPU_TASK_SET_HANDLE(task, handle, i) do { if (task->dyn_handles) task->dyn_handles[i] = handle; else task->handles[i] = handle; } while(0)
+#define STARPU_TASK_GET_NBUFFERS(task) ((unsigned)((task)->cl->nbuffers == STARPU_VARIABLE_NBUFFERS ? ((task)->nbuffers) : ((task)->cl->nbuffers)))
+
+#define STARPU_TASK_GET_HANDLE(task, i) (((task)->dyn_handles) ? (task)->dyn_handles[i] : (task)->handles[i])
+#define STARPU_TASK_SET_HANDLE(task, handle, i) do { if ((task)->dyn_handles) (task)->dyn_handles[i] = handle; else (task)->handles[i] = handle; } while(0)
+
+#define STARPU_CODELET_GET_MODE(codelet, i) (((codelet)->dyn_modes) ? (codelet)->dyn_modes[i] : (codelet)->modes[i])
+#define STARPU_CODELET_SET_MODE(codelet, mode, i) do { if ((codelet)->dyn_modes) (codelet)->dyn_modes[i] = mode; else (codelet)->modes[i] = mode; } while(0)
 
-#define STARPU_CODELET_GET_MODE(codelet, i) ((codelet->dyn_modes) ? codelet->dyn_modes[i] : codelet->modes[i])
-#define STARPU_CODELET_SET_MODE(codelet, mode, i) do { if (codelet->dyn_modes) codelet->dyn_modes[i] = mode; else codelet->modes[i] = mode; } while(0)
+#define STARPU_TASK_GET_MODE(task, i) ((task)->cl->nbuffers == STARPU_VARIABLE_NBUFFERS ? \
+						(((task)->dyn_modes) ? (task)->dyn_modes[i] : (task)->modes[i]) : \
+						STARPU_CODELET_GET_MODE((task)->cl, i) )
+#define STARPU_TASK_SET_MODE(task, mode, i) do { \
+					if ((task)->cl->nbuffers == STARPU_VARIABLE_NBUFFERS) \
+						if ((task)->dyn_modes) (task)->dyn_modes[i] = mode; else (task)->modes[i] = mode; \
+					else \
+						STARPU_CODELET_SET_MODE((task)->cl, mode, i); \
+					} while(0)
 
-#define STARPU_CODELET_GET_NODE(codelet, i) ((codelet->dyn_nodes) ? codelet->dyn_nodes[i] : codelet->nodes[i])
-#define STARPU_CODELET_SET_NODE(codelet, __node, i) do { if (codelet->dyn_nodes) codelet->dyn_nodes[i] = __node; else codelet->nodes[i] = __node; } while(0)
+#define STARPU_CODELET_GET_NODE(codelet, i) (((codelet)->dyn_nodes) ? (codelet)->dyn_nodes[i] : (codelet)->nodes[i])
+#define STARPU_CODELET_SET_NODE(codelet, __node, i) do { if ((codelet)->dyn_nodes) (codelet)->dyn_nodes[i] = __node; else (codelet)->nodes[i] = __node; } while(0)
 
 void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...);
 void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t *array);

+ 3 - 1
include/starpu_thread.h

@@ -236,12 +236,14 @@ int starpu_pthread_barrier_wait(starpu_pthread_barrier_t *barrier);
  * Encapsulation of the pthread_spin_* functions.
  */
 
-#if defined(STARPU_SIMGRID) || !defined(STARPU_HAVE_PTHREAD_SPIN_LOCK)
+#if defined(STARPU_SIMGRID) || (defined(STARPU_LINUX_SYS) && defined(STARPU_HAVE_XCHG)) || !defined(STARPU_HAVE_PTHREAD_SPIN_LOCK)
 
 typedef struct
 {
 #ifdef STARPU_SIMGRID
 	int taken;
+#elif defined(STARPU_LINUX_SYS) && defined(STARPU_HAVE_XCHG)
+	unsigned taken STARPU_ATTRIBUTE_ALIGNED(16);
 #else /* we only have a trivial implementation yet ! */
 	uint32_t taken STARPU_ATTRIBUTE_ALIGNED(16);
 #endif

+ 6 - 0
include/starpu_util.h

@@ -176,6 +176,12 @@ STARPU_ATOMIC_SOMETHING(or, old | value)
 #define STARPU_BOOL_COMPARE_AND_SWAP(ptr, old, value) (starpu_cmpxchg((ptr), (old), (value)) == (old))
 #endif
 
+#ifdef STARPU_HAVE_SYNC_VAL_COMPARE_AND_SWAP
+#define STARPU_VAL_COMPARE_AND_SWAP(ptr, old, value)  (__sync_val_compare_and_swap ((ptr), (old), (value)))
+#elif defined(STARPU_HAVE_XCHG)
+#define STARPU_VAL_COMPARE_AND_SWAP(ptr, old, value) (starpu_cmpxchg((ptr), (old), (value)))
+#endif
+
 #ifdef STARPU_HAVE_SYNC_LOCK_TEST_AND_SET
 #define STARPU_TEST_AND_SET(ptr, value) (__sync_lock_test_and_set ((ptr), (value)))
 #define STARPU_RELEASE(ptr) (__sync_lock_release ((ptr)))

+ 3 - 0
include/starpu_worker.h

@@ -116,6 +116,9 @@ struct starpu_tree* starpu_workers_get_tree(void);
 unsigned starpu_worker_get_sched_ctx_list(int worker, unsigned **sched_ctx);
 
 unsigned starpu_worker_is_slave(int workerid);
+
+char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type);
+
 #ifdef __cplusplus
 }
 #endif

+ 13 - 0
m4/acinclude.m4

@@ -42,6 +42,19 @@ AC_DEFUN([STARPU_CHECK_SYNC_BOOL_COMPARE_AND_SWAP], [
 	      [Define to 1 if the target supports __sync_bool_compare_and_swap])
   fi])
 
+dnl Check whether the target supports __sync_val_compare_and_swap.
+AC_DEFUN([STARPU_CHECK_SYNC_VAL_COMPARE_AND_SWAP], [
+  AC_CACHE_CHECK([whether the target supports __sync_val_compare_and_swap],
+		 ac_cv_have_sync_val_compare_and_swap, [
+  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
+			[bar = __sync_val_compare_and_swap(&foo, 0, 1);])],
+			[ac_cv_have_sync_val_compare_and_swap=yes],
+			[ac_cv_have_sync_val_compare_and_swap=no])])
+  if test $ac_cv_have_sync_val_compare_and_swap = yes; then
+    AC_DEFINE(STARPU_HAVE_SYNC_VAL_COMPARE_AND_SWAP, 1,
+	      [Define to 1 if the target supports __sync_val_compare_and_swap])
+  fi])
+
 dnl Check whether the target supports __sync_fetch_and_add.
 AC_DEFUN([STARPU_CHECK_SYNC_FETCH_AND_ADD], [
   AC_CACHE_CHECK([whether the target supports __sync_fetch_and_add],

+ 1 - 1
mpi/examples/Makefile.am

@@ -152,7 +152,7 @@ mpi_lu_plu_implicit_example_double_LDADD =	\
 	$(STARPU_BLAS_LDFLAGS) -lm
 
 mpi_lu_plu_implicit_example_double_SOURCES =	\
-	mpi_lu/plu_outofcore_example_double.c	\
+	mpi_lu/plu_implicit_example_double.c	\
 	mpi_lu/plu_solve_double.c		\
 	mpi_lu/pdlu_kernels.c			\
 	mpi_lu/pdlu_implicit.c			\

+ 1 - 1
mpi/examples/mpi_lu/plu_outofcore_example.c

@@ -38,7 +38,7 @@ static unsigned check = 0;
 static int p = 1;
 static int q = 1;
 static unsigned display = 0;
-static char *path = "/tmp/starpu-mpi_LU";
+static char *path = "./starpu-ooc-files";
 
 #ifdef STARPU_HAVE_LIBNUMA
 static unsigned numa = 0;

+ 2 - 2
mpi/examples/mpi_lu/pxlu_implicit.c

@@ -133,14 +133,14 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 	{
 		create_task_11(k);
 
-		starpu_mpi_cache_flush(MPI_COMM_WORLD, STARPU_PLU(get_block_handle)(k,k));
-
 		for (i = k+1; i<nblocks; i++)
 		{
 			create_task_12(k, i);
 			create_task_21(k, i);
 		}
 
+		starpu_mpi_cache_flush(MPI_COMM_WORLD, STARPU_PLU(get_block_handle)(k,k));
+
 		for (i = k+1; i<nblocks; i++)
 		{
 			for (j = k+1; j<nblocks; j++)

+ 3 - 2
mpi/examples/stencil/stencil5.c

@@ -165,11 +165,12 @@ int main(int argc, char **argv)
 		{
 			int mpi_rank = my_distrib2(x, y, size);
 			if (!data_handles[x][y] && (mpi_rank == my_rank
-				 || my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
-				 || my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size)))
+				 || my_rank == my_distrib2(x+1, y, size) || my_rank == my_distrib2(x-1, y, size)
+				 || my_rank == my_distrib2(x, y+1, size) || my_rank == my_distrib2(x, y-1, size)))
 			{
 				/* Register newly-needed data */
 				starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
+				starpu_mpi_data_register(data_handles[x][y], (y*X)+x, mpi_rank);
 			}
 			if (data_handles[x][y] && mpi_rank != starpu_data_get_rank(data_handles[x][y]))
 			{

+ 1 - 1
mpi/src/starpu_mpi_task_insert.c

@@ -494,7 +494,7 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru
 	{
 		/* Get the number of buffers and the size of the arguments */
 		va_copy(varg_list_copy, varg_list);
-		arg_buffer_size = _starpu_task_insert_get_arg_size(varg_list_copy);
+		_starpu_task_insert_get_args_size(varg_list_copy, NULL, &arg_buffer_size);
 		va_end(varg_list_copy);
 
 		/* Pack arguments if needed */

+ 36 - 8
src/common/fxt.h

@@ -109,6 +109,7 @@
 #define _STARPU_FUT_THREAD_EVENT	0x513d
 
 #define	_STARPU_FUT_CODELET_DETAILS	0x513e
+#define	_STARPU_FUT_CODELET_DATA	0x513f
 
 #define _STARPU_FUT_LOCKING_MUTEX	0x5140	
 #define _STARPU_FUT_MUTEX_LOCKED	0x5141	
@@ -165,6 +166,9 @@
 #define _STARPU_FUT_WORKER_SCHEDULING_PUSH	0x5166
 #define _STARPU_FUT_WORKER_SCHEDULING_POP	0x5167
 
+#define	_STARPU_FUT_START_EXECUTING	0x5168
+#define	_STARPU_FUT_END_EXECUTING	0x5169
+
 #ifdef STARPU_USE_FXT
 #include <fxt/fxt.h>
 #include <fxt/fut.h>
@@ -409,31 +413,53 @@ do {									\
 #define _STARPU_TRACE_WORKER_INIT_END(workerid)				\
 	FUT_DO_PROBE2(_STARPU_FUT_WORKER_INIT_END, _starpu_gettid(), (workerid));
 
-#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, archtype)				\
+#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, archtype, workerid)				\
 do {									\
         const char *model_name = _starpu_job_get_model_name((job));         \
 	if (model_name)                                                 \
 	{								\
 		/* we include the symbol name */			\
-		_STARPU_FUT_DO_PROBE4STR(_STARPU_FUT_START_CODELET_BODY, (job), ((job)->task)->sched_ctx, _starpu_gettid(), 1, model_name); \
+		_STARPU_FUT_DO_PROBE4STR(_STARPU_FUT_START_CODELET_BODY, (job), ((job)->task)->sched_ctx, workerid, 1, model_name); \
 	}								\
 	else {                                                          \
-		FUT_DO_PROBE4(_STARPU_FUT_START_CODELET_BODY, (job), ((job)->task)->sched_ctx, _starpu_gettid(), 0); \
+		FUT_DO_PROBE4(_STARPU_FUT_START_CODELET_BODY, (job), ((job)->task)->sched_ctx, workerid, 0); \
 	}								\
 	{								\
+		if ((job)->task->cl)					\
+		{							\
+			const int __nbuffers = STARPU_TASK_GET_NBUFFERS((job)->task);	\
+			char __buf[FXT_MAX_PARAMS*sizeof(long)];	\
+			int __i;					\
+			for (__i = 0; __i < __nbuffers; __i++)		\
+			{						\
+				starpu_data_handle_t __handle = STARPU_TASK_GET_HANDLE((job)->task, __i);	\
+				void *__interface = _STARPU_TASK_GET_INTERFACES((job)->task)[__i];	\
+				if (__handle->ops->describe)		\
+				{					\
+					__handle->ops->describe(__interface, __buf, sizeof(__buf));	\
+					_STARPU_FUT_DO_PROBE1STR(_STARPU_FUT_CODELET_DATA, workerid, __buf);	\
+				}					\
+			}						\
+		}							\
 		const size_t __job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));	\
 		const uint32_t __job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));\
-		FUT_DO_PROBE6(_STARPU_FUT_CODELET_DETAILS, (job), ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->tag_id, _starpu_gettid());	\
+		FUT_DO_PROBE6(_STARPU_FUT_CODELET_DETAILS, (job), ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->tag_id, workerid);	\
 	}								\
 } while(0);
 
-#define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, archtype)			\
+#define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, archtype, workerid)			\
 do {									\
 	const size_t job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));	\
 	const uint32_t job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));\
-	FUT_DO_PROBE7(_STARPU_FUT_END_CODELET_BODY, (job), (job_size), (job_hash), (archtype)->type, (archtype)->devid, (archtype)->ncore, _starpu_gettid());	\
+	FUT_DO_PROBE7(_STARPU_FUT_END_CODELET_BODY, (job), (job_size), (job_hash), (archtype)->type, (archtype)->devid, (archtype)->ncore, workerid);	\
 } while(0);
 
+#define _STARPU_TRACE_START_EXECUTING()				\
+	FUT_DO_PROBE1(_STARPU_FUT_START_EXECUTING, _starpu_gettid());
+
+#define _STARPU_TRACE_END_EXECUTING()				\
+	FUT_DO_PROBE1(_STARPU_FUT_END_EXECUTING, _starpu_gettid());
+
 #define _STARPU_TRACE_START_CALLBACK(job)	\
 	FUT_DO_PROBE2(_STARPU_FUT_START_CALLBACK, job, _starpu_gettid());
 
@@ -791,8 +817,10 @@ do {										\
 #define _STARPU_TRACE_NEW_MEM_NODE(nodeid)	do {} while(0)
 #define _STARPU_TRACE_WORKER_INIT_START(a,b,c)	do {} while(0)
 #define _STARPU_TRACE_WORKER_INIT_END(workerid)	do {} while(0)
-#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, archtype)	do {} while(0)
-#define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, a)	do {} while(0)
+#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, archtype, workerid)	do {} while(0)
+#define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, a, workerid)	do {} while(0)
+#define _STARPU_TRACE_START_EXECUTING()	do {} while(0)
+#define _STARPU_TRACE_END_EXECUTING()	do {} while(0)
 #define _STARPU_TRACE_START_CALLBACK(job)	do {} while(0)
 #define _STARPU_TRACE_END_CALLBACK(job)		do {} while(0)
 #define _STARPU_TRACE_JOB_PUSH(task, prio)	do {} while(0)

+ 92 - 8
src/common/thread.c

@@ -23,6 +23,21 @@
 #include <xbt/synchro_core.h>
 #endif
 
+#if defined(STARPU_LINUX_SYS) && defined(STARPU_HAVE_XCHG)
+#include <linux/futex.h>
+#include <sys/syscall.h>
+
+/* Private futexes are not so old, cope with old kernels.  */
+#ifdef FUTEX_WAIT_PRIVATE
+static int _starpu_futex_wait = FUTEX_WAIT_PRIVATE;
+static int _starpu_futex_wake = FUTEX_WAKE_PRIVATE;
+#else
+static int _starpu_futex_wait = FUTEX_WAIT;
+static int _starpu_futex_wake = FUTEX_WAKE;
+#endif
+
+#endif
+
 #ifdef STARPU_SIMGRID
 
 extern int _starpu_simgrid_thread_start(int argc, char *argv[]);
@@ -490,15 +505,15 @@ int starpu_pthread_barrier_wait(starpu_pthread_barrier_t *barrier)
 }
 #endif /* STARPU_SIMGRID, _MSC_VER, STARPU_HAVE_PTHREAD_BARRIER */
 
-#if defined(STARPU_SIMGRID) || !defined(HAVE_PTHREAD_SPIN_LOCK)
+#if defined(STARPU_SIMGRID) || (defined(STARPU_LINUX_SYS) && defined(STARPU_HAVE_XCHG)) || !defined(HAVE_PTHREAD_SPIN_LOCK)
 
-int starpu_pthread_spin_init(starpu_pthread_spinlock_t *lock, int pshared)
+int starpu_pthread_spin_init(starpu_pthread_spinlock_t *lock, int pshared STARPU_ATTRIBUTE_UNUSED)
 {
 	lock->taken = 0;
 	return 0;
 }
 
-int starpu_pthread_spin_destroy(starpu_pthread_spinlock_t *lock)
+int starpu_pthread_spin_destroy(starpu_pthread_spinlock_t *lock STARPU_ATTRIBUTE_UNUSED)
 {
 	/* we don't do anything */
 	return 0;
@@ -519,7 +534,53 @@ int starpu_pthread_spin_lock(starpu_pthread_spinlock_t *lock)
 		MSG_process_sleep(0.000001);
 		STARPU_UYIELD();
 	}
-#else
+#elif defined(STARPU_LINUX_SYS) && defined(STARPU_HAVE_XCHG)
+	if (STARPU_VAL_COMPARE_AND_SWAP(&lock->taken, 0, 1) == 0)
+		/* Got it on first try! */
+		return 0;
+
+	/* Busy, spin a bit.  */
+	unsigned i;
+	for (i = 0; i < 128; i++)
+	{
+		/* Pause a bit before retrying */
+		STARPU_UYIELD();
+		/* And synchronize with other threads */
+		STARPU_SYNCHRONIZE();
+		if (!lock->taken)
+			/* Holder released it, try again */
+			if (STARPU_VAL_COMPARE_AND_SWAP(&lock->taken, 0, 1) == 0)
+				/* Got it! */
+				return 0;
+	}
+
+	/* We have spent enough time with spinning, let's block */
+	while (1)
+	{
+		/* Tell releaser to wake us */
+		unsigned prev = starpu_xchg(&lock->taken, 2);
+		if (prev == 0)
+			/* Ah, it just got released and we actually acquired
+			 * it!
+			 * Note: the sad thing is that we have just written 2,
+			 * so will spuriously try to wake a thread on unlock,
+			 * but we can not avoid it since we do not know whether
+			 * there are other threads sleeping or not.
+			 */
+			return 0;
+
+		/* Now start sleeping (unless it was released in between)
+		 * We are sure to get woken because either
+		 * - some thread has not released the lock yet, and lock->taken
+		 *   is 2, so it will wake us.
+		 * - some other thread started blocking, and will set
+		 *   lock->taken back to 2
+		 */
+		if (syscall(SYS_futex, &lock->taken, _starpu_futex_wait, 2, NULL, NULL, 0))
+			if (errno == ENOSYS)
+				_starpu_futex_wait = FUTEX_WAIT;
+	}
+#else /* !SIMGRID && !LINUX */
 	uint32_t prev;
 	do
 	{
@@ -539,7 +600,11 @@ int starpu_pthread_spin_trylock(starpu_pthread_spinlock_t *lock)
 		return EBUSY;
 	lock->taken = 1;
 	return 0;
-#else
+#elif defined(STARPU_LINUX_SYS) && defined(STARPU_HAVE_XCHG)
+	unsigned prev;
+	prev = STARPU_VAL_COMPARE_AND_SWAP(&lock->taken, 0, 1);
+	return (prev == 0)?0:EBUSY;
+#else /* !SIMGRID && !LINUX */
 	uint32_t prev;
 	prev = STARPU_TEST_AND_SET(&lock->taken, 1);
 	return (prev == 0)?0:EBUSY;
@@ -550,11 +615,27 @@ int starpu_pthread_spin_unlock(starpu_pthread_spinlock_t *lock)
 {
 #ifdef STARPU_SIMGRID
 	lock->taken = 0;
-	return 0;
-#else
+#elif defined(STARPU_LINUX_SYS) && defined(STARPU_HAVE_XCHG)
+	STARPU_ASSERT(lock->taken != 0);
+	unsigned next = STARPU_ATOMIC_ADD(&lock->taken, -1);
+	if (next == 0)
+		/* Nobody to wake, we are done */
+		return 0;
+
+	/*
+	 * Somebody to wake. Clear 'taken' and wake him.
+	 * Note that he may not be sleeping yet, but if he is not, we won't
+	 * since the value of 'taken' will have changed.
+	 */
+	lock->taken = 0;
+	STARPU_SYNCHRONIZE();
+	if (syscall(SYS_futex, &lock->taken, _starpu_futex_wake, 1, NULL, NULL, 0))
+		if (errno == ENOSYS)
+			_starpu_futex_wake = FUTEX_WAKE;
+#else /* !SIMGRID && !LINUX */
 	STARPU_RELEASE(&lock->taken);
-	return 0;
 #endif
+	return 0;
 }
 
 #endif /* defined(STARPU_SIMGRID) || !defined(HAVE_PTHREAD_SPIN_LOCK) */
@@ -564,6 +645,9 @@ int _starpu_pthread_spin_checklocked(starpu_pthread_spinlock_t *lock)
 #ifdef STARPU_SIMGRID
 	STARPU_ASSERT(lock->taken);
 	return !lock->taken;
+#elif defined(STARPU_LINUX_SYS) && defined(STARPU_HAVE_XCHG)
+	STARPU_ASSERT(lock->taken == 1 || lock->taken == 2);
+	return lock->taken == 0;
 #elif defined(HAVE_PTHREAD_SPIN_LOCK)
 	int ret = pthread_spin_trylock((pthread_spinlock_t *)lock);
 	STARPU_ASSERT(ret != 0);

+ 6 - 6
src/core/dependencies/data_concurrency.c

@@ -209,7 +209,7 @@ static unsigned _submit_job_enforce_data_deps(struct _starpu_job *j, unsigned st
 {
 	unsigned buf;
 
-	unsigned nbuffers = j->task->cl->nbuffers;
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(j->task);
 	for (buf = start_buffer_index; buf < nbuffers; buf++)
 	{
 		if (buf)
@@ -241,18 +241,18 @@ unsigned _starpu_submit_job_enforce_data_deps(struct _starpu_job *j)
 {
 	struct starpu_codelet *cl = j->task->cl;
 
-	if ((cl == NULL) || (cl->nbuffers == 0))
+	if ((cl == NULL) || (STARPU_TASK_GET_NBUFFERS(j->task) == 0))
 		return 0;
 
 	/* Compute an ordered list of the different pieces of data so that we
 	 * grab then according to a total order, thus avoiding a deadlock
 	 * condition */
 	unsigned i;
-	for (i=0 ; i<cl->nbuffers ; i++)
+	for (i=0 ; i<STARPU_TASK_GET_NBUFFERS(j->task); i++)
 	{
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(j->task, i);
 		_STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(j, handle, i);
-		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(j->task->cl, i);
+		enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(j->task, i);
 		_STARPU_JOB_SET_ORDERED_BUFFER_MODE(j, mode, i);
 		int node = -1;
 		if (j->task->cl->specific_nodes)
@@ -260,7 +260,7 @@ unsigned _starpu_submit_job_enforce_data_deps(struct _starpu_job *j)
 		_STARPU_JOB_SET_ORDERED_BUFFER_NODE(j, node, i);
 	}
 
-	_starpu_sort_task_handles(_STARPU_JOB_GET_ORDERED_BUFFERS(j), cl->nbuffers);
+	_starpu_sort_task_handles(_STARPU_JOB_GET_ORDERED_BUFFERS(j), STARPU_TASK_GET_NBUFFERS(j->task));
 
 	return _submit_job_enforce_data_deps(j, 0);
 }
@@ -268,7 +268,7 @@ unsigned _starpu_submit_job_enforce_data_deps(struct _starpu_job *j)
 static unsigned unlock_one_requester(struct _starpu_data_requester *r)
 {
 	struct _starpu_job *j = r->j;
-	unsigned nbuffers = j->task->cl->nbuffers;
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(j->task);
 	unsigned buffer_index = r->buffer_index;
 
 	if (buffer_index + 1 < nbuffers)

+ 67 - 80
src/core/dependencies/implicit_data_deps.c

@@ -47,13 +47,14 @@ static void _starpu_add_dependency(starpu_data_handle_t handle STARPU_ATTRIBUTE_
 }
 
 /* Add pre_sync_task as new accessor among the existing ones, making it depend on the last synchronization task if any.  */
-static void _starpu_add_accessor(starpu_data_handle_t handle, struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task)
+static void _starpu_add_accessor(starpu_data_handle_t handle, struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task, struct _starpu_task_wrapper_dlist *post_sync_task_dependency_slot)
 {
 	/* Add this task to the list of readers */
-	struct _starpu_task_wrapper_list *link = (struct _starpu_task_wrapper_list *) malloc(sizeof(struct _starpu_task_wrapper_list));
-	link->task = post_sync_task;
-	link->next = handle->last_submitted_accessors;
-	handle->last_submitted_accessors = link;
+	post_sync_task_dependency_slot->task = post_sync_task;
+	post_sync_task_dependency_slot->next = handle->last_submitted_accessors.next;
+	post_sync_task_dependency_slot->prev = &handle->last_submitted_accessors;
+	post_sync_task_dependency_slot->next->prev = post_sync_task_dependency_slot;
+	handle->last_submitted_accessors.next = post_sync_task_dependency_slot;
 
 	/* This task depends on the previous synchronization task if any */
 	if (handle->last_sync_task && handle->last_sync_task != post_sync_task)
@@ -103,9 +104,9 @@ static void _starpu_add_sync_task(starpu_data_handle_t handle, struct starpu_tas
 {
 	/* Count the existing accessors */
 	unsigned naccessors = 0;
-	struct _starpu_task_wrapper_list *l;
-	l = handle->last_submitted_accessors;
-	while (l)
+	struct _starpu_task_wrapper_dlist *l;
+	l = handle->last_submitted_accessors.next;
+	while (l != &handle->last_submitted_accessors)
 	{
 		if (l->task != post_sync_task)
 			naccessors++;
@@ -118,8 +119,8 @@ static void _starpu_add_sync_task(starpu_data_handle_t handle, struct starpu_tas
 		/* Put all tasks in the list into task_array */
 		struct starpu_task *task_array[naccessors];
 		unsigned i = 0;
-		l = handle->last_submitted_accessors;
-		while (l)
+		l = handle->last_submitted_accessors.next;
+		while (l != &handle->last_submitted_accessors)
 		{
 			STARPU_ASSERT(l->task);
 			if (l->task != post_sync_task)
@@ -129,9 +130,10 @@ static void _starpu_add_sync_task(starpu_data_handle_t handle, struct starpu_tas
 				_STARPU_DEP_DEBUG("dep %p -> %p\n", l->task, pre_sync_task);
 			}
 
-			struct _starpu_task_wrapper_list *prev = l;
+			struct _starpu_task_wrapper_dlist *prev = l;
 			l = l->next;
-			free(prev);
+			prev->next = NULL;
+			prev->prev = NULL;
 		}
 		_starpu_task_declare_deps_array(pre_sync_task, naccessors, task_array, 0);
 	}
@@ -156,7 +158,8 @@ static void _starpu_add_sync_task(starpu_data_handle_t handle, struct starpu_tas
 		handle->last_submitted_ghost_accessors_id = NULL;
 	}
 
-	handle->last_submitted_accessors = NULL;
+	handle->last_submitted_accessors.next = &handle->last_submitted_accessors;
+	handle->last_submitted_accessors.prev = &handle->last_submitted_accessors;
 	handle->last_sync_task = post_sync_task;
 
 	if (!post_sync_task->cl) {
@@ -177,7 +180,7 @@ static void _starpu_add_sync_task(starpu_data_handle_t handle, struct starpu_tas
  * */
 /* NB : handle->sequential_consistency_mutex must be hold by the caller;
  * returns a task, to be submitted after releasing that mutex. */
-struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task,
+struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task, struct _starpu_task_wrapper_dlist *post_sync_task_dependency_slot,
 						   starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 {
 	struct starpu_task *task = NULL;
@@ -228,15 +231,16 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 		{
 			_STARPU_DEP_DEBUG("concurrently\n");
 			/* Can access concurrently with current tasks */
-			_starpu_add_accessor(handle, pre_sync_task, post_sync_task);
+			_starpu_add_accessor(handle, pre_sync_task, post_sync_task, post_sync_task_dependency_slot);
 		}
 		else
 		{
 			/* Can not access concurrently, have to wait for existing accessors */
-			struct _starpu_task_wrapper_list *l = handle->last_submitted_accessors;
+			struct _starpu_task_wrapper_dlist *l = handle->last_submitted_accessors.next;
 			_STARPU_DEP_DEBUG("dependency\n");
 
-			if ((l && l->next) || (handle->last_submitted_ghost_accessors_id && handle->last_submitted_ghost_accessors_id->next))
+			if ((l != &handle->last_submitted_accessors && l->next != &handle->last_submitted_accessors)
+					|| (handle->last_submitted_ghost_accessors_id && handle->last_submitted_ghost_accessors_id->next))
 			{
 				/* Several previous accessors */
 
@@ -261,7 +265,7 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 					/* Make this task wait for the previous ones */
 					_starpu_add_sync_task(handle, sync_task, sync_task);
 					/* And the requested task wait for this one */
-					_starpu_add_accessor(handle, pre_sync_task, post_sync_task);
+					_starpu_add_accessor(handle, pre_sync_task, post_sync_task, post_sync_task_dependency_slot);
 
 					task = sync_task;
 				}
@@ -270,11 +274,13 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 			{
 				/* One previous accessor, make it the sync
 				 * task, and start depending on it. */
-				if (l)
+				if (l != &handle->last_submitted_accessors)
 				{
 					handle->last_sync_task = l->task;
-					handle->last_submitted_accessors = NULL;
-					free(l);
+					l->next = NULL;
+					l->prev = NULL;
+					handle->last_submitted_accessors.next = &handle->last_submitted_accessors;
+					handle->last_submitted_accessors.prev = &handle->last_submitted_accessors;
 				}
 				else if (handle->last_submitted_ghost_accessors_id)
 				{
@@ -283,7 +289,7 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 					free(handle->last_submitted_ghost_accessors_id);
 					handle->last_submitted_ghost_accessors_id = NULL;
 				}
-				_starpu_add_accessor(handle, pre_sync_task, post_sync_task);
+				_starpu_add_accessor(handle, pre_sync_task, post_sync_task, post_sync_task_dependency_slot);
 			}
 		}
 		handle->last_submitted_mode = mode;
@@ -307,13 +313,14 @@ void _starpu_detect_implicit_data_deps(struct starpu_task *task)
 	if (j->reduction_task)
 		return;
 
-	unsigned nbuffers = task->cl->nbuffers;
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
+	struct _starpu_task_wrapper_dlist *dep_slots = _STARPU_JOB_GET_DEP_SLOTS(j);
 
 	unsigned buffer;
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, buffer);
-		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, buffer);
+		enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, buffer);
 		struct starpu_task *new_task;
 
 		/* Scratch memory does not introduce any deps */
@@ -321,7 +328,7 @@ void _starpu_detect_implicit_data_deps(struct starpu_task *task)
 			continue;
 
 		STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
-		new_task = _starpu_detect_implicit_data_deps_with_handle(task, task, handle, mode);
+		new_task = _starpu_detect_implicit_data_deps_with_handle(task, task, &dep_slots[buffer], handle, mode);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 		if (new_task)
 		{
@@ -341,7 +348,7 @@ void _starpu_detect_implicit_data_deps(struct starpu_task *task)
  * if h is submitted after the termination of f or g, StarPU will not create a
  * dependency as this is not needed anymore. */
 /* the sequential_consistency_mutex of the handle has to be already held */
-void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *task, starpu_data_handle_t handle)
+void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *task, struct _starpu_task_wrapper_dlist *task_dependency_slot, starpu_data_handle_t handle)
 {
 	STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 
@@ -365,63 +372,35 @@ void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *tas
 			}
 		}
 
-		/* XXX can a task be both the last writer associated to a data
-		 * and be in its list of readers ? If not, we should not go
-		 * through the entire list once we have detected it was the
-		 * last writer. */
-
 		/* Same if this is one of the readers: we go through the list
 		 * of readers and remove the task if it is found. */
-		struct _starpu_task_wrapper_list *l;
-		l = handle->last_submitted_accessors;
-		struct _starpu_task_wrapper_list *prev = NULL;
-#ifdef STARPU_DEVEL
-#warning TODO: use double-linked list to make finding ourself fast
-#endif
-		while (l)
+		if (task_dependency_slot && task_dependency_slot->next)
 		{
-			struct _starpu_task_wrapper_list *next = l->next;
-
-			if (l->task == task)
-			{
-				/* If we found the task in the reader list */
-				free(l);
+#ifdef STARPU_DEBUG
+			/* Make sure we are removing ourself from the proper handle */
+			struct _starpu_task_wrapper_dlist *l;
+			for (l = task_dependency_slot->prev; l->task; l = l->prev)
+				;
+			STARPU_ASSERT(l == &handle->last_submitted_accessors);
+			for (l = task_dependency_slot->next; l->task; l = l->next)
+				;
+			STARPU_ASSERT(l == &handle->last_submitted_accessors);
+#endif
 
+			task_dependency_slot->next->prev = task_dependency_slot->prev;
+			task_dependency_slot->prev->next = task_dependency_slot->next;
 #ifndef STARPU_USE_FXT
-				if (_starpu_bound_recording)
+			if (_starpu_bound_recording)
 #endif
-				{
-					/* Save the job id of the reader task in the ghost reader linked list list */
-					struct _starpu_job *ghost_reader_job = _starpu_get_job_associated_to_task(task);
-					struct _starpu_jobid_list *link = (struct _starpu_jobid_list *) malloc(sizeof(struct _starpu_jobid_list));
-					STARPU_ASSERT(link);
-					link->next = handle->last_submitted_ghost_accessors_id;
-					link->id = ghost_reader_job->job_id;
-					handle->last_submitted_ghost_accessors_id = link;
-				}
-
-				if (prev)
-				{
-					prev->next = next;
-				}
-				else
-				{
-					/* This is the first element of the list */
-					handle->last_submitted_accessors = next;
-				}
-
-				/* XXX can we really find the same task again
-				 * once we have found it ? Otherwise, we should
-				 * avoid going through the entire list and stop
-				 * as soon as we find the task. TODO: check how
-				 * duplicate dependencies are treated. */
-			}
-			else
 			{
-				prev = l;
+				/* Save the job id of the reader task in the ghost reader linked list list */
+				struct _starpu_job *ghost_reader_job = _starpu_get_job_associated_to_task(task);
+				struct _starpu_jobid_list *link = (struct _starpu_jobid_list *) malloc(sizeof(struct _starpu_jobid_list));
+				STARPU_ASSERT(link);
+				link->next = handle->last_submitted_ghost_accessors_id;
+				link->id = ghost_reader_job->job_id;
+				handle->last_submitted_ghost_accessors_id = link;
 			}
-
-			l = next;
 		}
 	}
 
@@ -434,13 +413,22 @@ void _starpu_release_task_enforce_sequential_consistency(struct _starpu_job *j)
 {
 	struct starpu_task *task = j->task;
         struct _starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
+	struct _starpu_task_wrapper_dlist *slots = _STARPU_JOB_GET_DEP_SLOTS(j);
 
 	if (!task->cl)
 		return;
 
-        unsigned nbuffers = task->cl->nbuffers;
-
+        unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 	unsigned index;
+
+	/* Release all implicit dependencies */
+	for (index = 0; index < nbuffers; index++)
+	{
+		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, index);
+
+		_starpu_release_data_enforce_sequential_consistency(task, &slots[index], handle);
+	}
+
 	for (index = 0; index < nbuffers; index++)
 	{
 		starpu_data_handle_t handle = descrs[index].handle;
@@ -451,7 +439,6 @@ void _starpu_release_task_enforce_sequential_consistency(struct _starpu_job *j)
 			 * _starpu_compar_handles */
 			continue;
 
-		_starpu_release_data_enforce_sequential_consistency(task, handle);
 		/* Release the reference acquired in _starpu_push_task_output */
 		_starpu_spin_lock(&handle->header_lock);
 		STARPU_ASSERT(handle->busy_count > 0);
@@ -512,7 +499,7 @@ void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle)
 		while (link)
 		{
 			/* There is no need to depend on that task now, since it was already unlocked */
-			_starpu_release_data_enforce_sequential_consistency(link->task, handle);
+			_starpu_release_data_enforce_sequential_consistency(link->task, &_starpu_get_job_associated_to_task(link->task)->implicit_dep_slot, handle);
 
 			int ret = _starpu_task_submit_internally(link->task);
 			STARPU_ASSERT(!ret);
@@ -540,7 +527,7 @@ int _starpu_data_wait_until_available(starpu_data_handle_t handle, enum starpu_d
 
 		/* It is not really a RW access, but we want to make sure that
 		 * all previous accesses are done */
-		new_task = _starpu_detect_implicit_data_deps_with_handle(sync_task, sync_task, handle, mode);
+		new_task = _starpu_detect_implicit_data_deps_with_handle(sync_task, sync_task, &_starpu_get_job_associated_to_task(sync_task)->implicit_dep_slot, handle, mode);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 
 		if (new_task)

+ 2 - 2
src/core/dependencies/implicit_data_deps.h

@@ -21,10 +21,10 @@
 #include <starpu.h>
 #include <common/config.h>
 
-struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task,
+struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task, struct _starpu_task_wrapper_dlist *post_sync_task_dependency_slot,
 						   starpu_data_handle_t handle, enum starpu_data_access_mode mode);
 void _starpu_detect_implicit_data_deps(struct starpu_task *task);
-void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *task, starpu_data_handle_t handle);
+void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *task, struct _starpu_task_wrapper_dlist *task_dependency_slot, starpu_data_handle_t handle);
 void _starpu_release_task_enforce_sequential_consistency(struct _starpu_job *j);
 
 void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data_handle_t handle);

+ 25 - 14
src/core/jobs.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  INRIA
  *
@@ -53,7 +53,10 @@ struct _starpu_job* STARPU_ATTRIBUTE_MALLOC _starpu_job_create(struct starpu_tas
 	memset(job, 0, sizeof(*job));
 
 	if (task->dyn_handles)
-	     job->dyn_ordered_buffers = malloc(task->cl->nbuffers * sizeof(job->dyn_ordered_buffers[0]));
+	{
+	     job->dyn_ordered_buffers = malloc(STARPU_TASK_GET_NBUFFERS(task) * sizeof(job->dyn_ordered_buffers[0]));
+	     job->dyn_dep_slots = malloc(STARPU_TASK_GET_NBUFFERS(task) * sizeof(job->dyn_dep_slots[0]));
+	}
 
 	job->task = task;
 
@@ -109,8 +112,13 @@ void _starpu_job_destroy(struct _starpu_job *j)
 	_starpu_cg_list_deinit(&j->job_successors);
 	if (j->dyn_ordered_buffers)
 	{
-	     free(j->dyn_ordered_buffers);
-	     j->dyn_ordered_buffers = NULL;
+		free(j->dyn_ordered_buffers);
+		j->dyn_ordered_buffers = NULL;
+	}
+	if (j->dyn_dep_slots)
+	{
+		free(j->dyn_dep_slots);
+		j->dyn_dep_slots = NULL;
 	}
 
 	_starpu_job_delete(j);
@@ -166,22 +174,25 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 	STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
 
 #ifdef STARPU_USE_SC_HYPERVISOR
-	int workerid = starpu_worker_get_id();
-	int i;
 	size_t data_size = 0;
-	for(i = 0; i < STARPU_NMAXBUFS; i++)
-	{
-		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
-		if (handle != NULL)
-			data_size += _starpu_data_get_size(handle);
-	}
+	int workerid = starpu_worker_get_id();
 #endif //STARPU_USE_SC_HYPERVISOR
 
 	/* We release handle reference count */
 	if (task->cl)
 	{
 		unsigned i;
-		for (i=0; i<task->cl->nbuffers; i++)
+		unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
+#ifdef STARPU_USE_SC_HYPERVISOR
+		for(i = 0; i < nbuffers; i++)
+		{
+			starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
+			if (handle != NULL)
+				data_size += _starpu_data_get_size(handle);
+		}
+#endif //STARPU_USE_SC_HYPERVISOR
+
+		for (i = 0; i < nbuffers; i++)
 		{
 			starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
 			_starpu_spin_lock(&handle->header_lock);
@@ -199,7 +210,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 	 * tasks waiting for us */
 	if (j->implicit_dep_handle) {
 		starpu_data_handle_t handle = j->implicit_dep_handle;
-		_starpu_release_data_enforce_sequential_consistency(j->task, handle);
+		_starpu_release_data_enforce_sequential_consistency(j->task, &j->implicit_dep_slot, handle);
 		/* Release reference taken while setting implicit_dep_handle */
 		_starpu_spin_lock(&handle->header_lock);
 		handle->busy_count--;

+ 16 - 6
src/core/jobs.h

@@ -77,7 +77,9 @@ LIST_TYPE(_starpu_job,
 	 * the task so that we always grab the rw-lock associated to the
 	 * handles in the same order. */
 	struct _starpu_data_descr ordered_buffers[STARPU_NMAXBUFS];
+	struct _starpu_task_wrapper_dlist dep_slots[STARPU_NMAXBUFS];
 	struct _starpu_data_descr *dyn_ordered_buffers;
+	struct _starpu_task_wrapper_dlist *dyn_dep_slots;
 
 	/* If a tag is associated to the job, this points to the internal data
 	 * structure that describes the tag status. */
@@ -91,22 +93,28 @@ LIST_TYPE(_starpu_job,
 	 * the handle for this dependency, so as to remove the task from the
 	 * last_writer/readers */
 	starpu_data_handle_t implicit_dep_handle;
-
-	/* The value of the footprint that identifies the job may be stored in
-	 * this structure. */
-	uint32_t footprint;
-	unsigned footprint_is_computed:1;
+	struct _starpu_task_wrapper_dlist implicit_dep_slot;
 
 	/* Indicates whether the task associated to that job has already been
 	 * submitted to StarPU (1) or not (0) (using starpu_task_submit).
 	 * Becomes and stays 2 when the task is submitted several times.
+	 *
+	 * Protected by j->sync_mutex.
 	 */
 	unsigned submitted:2;
 
 	/* Indicates whether the task associated to this job is terminated or
-	 * not. */
+	 * not.
+	 *
+	 * Protected by j->sync_mutex.
+	 */
 	unsigned terminated:2;
 
+	/* The value of the footprint that identifies the job may be stored in
+	 * this structure. */
+	uint32_t footprint;
+	unsigned footprint_is_computed:1;
+
 	/* Should that task appear in the debug tools ? (eg. the DAG generated
 	 * with dot) */
 	unsigned exclude_from_dag:1;
@@ -193,4 +201,6 @@ int _starpu_push_local_task(struct _starpu_worker *worker, struct starpu_task *t
 #define _STARPU_JOB_SET_ORDERED_BUFFER(job, buffer, i) do { if (job->dyn_ordered_buffers) job->dyn_ordered_buffers[i] = buffer; else job->ordered_buffers[i] = buffer;} while(0)
 #define _STARPU_JOB_GET_ORDERED_BUFFERS(job) (job->dyn_ordered_buffers) ? job->dyn_ordered_buffers : job->ordered_buffers
 
+#define _STARPU_JOB_GET_DEP_SLOTS(job) (((job)->dyn_dep_slots) ? (job)->dyn_dep_slots : (job)->dep_slots)
+
 #endif // __JOBS_H__

+ 14 - 30
src/core/perfmodel/perfmodel.c

@@ -70,19 +70,13 @@ struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid)
 
 static double per_arch_task_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, struct starpu_task *task, unsigned nimpl)
 {
-	double exp = NAN;
 	double (*per_arch_cost_function)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
-	double (*per_arch_cost_model)(struct starpu_data_descr *);
 
 	per_arch_cost_function = model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].cost_function;
-	per_arch_cost_model = model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].cost_model;
 
-	if (per_arch_cost_function)
-		exp = per_arch_cost_function(task, arch, nimpl);
-	else if (per_arch_cost_model)
-		exp = per_arch_cost_model(task->buffers);
+	STARPU_ASSERT_MSG(per_arch_cost_function, "STARPU_PER_ARCH needs per-arch cost_function to be defined");
 
-	return exp;
+	return per_arch_cost_function(task, arch, nimpl);
 }
 
 /*
@@ -118,26 +112,14 @@ static double common_task_expected_perf(struct starpu_perfmodel *model, struct s
 	double exp;
 	double alpha;
 
-	if (model->cost_function)
-	{
-		exp = model->cost_function(task, nimpl);
-		alpha = starpu_worker_get_relative_speedup(arch);
-
-		STARPU_ASSERT(!_STARPU_IS_ZERO(alpha));
+	STARPU_ASSERT_MSG(model->cost_function, "STARPU_COMMON requires common cost_function to be defined");
 
-		return (exp/alpha);
-	}
-	else if (model->cost_model)
-	{
-		exp = model->cost_model(task->buffers);
-		alpha = starpu_worker_get_relative_speedup(arch);
+	exp = model->cost_function(task, nimpl);
+	alpha = starpu_worker_get_relative_speedup(arch);
 
-		STARPU_ASSERT(!_STARPU_IS_ZERO(alpha));
+	STARPU_ASSERT(!_STARPU_IS_ZERO(alpha));
 
-		return (exp/alpha);
-	}
-
-	return NAN;
+	return (exp/alpha);
 }
 
 void _starpu_load_perfmodel(struct starpu_perfmodel *model)
@@ -226,8 +208,9 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 	unsigned i;
 	double sum = 0.0;
 	enum starpu_node_kind node_kind;
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 
-	for (i = 0; i < task->cl->nbuffers; i++)
+	for (i = 0; i < nbuffers; i++)
 	{
 		starpu_data_handle_t handle;
 		struct starpu_task *conversion_task;
@@ -304,7 +287,7 @@ double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned
 /* Data transfer performance modeling */
 double starpu_task_expected_data_transfer_time(unsigned memory_node, struct starpu_task *task)
 {
-	unsigned nbuffers = task->cl->nbuffers;
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 	unsigned buffer;
 
 	double penalty = 0.0;
@@ -312,7 +295,7 @@ double starpu_task_expected_data_transfer_time(unsigned memory_node, struct star
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, buffer);
-		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, buffer);
+		enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, buffer);
 
 		penalty += starpu_data_expected_transfer_time(handle, memory_node, mode);
 	}
@@ -397,10 +380,11 @@ double starpu_task_bundle_expected_data_transfer_time(starpu_task_bundle_t bundl
 		if (task->cl)
 		{
 			unsigned b;
-			for (b = 0; b < task->cl->nbuffers; b++)
+			unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
+			for (b = 0; b < nbuffers; b++)
 			{
 				starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, b);
-				enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, b);
+				enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, b);
 
 				if (!(mode & STARPU_R))
 					continue;

+ 3 - 2
src/core/perfmodel/perfmodel_history.c

@@ -66,7 +66,7 @@ size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, struct starpu_p
 	}
 	else
 	{
-		unsigned nbuffers = task->cl->nbuffers;
+		unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 		size_t size = 0;
 
 		unsigned buffer;
@@ -1420,8 +1420,9 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
 		fprintf(f, "0x%x\t%lu\t%f\t%f\t%f\t%d\t\t", j->footprint, (unsigned long) _starpu_job_get_data_size(model, arch, nimpl, j), measured, task->predicted, task->predicted_transfer, cpuid);
 		unsigned i;
+		unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 
-		for (i = 0; i < task->cl->nbuffers; i++)
+		for (i = 0; i < nbuffers; i++)
 		{
 			starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
 

+ 5 - 5
src/core/perfmodel/perfmodel_print.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011, 2013  Université de Bordeaux 1
- * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -194,7 +194,7 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 	{
 		if (strcmp(arch, "cpu") == 0)
 		{
-			unsigned implid;
+			int implid;
 			struct starpu_perfmodel_arch perf_arch;
 			perf_arch.type = STARPU_CPU_WORKER;
 			perf_arch.devid = 0;
@@ -214,7 +214,7 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 				exit(-1);
 			}
 
-			unsigned implid;
+			int implid;
 			struct starpu_perfmodel_arch perf_arch;
 			perf_arch.type = STARPU_CPU_WORKER;
 			perf_arch.devid = 0;
@@ -227,7 +227,7 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 		if (strcmp(arch, "cuda") == 0)
 		{
 			unsigned devid;
-			unsigned implid;
+			int implid;
 			struct starpu_perfmodel_arch perf_arch;
 			perf_arch.type = STARPU_CUDA_WORKER;
 			perf_arch.ncore = 0;
@@ -251,7 +251,7 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 			perf_arch.type = STARPU_CUDA_WORKER;
 			perf_arch.devid = gpuid;
 			perf_arch.ncore = 0;
-			unsigned implid;
+			int implid;
 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
 				starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
 			return 0;

+ 32 - 11
src/core/sched_ctx.c

@@ -23,13 +23,13 @@ starpu_pthread_rwlock_t changing_ctx_mutex[STARPU_NMAX_SCHED_CTXS];
 
 static starpu_pthread_mutex_t sched_ctx_manag = STARPU_PTHREAD_MUTEX_INITIALIZER;
 static starpu_pthread_mutex_t finished_submit_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
-struct starpu_task stop_submission_task = STARPU_TASK_INITIALIZER;
+static struct starpu_task stop_submission_task = STARPU_TASK_INITIALIZER;
 starpu_pthread_key_t sched_ctx_key;
-unsigned with_hypervisor = 0;
-double hyp_start_sample[STARPU_NMAX_SCHED_CTXS];
-double hyp_start_allow_sample[STARPU_NMAX_SCHED_CTXS];
-double flops[STARPU_NMAX_SCHED_CTXS][STARPU_NMAXWORKERS];
-size_t data_size[STARPU_NMAX_SCHED_CTXS][STARPU_NMAXWORKERS];
+static unsigned with_hypervisor = 0;
+static double hyp_start_sample[STARPU_NMAX_SCHED_CTXS];
+static double hyp_start_allow_sample[STARPU_NMAX_SCHED_CTXS];
+static double flops[STARPU_NMAX_SCHED_CTXS][STARPU_NMAXWORKERS];
+static size_t data_size[STARPU_NMAX_SCHED_CTXS][STARPU_NMAXWORKERS];
 
 static unsigned _starpu_get_first_free_sched_ctx(struct _starpu_machine_config *config);
 static void _starpu_sched_ctx_add_workers_to_master(unsigned sched_ctx_id, int *workerids, int nworkers, int new_master);
@@ -47,7 +47,7 @@ static void _starpu_worker_gets_into_ctx(unsigned sched_ctx_id, struct _starpu_w
 		worker->nsched_ctxs++;
 	}
 	worker->removed_from_ctx[sched_ctx_id] = 0;
-	if(worker->tmp_sched_ctx == sched_ctx_id)
+	if(worker->tmp_sched_ctx == (int) sched_ctx_id)
 		worker->tmp_sched_ctx = -1;
 	return;
 }
@@ -1641,10 +1641,10 @@ unsigned starpu_sched_ctx_worker_is_master_for_child_ctx(int workerid, unsigned
 
 }
 
-void starpu_sched_ctx_revert_task_counters(unsigned sched_ctx_id, double flops)
+void starpu_sched_ctx_revert_task_counters(unsigned sched_ctx_id, double ready_flops)
 {
         _starpu_decrement_nsubmitted_tasks_of_sched_ctx(sched_ctx_id);
-        _starpu_decrement_nready_tasks_of_sched_ctx(sched_ctx_id, flops);
+        _starpu_decrement_nready_tasks_of_sched_ctx(sched_ctx_id, ready_flops);
 }
 
 void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_ctx)
@@ -1788,7 +1788,6 @@ void starpu_sched_ctx_get_available_cpuids(unsigned sched_ctx_id, int **cpuids,
 	(*cpuids) = (int*)malloc(workers->nworkers*sizeof(int));
 	int w = 0;
 
-	struct _starpu_worker *worker = NULL;
 	struct starpu_sched_ctx_iterator it;
 	int workerid;
 	if(workers->init_iterator)
@@ -1814,7 +1813,6 @@ static void _starpu_sched_ctx_wake_these_workers_up(unsigned sched_ctx_id, int *
 
 	int masters[nworkers];
 	int w;
-	struct _starpu_worker *worker = NULL;
 	for(w = 0; w < nworkers; w++)
 	{
 		int workerid = workerids[w];
@@ -1951,3 +1949,26 @@ void starpu_sched_ctx_unbook_workers_for_task(unsigned sched_ctx_id, int master)
 	/* wake up starpu workers */
 	_starpu_sched_ctx_wake_up_workers(sched_ctx_id, master);
 }
+
+int starpu_sched_ctx_get_worker_rank(unsigned sched_ctx_id)
+{
+	int idx = 0;
+	int curr_workerid = starpu_worker_get_id();
+	int worker;
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	struct starpu_worker_collection *workers = sched_ctx->workers;
+
+	struct starpu_sched_ctx_iterator it;
+	if(workers->init_iterator)
+		workers->init_iterator(workers, &it);
+
+	while(workers->has_next(workers, &it))
+	{
+		worker = workers->get_next(workers, &it);
+		if(worker == curr_workerid)
+			return idx;
+		idx++;
+	}
+
+	return -1;
+}

+ 6 - 4
src/core/sched_policy.c

@@ -272,7 +272,8 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 		unsigned node = starpu_worker_get_memory_node(workerid);
 		if (_starpu_task_uses_multiformat_handles(task))
 		{
-			for (i = 0; i < task->cl->nbuffers; i++)
+			unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
+			for (i = 0; i < nbuffers; i++)
 			{
 				struct starpu_task *conversion_task;
 				starpu_data_handle_t handle;
@@ -289,7 +290,7 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 				//_STARPU_DEBUG("Pushing a conversion task\n");
 			}
 
-			for (i = 0; i < task->cl->nbuffers; i++)
+			for (i = 0; i < nbuffers; i++)
 			{
 				starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
 				handle->mf_node = node;
@@ -618,7 +619,7 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 		STARPU_ABORT();
 	}
 
-	STARPU_CODELET_SET_MODE(conversion_task->cl, STARPU_RW, 0);
+	STARPU_TASK_SET_MODE(conversion_task, STARPU_RW, 0);
 	return conversion_task;
 }
 
@@ -841,7 +842,8 @@ pick:
 	 * required conversion tasks.
 	 */
 	unsigned i;
-	for (i = 0; i < task->cl->nbuffers; i++)
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
+	for (i = 0; i < nbuffers; i++)
 	{
 		struct starpu_task *conversion_task;
 		starpu_data_handle_t handle;

+ 25 - 33
src/core/task.c

@@ -262,11 +262,15 @@ int _starpu_submit_job(struct _starpu_job *j)
 		_starpu_compute_buffers_footprint(j->task->cl->model, &arch, 0, j);
 		int i;
 		size_t data_size = 0;
-		for(i = 0; i < STARPU_NMAXBUFS; i++)
+		if (j->task->cl)
 		{
-			starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
-			if (handle != NULL)
-				data_size += _starpu_data_get_size(handle);
+			unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(j->task);
+			for(i = 0; i < nbuffers; i++)
+			{
+				starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
+				if (handle != NULL)
+					data_size += _starpu_data_get_size(handle);
+			}
 		}
 
 		_STARPU_TRACE_HYPERVISOR_BEGIN();
@@ -279,7 +283,8 @@ int _starpu_submit_job(struct _starpu_job *j)
 	if (task->cl)
 	{
 		unsigned i;
-		for (i=0; i<task->cl->nbuffers; i++)
+		unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
+		for (i=0; i<nbuffers; i++)
 		{
 			starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
 			_starpu_spin_lock(&handle->header_lock);
@@ -392,26 +397,9 @@ void _starpu_codelet_check_deprecated_fields(struct starpu_codelet *cl)
 	}
 }
 
-void _starpu_task_check_deprecated_fields(struct starpu_task *task)
+void _starpu_task_check_deprecated_fields(struct starpu_task *task STARPU_ATTRIBUTE_UNUSED)
 {
-	if (task->cl)
-	{
-		unsigned i;
-		for(i=0; i<STARPU_MIN(task->cl->nbuffers, STARPU_NMAXBUFS) ; i++)
-		{
-			if (task->buffers[i].handle && task->handles[i])
-			{
-				_STARPU_DISP("[warning][struct starpu_task] task->buffers[%u] and task->handles[%u] both set. Ignoring task->buffers[%u] ?\n", i, i, i);
-				STARPU_ASSERT(task->buffers[i].mode == task->cl->modes[i]);
-				STARPU_ABORT();
-			}
-			if (task->buffers[i].handle)
-			{
-				task->handles[i] = task->buffers[i].handle;
-				task->cl->modes[i] = task->buffers[i].mode;
-			}
-		}
-	}
+	/* None any more */
 }
 
 /* application should submit new tasks to StarPU through this function */
@@ -455,17 +443,18 @@ int starpu_task_submit(struct starpu_task *task)
 	if (task->cl)
 	{
 		unsigned i;
+		unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 
 		/* Check buffers */
 		if (task->dyn_handles == NULL)
-			STARPU_ASSERT_MSG(task->cl->nbuffers <= STARPU_NMAXBUFS, "Codelet %p has too many buffers (%d vs max %d). Either use --enable-maxbuffers configure option to increase the max, or use dyn_handles instead of handles.", task->cl, task->cl->nbuffers, STARPU_NMAXBUFS);
+			STARPU_ASSERT_MSG(STARPU_TASK_GET_NBUFFERS(task) <= STARPU_NMAXBUFS, "Codelet %p has too many buffers (%d vs max %d). Either use --enable-maxbuffers configure option to increase the max, or use dyn_handles instead of handles.", task->cl, STARPU_TASK_GET_NBUFFERS(task), STARPU_NMAXBUFS);
 
 		if (task->dyn_handles)
 		{
-			task->dyn_interfaces = malloc(task->cl->nbuffers * sizeof(void *));
+			task->dyn_interfaces = malloc(nbuffers * sizeof(void *));
 		}
 
-		for (i = 0; i < task->cl->nbuffers; i++)
+		for (i = 0; i < nbuffers; i++)
 		{
 			starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
 			/* Make sure handles are not partitioned */
@@ -606,11 +595,12 @@ int _starpu_task_submit_nodeps(struct starpu_task *task)
 	{
 		/* This would be done by data dependencies checking */
 		unsigned i;
-		for (i=0 ; i<task->cl->nbuffers ; i++)
+		unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
+		for (i=0 ; i<nbuffers ; i++)
 		{
 			starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(j->task, i);
 			_STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(j, handle, i);
-			enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(j->task->cl, i);
+			enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(j->task, i);
 			_STARPU_JOB_SET_ORDERED_BUFFER_MODE(j, mode, i);
 			int node = -1;
 			if (j->task->cl->specific_nodes)
@@ -645,7 +635,8 @@ int _starpu_task_submit_conversion_task(struct starpu_task *task,
 
 	/* We retain handle reference count */
 	unsigned i;
-	for (i=0; i<task->cl->nbuffers; i++)
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
+	for (i=0; i<nbuffers; i++)
 	{
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
 		_starpu_spin_lock(&handle->header_lock);
@@ -670,11 +661,11 @@ int _starpu_task_submit_conversion_task(struct starpu_task *task,
 	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 	j->submitted = 1;
 	_starpu_increment_nready_tasks_of_sched_ctx(j->task->sched_ctx, j->task->flops);
-	for (i=0 ; i<task->cl->nbuffers ; i++)
+	for (i=0 ; i<nbuffers ; i++)
 	{
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(j->task, i);
 		_STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(j, handle, i);
-		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(j->task->cl, i);
+		enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(j->task, i);
 		_STARPU_JOB_SET_ORDERED_BUFFER_MODE(j, mode, i);
 		int node = -1;
 		if (j->task->cl->specific_nodes)
@@ -905,7 +896,8 @@ int
 _starpu_task_uses_multiformat_handles(struct starpu_task *task)
 {
 	unsigned i;
-	for (i = 0; i < task->cl->nbuffers; i++)
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
+	for (i = 0; i < nbuffers; i++)
 	{
 		if (_starpu_data_is_multiformat_handle(STARPU_TASK_GET_HANDLE(task, i)))
 			return 1;

+ 11 - 1
src/core/workers.c

@@ -863,7 +863,7 @@ int starpu_conf_init(struct starpu_conf *conf)
 #endif
 
 	/* 64MiB by default */
-	conf->trace_buffer_size = 64<<20;
+	conf->trace_buffer_size = starpu_get_env_number_default("STARPU_TRACE_BUFFER_SIZE", 64) << 20;
 	return 0;
 }
 
@@ -1987,3 +1987,13 @@ unsigned starpu_worker_get_sched_ctx_list(int workerid, unsigned **sched_ctxs)
 	return nsched_ctxs;
 }
 
+char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type)
+{
+	if (type == STARPU_CPU_WORKER) return "STARPU_CPU_WORKER";
+	if (type == STARPU_CUDA_WORKER) return "STARPU_CUDA_WORKER";
+	if (type == STARPU_OPENCL_WORKER) return "STARPU_OPENCL_WORKER";
+	if (type == STARPU_MIC_WORKER) return "STARPU_MIC_WORKER";
+	if (type == STARPU_SCC_WORKER) return "STARPU_SCC_WORKER";
+	if (type == STARPU_ANY_WORKER) return "STARPU_ANY_WORKER";
+	return "STARPU_unknown_WORKER";
+}

+ 7 - 7
src/datawizard/coherency.c

@@ -57,7 +57,7 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 	}
 
 	/* we should have found at least one copy ! */
-	STARPU_ASSERT(src_node_mask != 0);
+	STARPU_ASSERT_MSG(src_node_mask != 0, "The data for this handle is requested, but this handle does not have a valid value. Perhaps some initialization task is missing?");
 
 	/* Without knowing the size, we won't know the cost */
 	if (!size)
@@ -679,13 +679,13 @@ static void _starpu_set_data_requested_flag_if_needed(starpu_data_handle_t handl
 int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
 {
 	STARPU_ASSERT(!task->prefetched);
-	unsigned nbuffers = task->cl->nbuffers;
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 	unsigned index;
 
 	for (index = 0; index < nbuffers; index++)
 	{
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, index);
-		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, index);
+		enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, index);
 
 		if (mode & (STARPU_SCRATCH|STARPU_REDUX))
 			continue;
@@ -719,14 +719,14 @@ int _starpu_fetch_task_input(struct _starpu_job *j)
 		_starpu_clock_gettime(&task->profiling_info->acquire_data_start_time);
 
 	struct _starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
-	unsigned nbuffers = task->cl->nbuffers;
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 
 	unsigned local_memory_node = _starpu_memory_node_get_local_key();
 
 	int workerid = starpu_worker_get_id();
 
 #ifdef STARPU_USE_FXT
-	unsigned total_size = 0;
+	unsigned long total_size = 0;
 #endif
 
 	unsigned index;
@@ -763,7 +763,7 @@ int _starpu_fetch_task_input(struct _starpu_job *j)
 	for (index = 0; index < nbuffers; index++)
 	{
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, index);
-		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, index);
+		enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, index);
 		int node = descrs[index].node;
 		if (node == -1)
 			node = local_memory_node;
@@ -826,7 +826,7 @@ void _starpu_push_task_output(struct _starpu_job *j)
 		_starpu_clock_gettime(&task->profiling_info->release_data_start_time);
 
         struct _starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
-        unsigned nbuffers = task->cl->nbuffers;
+        unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 
 	int workerid = starpu_worker_get_id();
 	unsigned local_memory_node = _starpu_memory_node_get_local_key();

+ 8 - 1
src/datawizard/coherency.h

@@ -97,6 +97,13 @@ struct _starpu_task_wrapper_list
 	struct _starpu_task_wrapper_list *next;
 };
 
+/* This structure describes a doubly-linked list of task */
+struct _starpu_task_wrapper_dlist {
+	struct starpu_task *task;
+	struct _starpu_task_wrapper_dlist *next;
+	struct _starpu_task_wrapper_dlist *prev;
+};
+
 extern int _starpu_has_not_important_data;
 
 typedef void (*_starpu_data_handle_unregister_hook)(starpu_data_handle_t);
@@ -169,7 +176,7 @@ struct _starpu_data_state
 	 * sequential_consistency flag is enabled. */
 	enum starpu_data_access_mode last_submitted_mode;
 	struct starpu_task *last_sync_task;
-	struct _starpu_task_wrapper_list *last_submitted_accessors;
+	struct _starpu_task_wrapper_dlist last_submitted_accessors;
 
 	/* If FxT is enabled, we keep track of "ghost dependencies": that is to
 	 * say the dependencies that are not needed anymore, but that should

+ 12 - 6
src/datawizard/data_request.c

@@ -424,10 +424,10 @@ int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc, uns
 
 	*pushed = 0;
 
-	/* Here helgrind would should that this is an un protected access.
-	 * We however don't care about missing an entry, we will get called
-	 * again sooner or later. */
-	if (_starpu_data_request_list_empty(data_requests[src_node]))
+	/* This is racy, but not posing problems actually, since we know we
+	 * will come back here to probe again regularly anyway.
+	 * Thus, do not expose this optimization to helgrind */
+	if (!RUNNING_ON_VALGRIND && _starpu_data_request_list_empty(data_requests[src_node]))
 		return 0;
 
 	empty_list = _starpu_data_request_list_new();
@@ -513,7 +513,10 @@ void _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc
 
 	*pushed = 0;
 
-	if (_starpu_data_request_list_empty(prefetch_requests[src_node]))
+	/* This is racy, but not posing problems actually, since we know we
+	 * will come back here to probe again regularly anyway.
+	 * Thus, do not expose this optimization to valgrind */
+	if (!RUNNING_ON_VALGRIND && _starpu_data_request_list_empty(prefetch_requests[src_node]))
 		return;
 
 	empty_list = _starpu_data_request_list_new();
@@ -606,7 +609,10 @@ static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
 	struct _starpu_data_request_list *empty_list;
 	unsigned taken, kept;
 
-	if (_starpu_data_request_list_empty(data_requests_pending[src_node]))
+	/* Here helgrind would should that this is an un protected access.
+	 * We however don't care about missing an entry, we will get called
+	 * again sooner or later. */
+	if (!RUNNING_ON_VALGRIND && _starpu_data_request_list_empty(data_requests_pending[src_node]))
 		return 0;
 
 	empty_list = _starpu_data_request_list_new();

+ 3 - 1
src/datawizard/filters.c

@@ -190,7 +190,9 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 		STARPU_PTHREAD_MUTEX_INIT(&child->sequential_consistency_mutex, NULL);
 		child->last_submitted_mode = STARPU_R;
 		child->last_sync_task = NULL;
-		child->last_submitted_accessors = NULL;
+		child->last_submitted_accessors.task = NULL;
+		child->last_submitted_accessors.next = &child->last_submitted_accessors;
+		child->last_submitted_accessors.prev = &child->last_submitted_accessors;
 		child->post_sync_tasks = NULL;
 		/* Tell helgrind that the race in _starpu_unlock_post_sync_tasks is fine */
 		STARPU_HG_DISABLE_CHECKING(child->post_sync_tasks_cnt);

+ 2 - 1
src/datawizard/footprint.c

@@ -24,8 +24,9 @@ uint32_t starpu_task_data_footprint(struct starpu_task *task)
 {
 	uint32_t footprint = 0;
 	unsigned buffer;
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 
-	for (buffer = 0; buffer < task->cl->nbuffers; buffer++)
+	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, buffer);
 

+ 16 - 3
src/datawizard/interfaces/bcsr_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -44,6 +44,7 @@ static void free_bcsr_buffer_on_node(void *data_interface, unsigned node);
 static size_t bcsr_interface_get_size(starpu_data_handle_t handle);
 static int bcsr_compare(void *data_interface_a, void *data_interface_b);
 static uint32_t footprint_bcsr_interface_crc32(starpu_data_handle_t handle);
+static ssize_t describe(void *data_interface, char *buf, size_t size);
 
 
 struct starpu_data_interface_ops starpu_interface_bcsr_ops =
@@ -56,7 +57,8 @@ struct starpu_data_interface_ops starpu_interface_bcsr_ops =
 	.interfaceid = STARPU_BCSR_INTERFACE_ID,
 	.interface_size = sizeof(struct starpu_bcsr_interface),
 	.footprint = footprint_bcsr_interface_crc32,
-	.compare = bcsr_compare
+	.compare = bcsr_compare,
+	.describe = describe
 };
 
 static void register_bcsr_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface)
@@ -324,3 +326,14 @@ static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_int
 
 	return ret;
 }
+
+static ssize_t describe(void *data_interface, char *buf, size_t size)
+{
+	struct starpu_bcsr_interface *bcsr = (struct starpu_bcsr_interface *) data_interface;
+	return snprintf(buf, size, "b%ux%ux%ux%ux%u",
+			(unsigned) bcsr->nnz,
+			(unsigned) bcsr->nrow,
+			(unsigned) bcsr->r,
+			(unsigned) bcsr->c,
+			(unsigned) bcsr->elemsize);
+}

+ 27 - 3
src/datawizard/interfaces/block_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -77,6 +77,7 @@ static int block_compare(void *data_interface_a, void *data_interface_b);
 static void display_block_interface(starpu_data_handle_t handle, FILE *f);
 static int pack_block_handle(starpu_data_handle_t handle, unsigned node, void **ptr, ssize_t *count);
 static int unpack_block_handle(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count);
+static ssize_t describe(void *data_interface, char *buf, size_t size);
 
 struct starpu_data_interface_ops starpu_interface_block_ops =
 {
@@ -92,7 +93,8 @@ struct starpu_data_interface_ops starpu_interface_block_ops =
 	.interface_size = sizeof(struct starpu_block_interface),
 	.display = display_block_interface,
 	.pack_data = pack_block_handle,
-	.unpack_data = unpack_block_handle
+	.unpack_data = unpack_block_handle,
+	.describe = describe
 };
 
 static void *block_handle_to_pointer(starpu_data_handle_t handle, unsigned node)
@@ -167,6 +169,18 @@ void starpu_block_data_register(starpu_data_handle_t *handleptr, unsigned home_n
 	starpu_data_register(handleptr, home_node, &block_interface, &starpu_interface_block_ops);
 }
 
+void starpu_block_ptr_register(starpu_data_handle_t handle, unsigned node,
+			uintptr_t ptr, uintptr_t dev_handle, size_t offset, uint32_t ldy, uint32_t ldz)
+{
+	struct starpu_block_interface *block_interface = starpu_data_get_interface_on_node(handle, node);
+	starpu_data_ptr_register(handle, node);
+	block_interface->ptr = ptr;
+	block_interface->dev_handle = dev_handle;
+	block_interface->offset = offset;
+	block_interface->ldy = ldy;
+	block_interface->ldz = ldz;
+}
+
 static uint32_t footprint_block_interface_crc32(starpu_data_handle_t handle)
 {
 	uint32_t hash;
@@ -716,3 +730,13 @@ static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_int
 
 	return ret;
 }
+
+static ssize_t describe(void *data_interface, char *buf, size_t size)
+{
+	struct starpu_block_interface *block = (struct starpu_block_interface *) data_interface;
+	return snprintf(buf, size, "B%ux%ux%ux%u",
+			(unsigned) block->nx,
+			(unsigned) block->ny,
+			(unsigned) block->nz,
+			(unsigned) block->elemsize);
+}

+ 12 - 1
src/datawizard/interfaces/coo_interface.c

@@ -190,6 +190,16 @@ display_coo_interface(starpu_data_handle_t handle, FILE *f)
 	fprintf(f, "%u\t%u", coo_interface->nx, coo_interface->ny);
 }
 
+static ssize_t describe(void *data_interface, char *buf, size_t size)
+{
+	struct starpu_coo_interface *coo = (struct starpu_coo_interface *) data_interface;
+	return snprintf(buf, size, "M%ux%ux%ux%u",
+			(unsigned) coo->nx,
+			(unsigned) coo->ny,
+			(unsigned) coo->n_values,
+			(unsigned) coo->elemsize);
+}
+
 struct starpu_data_interface_ops starpu_interface_coo_ops =
 {
 	.register_data_handle  = register_coo_handle,
@@ -202,7 +212,8 @@ struct starpu_data_interface_ops starpu_interface_coo_ops =
 	.compare               = coo_compare,
 	.interfaceid           = STARPU_COO_INTERFACE_ID,
 	.interface_size        = sizeof(struct starpu_coo_interface),
-	.display               = display_coo_interface
+	.display               = display_coo_interface,
+	.describe              = describe
 };
 
 void

+ 13 - 2
src/datawizard/interfaces/csr_interface.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -43,6 +43,7 @@ static void free_csr_buffer_on_node(void *data_interface, unsigned node);
 static size_t csr_interface_get_size(starpu_data_handle_t handle);
 static int csr_compare(void *data_interface_a, void *data_interface_b);
 static uint32_t footprint_csr_interface_crc32(starpu_data_handle_t handle);
+static ssize_t describe(void *data_interface, char *buf, size_t size);
 
 struct starpu_data_interface_ops starpu_interface_csr_ops =
 {
@@ -55,6 +56,7 @@ struct starpu_data_interface_ops starpu_interface_csr_ops =
 	.interface_size = sizeof(struct starpu_csr_interface),
 	.footprint = footprint_csr_interface_crc32,
 	.compare = csr_compare,
+	.describe = describe
 };
 
 static void register_csr_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface)
@@ -290,3 +292,12 @@ static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_int
 
 	return ret;
 }
+
+static ssize_t describe(void *data_interface, char *buf, size_t size)
+{
+	struct starpu_csr_interface *csr = (struct starpu_csr_interface *) data_interface;
+	return snprintf(buf, size, "C%ux%ux%u",
+			(unsigned) csr->nnz,
+			(unsigned) csr->nrow,
+			(unsigned) csr->elemsize);
+}

+ 31 - 3
src/datawizard/interfaces/data_interface.c

@@ -62,6 +62,11 @@ void _starpu_data_interface_shutdown()
 {
 	struct handle_entry *entry, *tmp;
 
+	if (registered_handles)
+	{
+		_STARPU_DISP("[warning] The application has not unregistered all data handles.\n");
+	}
+
 	_starpu_spin_destroy(&registered_handles_lock);
 
 	HASH_ITER(hh, registered_handles, entry, tmp)
@@ -200,7 +205,9 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 	STARPU_PTHREAD_MUTEX_INIT(&handle->sequential_consistency_mutex, NULL);
 	handle->last_submitted_mode = STARPU_R;
 	handle->last_sync_task = NULL;
-	handle->last_submitted_accessors = NULL;
+	handle->last_submitted_accessors.task = NULL;
+	handle->last_submitted_accessors.next = &handle->last_submitted_accessors;
+	handle->last_submitted_accessors.prev = &handle->last_submitted_accessors;
 	handle->post_sync_tasks = NULL;
 
 	/* Tell helgrind that the race in _starpu_unlock_post_sync_tasks is fine */
@@ -295,6 +302,17 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 	}
 }
 
+void starpu_data_ptr_register(starpu_data_handle_t handle, unsigned node)
+{
+	struct _starpu_data_replicate *replicate = &handle->per_node[node];
+
+	_starpu_spin_lock(&handle->header_lock);
+	STARPU_ASSERT_MSG(replicate->allocated == 0, "starpu_data_ptr_register must be called right after starpu_data_register");
+	replicate->allocated = 1;
+	replicate->automatically_allocated = 0;
+	_starpu_spin_unlock(&handle->header_lock);
+}
+
 int _starpu_data_handle_init(starpu_data_handle_t handle, struct starpu_data_interface_ops *interface_ops, unsigned int mf_node)
 {
 	unsigned node;
@@ -406,7 +424,12 @@ int _starpu_data_set_rank(starpu_data_handle_t handle, int rank)
 
 int starpu_data_set_rank(starpu_data_handle_t handle, int rank)
 {
-	_STARPU_DISP("Warning: You should call starpu_mpi_data_register which will insure MPI cache will be cleared when unregistering the data\n");
+	static int first=1;
+	if (first)
+	{
+		_STARPU_DISP("Warning: You should call starpu_mpi_data_register which will insure MPI cache will be cleared when unregistering the data\n");
+		first=0;
+	}
 	return _starpu_data_set_rank(handle, rank);
 }
 
@@ -455,7 +478,12 @@ int _starpu_data_set_tag(starpu_data_handle_t handle, int tag)
 
 int starpu_data_set_tag(starpu_data_handle_t handle, int tag)
 {
-	_STARPU_DISP("Warning: You should call starpu_mpi_data_register which will insure MPI cache will be cleared when unregistering the data\n");
+	static int first=1;
+	if (first)
+	{
+		_STARPU_DISP("Warning: You should call starpu_mpi_data_register which will insure MPI cache will be cleared when unregistering the data\n");
+		first=0;
+	}
 	return _starpu_data_set_tag(handle, tag);
 }
 

+ 25 - 3
src/datawizard/interfaces/matrix_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -90,6 +90,7 @@ static int matrix_compare(void *data_interface_a, void *data_interface_b);
 static void display_matrix_interface(starpu_data_handle_t handle, FILE *f);
 static int pack_matrix_handle(starpu_data_handle_t handle, unsigned node, void **ptr, ssize_t *count);
 static int unpack_matrix_handle(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count);
+static ssize_t describe(void *data_interface, char *buf, size_t size);
 
 struct starpu_data_interface_ops starpu_interface_matrix_ops =
 {
@@ -105,7 +106,8 @@ struct starpu_data_interface_ops starpu_interface_matrix_ops =
 	.interface_size = sizeof(struct starpu_matrix_interface),
 	.display = display_matrix_interface,
 	.pack_data = pack_matrix_handle,
-	.unpack_data = unpack_matrix_handle
+	.unpack_data = unpack_matrix_handle,
+	.describe = describe
 };
 
 static void register_matrix_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface)
@@ -176,6 +178,17 @@ void starpu_matrix_data_register(starpu_data_handle_t *handleptr, unsigned home_
 	starpu_data_register(handleptr, home_node, &matrix_interface, &starpu_interface_matrix_ops);
 }
 
+void starpu_matrix_ptr_register(starpu_data_handle_t handle, unsigned node,
+			uintptr_t ptr, uintptr_t dev_handle, size_t offset, uint32_t ld)
+{
+	struct starpu_matrix_interface *matrix_interface = starpu_data_get_interface_on_node(handle, node);
+	starpu_data_ptr_register(handle, node);
+	matrix_interface->ptr = ptr;
+	matrix_interface->dev_handle = dev_handle;
+	matrix_interface->offset = offset;
+	matrix_interface->ld = ld;
+}
+
 static uint32_t footprint_matrix_interface_crc32(starpu_data_handle_t handle)
 {
 	return starpu_hash_crc32c_be(starpu_matrix_get_nx(handle), starpu_matrix_get_ny(handle));
@@ -665,3 +678,12 @@ static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_int
 
 	return ret;
 }
+
+static ssize_t describe(void *data_interface, char *buf, size_t size)
+{
+	struct starpu_matrix_interface *matrix = (struct starpu_matrix_interface *) data_interface;
+	return snprintf(buf, size, "M%ux%ux%u",
+			(unsigned) matrix->nx,
+			(unsigned) matrix->ny,
+			(unsigned) matrix->elemsize);
+}

+ 21 - 3
src/datawizard/interfaces/variable_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -44,6 +44,7 @@ static int variable_compare(void *data_interface_a, void *data_interface_b);
 static void display_variable_interface(starpu_data_handle_t handle, FILE *f);
 static int pack_variable_handle(starpu_data_handle_t handle, unsigned node, void **ptr, ssize_t *count);
 static int unpack_variable_handle(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count);
+static ssize_t describe(void *data_interface, char *buf, size_t size);
 
 struct starpu_data_interface_ops starpu_interface_variable_ops =
 {
@@ -59,7 +60,8 @@ struct starpu_data_interface_ops starpu_interface_variable_ops =
 	.interface_size = sizeof(struct starpu_variable_interface),
 	.display = display_variable_interface,
 	.pack_data = pack_variable_handle,
-	.unpack_data = unpack_variable_handle
+	.unpack_data = unpack_variable_handle,
+	.describe = describe
 };
 
 static void *variable_handle_to_pointer(starpu_data_handle_t handle, unsigned node)
@@ -117,6 +119,16 @@ void starpu_variable_data_register(starpu_data_handle_t *handleptr, unsigned hom
 	starpu_data_register(handleptr, home_node, &variable, &starpu_interface_variable_ops);
 }
 
+void starpu_variable_ptr_register(starpu_data_handle_t handle, unsigned node,
+			uintptr_t ptr, uintptr_t dev_handle, size_t offset)
+{
+	struct starpu_variable_interface *variable_interface = starpu_data_get_interface_on_node(handle, node);
+	starpu_data_ptr_register(handle, node);
+	variable_interface->ptr = ptr;
+	variable_interface->dev_handle = dev_handle;
+	variable_interface->offset = offset;
+}
+
 
 static uint32_t footprint_variable_interface_crc32(starpu_data_handle_t handle)
 {
@@ -235,3 +247,9 @@ static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_int
 
 	return ret;
 }
+static ssize_t describe(void *data_interface, char *buf, size_t size)
+{
+	struct starpu_variable_interface *variable = (struct starpu_variable_interface *) data_interface;
+	return snprintf(buf, size, "v%u",
+			(unsigned) variable->elemsize);
+}

+ 22 - 3
src/datawizard/interfaces/vector_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -44,6 +44,7 @@ static int vector_compare(void *data_interface_a, void *data_interface_b);
 static void display_vector_interface(starpu_data_handle_t handle, FILE *f);
 static int pack_vector_handle(starpu_data_handle_t handle, unsigned node, void **ptr, ssize_t *count);
 static int unpack_vector_handle(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count);
+static ssize_t describe(void *data_interface, char *buf, size_t size);
 
 struct starpu_data_interface_ops starpu_interface_vector_ops =
 {
@@ -59,7 +60,8 @@ struct starpu_data_interface_ops starpu_interface_vector_ops =
 	.interface_size = sizeof(struct starpu_vector_interface),
 	.display = display_vector_interface,
 	.pack_data = pack_vector_handle,
-	.unpack_data = unpack_vector_handle
+	.unpack_data = unpack_vector_handle,
+	.describe = describe
 };
 
 static void *vector_handle_to_pointer(starpu_data_handle_t handle, unsigned node)
@@ -122,6 +124,16 @@ void starpu_vector_data_register(starpu_data_handle_t *handleptr, unsigned home_
 	starpu_data_register(handleptr, home_node, &vector, &starpu_interface_vector_ops);
 }
 
+void starpu_vector_ptr_register(starpu_data_handle_t handle, unsigned node,
+			uintptr_t ptr, uintptr_t dev_handle, size_t offset)
+{
+	struct starpu_vector_interface *vector_interface = starpu_data_get_interface_on_node(handle, node);
+	starpu_data_ptr_register(handle, node);
+	vector_interface->ptr = ptr;
+	vector_interface->dev_handle = dev_handle;
+	vector_interface->offset = offset;
+}
+
 
 static uint32_t footprint_vector_interface_crc32(starpu_data_handle_t handle)
 {
@@ -273,3 +285,10 @@ static int copy_any_to_any(void *src_interface, unsigned src_node,
 	return ret;
 }
 
+static ssize_t describe(void *data_interface, char *buf, size_t size)
+{
+	struct starpu_vector_interface *vector = (struct starpu_vector_interface *) data_interface;
+	return snprintf(buf, size, "V%ux%u",
+			(unsigned) vector->nx,
+			(unsigned) vector->elemsize);
+}

+ 9 - 2
src/datawizard/interfaces/void_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -41,6 +41,7 @@ static int void_compare(void *data_interface_a, void *data_interface_b);
 static void display_void_interface(starpu_data_handle_t handle, FILE *f);
 static int pack_void_handle(starpu_data_handle_t handle, unsigned node, void **ptr, ssize_t *count);
 static int unpack_void_handle(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count);
+static ssize_t describe(void *data_interface, char *buf, size_t size);
 
 struct starpu_data_interface_ops starpu_interface_void_ops =
 {
@@ -55,7 +56,8 @@ struct starpu_data_interface_ops starpu_interface_void_ops =
 	.interface_size = 0,
 	.display = display_void_interface,
 	.pack_data = pack_void_handle,
-	.unpack_data = unpack_void_handle
+	.unpack_data = unpack_void_handle,
+	.describe = describe
 };
 
 static void register_void_handle(starpu_data_handle_t handle STARPU_ATTRIBUTE_UNUSED,
@@ -137,3 +139,8 @@ static int dummy_copy(void *src_interface STARPU_ATTRIBUTE_UNUSED,
 {
 	return 0;
 }
+
+static ssize_t describe(void *data_interface STARPU_ATTRIBUTE_UNUSED, char *buf, size_t size)
+{
+	return snprintf(buf, size, "0");
+}

+ 3 - 1
src/datawizard/memalloc.c

@@ -983,8 +983,10 @@ int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_
 	replicate->allocated = 1;
 	replicate->automatically_allocated = 1;
 
-	if (dst_node == STARPU_MAIN_RAM)
+	if (replicate->relaxed_coherency == 0 && dst_node == STARPU_MAIN_RAM)
 	{
+		/* We are allocating the buffer in main memory, also register it
+		 * for the gcc plugin.  */
 		void *ptr = starpu_data_handle_to_pointer(handle, STARPU_MAIN_RAM);
 		if (ptr != NULL)
 		{

+ 1 - 0
src/datawizard/memory_nodes.c

@@ -132,6 +132,7 @@ void _starpu_memory_node_get_name(unsigned node, char *name, int size)
 		prefix = "SCC_shared";
 		break;
 	case STARPU_UNUSED:
+		prefix = "unknown";
 		STARPU_ASSERT(0);
 	}
 	snprintf(name, size, "%s %u\n", prefix, descr.devid[node]);

+ 4 - 3
src/datawizard/user_interactions.c

@@ -157,7 +157,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency(starpu_data_handle_t h
 		wrapper->post_sync_task->name = "acquire_cb_post";
 		wrapper->post_sync_task->detach = 1;
 
-		new_task = _starpu_detect_implicit_data_deps_with_handle(wrapper->pre_sync_task, wrapper->post_sync_task, handle, mode);
+		new_task = _starpu_detect_implicit_data_deps_with_handle(wrapper->pre_sync_task, wrapper->post_sync_task, &_starpu_get_job_associated_to_task(wrapper->post_sync_task)->implicit_dep_slot, handle, mode);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 
 		if (new_task)
@@ -277,7 +277,7 @@ int starpu_data_acquire_on_node(starpu_data_handle_t handle, int node, enum star
 		wrapper.post_sync_task->name = "acquire_post";
 		wrapper.post_sync_task->detach = 1;
 
-		new_task = _starpu_detect_implicit_data_deps_with_handle(wrapper.pre_sync_task, wrapper.post_sync_task, handle, mode);
+		new_task = _starpu_detect_implicit_data_deps_with_handle(wrapper.pre_sync_task, wrapper.post_sync_task, &_starpu_get_job_associated_to_task(wrapper.post_sync_task)->implicit_dep_slot, handle, mode);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 		if (new_task)
 		{
@@ -322,7 +322,8 @@ int starpu_data_acquire_on_node(starpu_data_handle_t handle, int node, enum star
 	 * We enqueue the "post" sync task in the list associated to the handle
 	 * so that it is submitted by the starpu_data_release
 	 * function. */
-	_starpu_add_post_sync_tasks(wrapper.post_sync_task, handle);
+	if (sequential_consistency)
+		_starpu_add_post_sync_tasks(wrapper.post_sync_task, handle);
 
         _STARPU_LOG_OUT();
 	return 0;

+ 128 - 51
src/debug/traces/starpu_fxt.c

@@ -46,21 +46,29 @@ static unsigned other_index = 0;
 
 static void set_next_other_worker_color(int workerid)
 {
+	if (workerid >= STARPU_NMAXWORKERS)
+		return;
 	worker_colors[workerid] = other_worker_colors[other_index++];
 }
 
 static void set_next_cpu_worker_color(int workerid)
 {
+	if (workerid >= STARPU_NMAXWORKERS)
+		return;
 	worker_colors[workerid] = cpus_worker_colors[cpus_index++];
 }
 
 static void set_next_cuda_worker_color(int workerid)
 {
+	if (workerid >= STARPU_NMAXWORKERS)
+		return;
 	worker_colors[workerid] = cuda_worker_colors[cuda_index++];
 }
 
 static void set_next_opencl_worker_color(int workerid)
 {
+	if (workerid >= STARPU_NMAXWORKERS)
+		return;
 	worker_colors[workerid] = opencl_worker_colors[opencl_index++];
 }
 
@@ -76,6 +84,8 @@ static void set_next_scc_worker_color(int workerid)
 
 static const char *get_worker_color(int workerid)
 {
+	if (workerid >= STARPU_NMAXWORKERS)
+		workerid = STARPU_NMAXWORKERS - 1;
 	return worker_colors[workerid];
 }
 
@@ -103,6 +113,9 @@ static unsigned get_colour_symbol_blue(char *name)
 static double last_codelet_start[STARPU_NMAXWORKERS];
 /* _STARPU_FUT_DO_PROBE4STR records only 4 longs */
 static char last_codelet_symbol[STARPU_NMAXWORKERS][4*sizeof(unsigned long)];
+static int last_codelet_parameter[STARPU_NMAXWORKERS];
+#define MAX_PARAMETERS 8
+static char last_codelet_parameter_description[STARPU_NMAXWORKERS][MAX_PARAMETERS][FXT_MAX_PARAMS*sizeof(unsigned long)];
 
 /* If more than a period of time has elapsed, we flush the profiling info,
  * otherwise they are accumulated everytime there is a new relevant event. */
@@ -154,7 +167,7 @@ struct worker_entry
 	int workerid;
 } *worker_ids;
 
-static void register_worker_id(unsigned long tid, int workerid)
+static int register_worker_id(unsigned long tid, int workerid)
 {
 	nworkers++;
 	struct worker_entry *entry;
@@ -164,15 +177,15 @@ static void register_worker_id(unsigned long tid, int workerid)
 	STARPU_ASSERT_MSG(workerid < STARPU_NMAXWORKERS, "Too many workers in this trace, please increase in ./configure invocation the maximum number of CPUs and GPUs to the same value as was used for execution");
 
 	/* only register a thread once */
-	//STARPU_ASSERT(entry == NULL);
 	if (entry)
-		return;
+		return 0;
 
 	entry = malloc(sizeof(*entry));
 	entry->tid = tid;
 	entry->workerid = workerid;
 
 	HASH_ADD(hh, worker_ids, tid, sizeof(tid), entry);
+	return 1;
 }
 
 static int find_worker_id(unsigned long tid)
@@ -264,48 +277,59 @@ static void memnode_set_state(double time, const char *prefix, unsigned int memn
 #endif
 }
 
-static void worker_set_state(double time, const char *prefix, long unsigned int workerid, const char *name)
+static void thread_set_state(double time, const char *prefix, long unsigned int threadid, const char *name)
 {
 #ifdef STARPU_HAVE_POTI
 	char container[STARPU_POTI_STR_LEN];
-	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, workerid);
+	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, threadid);
 	poti_SetState(time, container, "S", name);
 #else
-	fprintf(out_paje_file, "10	%.9f	%st%lu	S	%s\n", time, prefix, workerid, name);
+	fprintf(out_paje_file, "10	%.9f	%st%lu	S	%s\n", time, prefix, threadid, name);
 #endif
 }
 
-static void worker_set_detailed_state(double time, const char *prefix, long unsigned int workerid, const char *name, unsigned long size, unsigned long footprint, unsigned long long tag)
+static void worker_set_state(double time, const char *prefix, long unsigned int workerid, const char *name)
 {
 #ifdef STARPU_HAVE_POTI
 	char container[STARPU_POTI_STR_LEN];
-	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, workerid);
-	/* TODO: set detailed state */
-	poti_SetState(time, container, "S", name);
+	worker_container_alias(container, STARPU_POTI_STR_LEN, prefix, workerid);
+	poti_SetState(time, container, "WS", name);
 #else
-	fprintf(out_paje_file, "20	%.9f	%st%lu	S	%s	%lu	%08lx	%016llx\n", time, prefix, workerid, name, size, footprint, tag);
+	fprintf(out_paje_file, "10	%.9f	%sw%lu	WS	%s\n", time, prefix, workerid, name);
 #endif
 }
 
-static void worker_push_state(double time, const char *prefix, long unsigned int workerid, const char *name)
+static void thread_push_state(double time, const char *prefix, long unsigned int threadid, const char *name)
 {
 #ifdef STARPU_HAVE_POTI
 	char container[STARPU_POTI_STR_LEN];
-	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, workerid);
+	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, threadid);
 	poti_PushState(time, container, "S", name);
 #else
-	fprintf(out_paje_file, "11	%.9f	%st%lu	S	%s\n", time, prefix, workerid, name);
+	fprintf(out_paje_file, "11	%.9f	%st%lu	S	%s\n", time, prefix, threadid, name);
 #endif
 }
 
-static void worker_pop_state(double time, const char *prefix, long unsigned int workerid)
+static void thread_pop_state(double time, const char *prefix, long unsigned int threadid)
 {
 #ifdef STARPU_HAVE_POTI
 	char container[STARPU_POTI_STR_LEN];
-	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, workerid);
+	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, threadid);
 	poti_PopState(time, container, "S");
 #else
-	fprintf(out_paje_file, "12	%.9f	%st%lu	S\n", time, prefix, workerid);
+	fprintf(out_paje_file, "12	%.9f	%st%lu	S\n", time, prefix, threadid);
+#endif
+}
+
+static void worker_set_detailed_state(double time, const char *prefix, long unsigned int workerid, const char *name, unsigned long size, const char *parameters, unsigned long footprint, unsigned long long tag)
+{
+#ifdef STARPU_HAVE_POTI
+	char container[STARPU_POTI_STR_LEN];
+	worker_container_alias(container, STARPU_POTI_STR_LEN, prefix, workerid);
+	/* TODO: set detailed state */
+	poti_SetState(time, container, "WS", name);
+#else
+	fprintf(out_paje_file, "20	%.9f	%sw%lu	WS	%s	%lu	%s	%08lx	%016llx\n", time, prefix, workerid, name, size, parameters, footprint, tag);
 #endif
 }
 
@@ -371,8 +395,9 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 	int workerid = ev->param[1];
 	int nodeid = ev->param[3];
 	int threadid = ev->param[4];
+	int new_thread;
 
-	register_worker_id(threadid, workerid);
+	new_thread = register_worker_id(threadid, workerid);
 
 	char *kindstr = "";
 	struct starpu_perfmodel_arch arch;
@@ -435,11 +460,13 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 		snprintf(new_thread_container_name, STARPU_POTI_STR_LEN, "%s%d", prefix, threadid);
 		char new_worker_container_name[STARPU_POTI_STR_LEN];
 		snprintf(new_worker_container_name, STARPU_POTI_STR_LEN, "%s%s%d", prefix, kindstr, devid);
-		poti_CreateContainer(get_event_time_stamp(ev, options), new_thread_container_alias, "T", memnode_container, new_thread_container_name);
+		if (new_thread)
+			poti_CreateContainer(get_event_time_stamp(ev, options), new_thread_container_alias, "T", memnode_container, new_thread_container_name);
 		poti_CreateContainer(get_event_time_stamp(ev, options), new_worker_container_alias, "W", new_thread_container_alias, new_worker_container_name);
 #else
-		fprintf(out_paje_file, "7	%.9f	%st%d	T	%smn%d	%s%d\n",
-			get_event_time_stamp(ev, options), prefix, threadid, prefix, nodeid, prefix, threadid);
+		if (new_thread)
+			fprintf(out_paje_file, "7	%.9f	%st%d	T	%smn%d	%s%d\n",
+				get_event_time_stamp(ev, options), prefix, threadid, prefix, nodeid, prefix, threadid);
 		fprintf(out_paje_file, "7	%.9f	%sw%d	W	%st%d	%s%s%d\n",
 			get_event_time_stamp(ev, options), prefix, workerid, prefix, threadid, prefix, kindstr, devid);
 #endif
@@ -447,7 +474,7 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 
 	/* start initialization */
 	if (out_paje_file)
-		worker_set_state(get_event_time_stamp(ev, options), prefix, threadid, "I");
+		thread_set_state(get_event_time_stamp(ev, options), prefix, threadid, "I");
 
 	if (activity_file)
 	fprintf(activity_file, "name\t%d\t%s %d\n", workerid, kindstr, devid);
@@ -462,13 +489,16 @@ static void handle_worker_init_end(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 	int worker;
 
 	if (out_paje_file)
-		worker_set_state(get_event_time_stamp(ev, options), prefix, ev->param[0], "B");
+		thread_set_state(get_event_time_stamp(ev, options), prefix, ev->param[0], "B");
 
 	if (ev->nb_params < 2)
 		worker = find_worker_id(ev->param[0]);
 	else
 		worker = ev->param[1];
 
+	if (out_paje_file)
+		worker_set_state(get_event_time_stamp(ev, options), prefix, worker, "I");
+
 	/* Initilize the accumulated time counters */
 	last_activity_flush_timestamp[worker] = get_event_time_stamp(ev, options);
 	accumulated_sleep_time[worker] = 0.0;
@@ -480,7 +510,7 @@ static void handle_worker_deinit_start(struct fxt_ev_64 *ev, struct starpu_fxt_o
 	char *prefix = options->file_prefix;
 
 	if (out_paje_file)
-		worker_set_state(get_event_time_stamp(ev, options), prefix, ev->param[0], "D");
+		thread_set_state(get_event_time_stamp(ev, options), prefix, ev->param[0], "D");
 }
 
 static void handle_worker_deinit_end(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
@@ -556,7 +586,7 @@ static void create_paje_state_if_not_found(char *name, struct starpu_fxt_options
 	if (out_paje_file)
 	{
 #ifdef STARPU_HAVE_POTI
-		create_paje_state_color(name, "S", red, green, blue);
+		create_paje_state_color(name, "WS", red, green, blue);
 		int i;
 		for(i = 1; i < STARPU_NMAX_SCHED_CTXS; i++)
 		{
@@ -595,7 +625,7 @@ static void create_paje_state_if_not_found(char *name, struct starpu_fxt_options
 /* 		create_paje_state_color(name, "Ctx9", .0, .0, 1.0); */
 /* 		create_paje_state_color(name, "Ctx10", 154.0, 205.0, 50.0); */
 #else
-		fprintf(out_paje_file, "6	%s	S	%s	\"%f %f %f\" \n", name, name, red, green, blue);
+		fprintf(out_paje_file, "6	%s	WS	%s	\"%f %f %f\" \n", name, name, red, green, blue);
 		int i;
 		for(i = 1; i < STARPU_NMAX_SCHED_CTXS; i++)
 		{
@@ -640,8 +670,7 @@ static void create_paje_state_if_not_found(char *name, struct starpu_fxt_options
 
 static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
-	int worker;
-	worker = find_worker_id(ev->param[2]);
+	int worker = ev->param[2];
 
 	if (worker < 0) return;
 
@@ -649,6 +678,7 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 	char *name = has_name?(char *)&ev->param[4]:"unknown";
 
 	snprintf(last_codelet_symbol[worker], sizeof(last_codelet_symbol[worker]), "%s", name);
+	last_codelet_parameter[worker] = 0;
 
 	double start_codelet_time = get_event_time_stamp(ev, options);
 	last_codelet_start[worker] = start_codelet_time;
@@ -668,10 +698,10 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 			char container[STARPU_POTI_STR_LEN];
 			char ctx[6];
 			snprintf(ctx, sizeof(ctx), "Ctx%d", sched_ctx);
-			thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, ev->param[2]);
+			worker_container_alias(container, STARPU_POTI_STR_LEN, prefix, ev->param[2]);
 			poti_SetState(start_codelet_time, container, ctx, name);
 #else
-			fprintf(out_paje_file, "10	%.9f	%st%"PRIu64"	Ctx%d	%s\n", start_codelet_time, prefix, ev->param[2], sched_ctx, name);
+			fprintf(out_paje_file, "10	%.9f	%sw%"PRIu64"	Ctx%d	%s\n", start_codelet_time, prefix, ev->param[2], sched_ctx, name);
 #endif
 		}
 	}
@@ -679,11 +709,25 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 
 }
 
+static void handle_codelet_data(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+#ifdef STARPU_ENABLE_PAJE_CODELET_DETAILS
+	int worker = ev->param[0];
+	if (worker < 0) return;
+	if (out_paje_file)
+	{
+		int num = last_codelet_parameter[worker]++;
+		if (num >= MAX_PARAMETERS)
+			return;
+		snprintf(last_codelet_parameter_description[worker][num], sizeof(last_codelet_parameter_description[worker][num]), "%s", (char*) &ev->param[1]);
+	}
+#endif /* STARPU_ENABLE_PAJE_CODELET_DETAILS */
+}
+
 static void handle_codelet_details(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
 #ifdef STARPU_ENABLE_PAJE_CODELET_DETAILS
-	int worker;
-	worker = find_worker_id(ev->param[5]);
+	int worker = ev->param[5];
 
 	unsigned sched_ctx = ev->param[1];
 	if (worker < 0) return;
@@ -692,17 +736,25 @@ static void handle_codelet_details(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 
 	if (out_paje_file)
 	{
-		worker_set_detailed_state(last_codelet_start[worker], prefix, ev->param[5], last_codelet_symbol[worker], ev->param[2], ev->param[3], ev->param[4]);
+		int i;
+		char parameters[256];
+		size_t eaten = 0;
+		for (i = 0; i < last_codelet_parameter[worker] && i < MAX_PARAMETERS; i++)
+		{
+			eaten += snprintf(parameters + eaten, sizeof(parameters) - eaten, "%s%s", i?"_":"", last_codelet_parameter_description[worker][i]);
+		}
+
+		worker_set_detailed_state(last_codelet_start[worker], prefix, ev->param[5], last_codelet_symbol[worker], ev->param[2], parameters, ev->param[3], ev->param[4]);
 		if (sched_ctx != 0)
 		{
 #ifdef STARPU_HAVE_POTI
 			char container[STARPU_POTI_STR_LEN];
 			char ctx[6];
 			snprintf(ctx, sizeof(ctx), "Ctx%d", sched_ctx);
-			thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, ev->param[5]);
+			worker_container_alias(container, STARPU_POTI_STR_LEN, prefix, ev->param[5]);
 			poti_SetState(last_codelet_start[worker], container, ctx, last_codelet_symbol[worker]);
 #else
-			fprintf(out_paje_file, "20	%.9f	%st%"PRIu64"	Ctx%d	%s	%08lx	%lu	%016llx\n", last_codelet_start[worker], prefix, ev->param[2], sched_ctx, last_codelet_symbol[worker], (unsigned long) ev->param[2], (unsigned long) ev->param[3], (unsigned long long) ev->param[4]);
+			fprintf(out_paje_file, "20	%.9f	%sw%"PRIu64"	Ctx%d	%s	%08lx	%lu	%016llx\n", last_codelet_start[worker], prefix, ev->param[2], sched_ctx, last_codelet_symbol[worker], (unsigned long) ev->param[2], (unsigned long) ev->param[3], (unsigned long long) ev->param[4]);
 #endif
 		}
 	}
@@ -714,8 +766,7 @@ static struct starpu_fxt_codelet_event *dumped_codelets;
 
 static void handle_end_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
-	int worker;
-	worker = find_worker_id(ev->param[6]);
+	int worker = ev->param[6];
 	if (worker < 0) return;
 
 	char *prefix = options->file_prefix;
@@ -726,7 +777,7 @@ static void handle_end_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 	uint32_t codelet_hash = ev->param[2];
 
 	if (out_paje_file)
-		worker_set_state(end_codelet_time, prefix, ev->param[6], "B");
+		worker_set_state(end_codelet_time, prefix, ev->param[6], "I");
 
 	double codelet_length = (end_codelet_time - last_codelet_start[worker]);
 
@@ -753,6 +804,22 @@ static void handle_end_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 	}
 }
 
+static void handle_start_thread_executing(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+	char *prefix = options->file_prefix;
+
+	if (out_paje_file)
+		thread_set_state(get_event_time_stamp(ev, options), prefix, ev->param[0], "E");
+}
+
+static void handle_end_thread_executing(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+	char *prefix = options->file_prefix;
+
+	if (out_paje_file)
+		thread_set_state(get_event_time_stamp(ev, options), prefix, ev->param[0], "B");
+}
+
 static void handle_user_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
 	int worker;
@@ -797,7 +864,7 @@ static void handle_start_callback(struct fxt_ev_64 *ev, struct starpu_fxt_option
 		return;
 
 	if (out_paje_file)
-		worker_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[1], "C");
+		thread_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[1], "C");
 }
 
 static void handle_end_callback(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
@@ -808,7 +875,7 @@ static void handle_end_callback(struct fxt_ev_64 *ev, struct starpu_fxt_options
 		return;
 
 	if (out_paje_file)
-		worker_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[1], "B");
+		thread_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[1], "B");
 }
 
 static void handle_hyp_begin(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
@@ -819,7 +886,7 @@ static void handle_hyp_begin(struct fxt_ev_64 *ev, struct starpu_fxt_options *op
 		return;
 
 	if (out_paje_file)
-		worker_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "H");
+		thread_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "H");
 }
 
 static void handle_hyp_end(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
@@ -830,7 +897,7 @@ static void handle_hyp_end(struct fxt_ev_64 *ev, struct starpu_fxt_options *opti
 		return;
 
 	if (out_paje_file)
-		worker_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "B");
+		thread_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "B");
 }
 
 static void handle_worker_status(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *newstatus)
@@ -841,7 +908,7 @@ static void handle_worker_status(struct fxt_ev_64 *ev, struct starpu_fxt_options
 		return;
 
 	if (out_paje_file)
-		worker_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[1], newstatus);
+		thread_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[1], newstatus);
 }
 
 static double last_sleep_start[STARPU_NMAXWORKERS];
@@ -853,7 +920,7 @@ static void handle_start_scheduling(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 	if (worker < 0) return;
 
 	if (out_paje_file)
-		worker_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "Sc");
+		thread_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "Sc");
 }
 
 static void handle_end_scheduling(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
@@ -863,7 +930,7 @@ static void handle_end_scheduling(struct fxt_ev_64 *ev, struct starpu_fxt_option
 	if (worker < 0) return;
 
 	if (out_paje_file)
-		worker_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "B");
+		thread_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "B");
 }
 
 static void handle_push_scheduling(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
@@ -873,7 +940,7 @@ static void handle_push_scheduling(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 	if (worker < 0) return;
 
 	if (out_paje_file)
-		worker_push_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "Sc");
+		thread_push_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "Sc");
 }
 
 static void handle_pop_scheduling(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
@@ -883,7 +950,7 @@ static void handle_pop_scheduling(struct fxt_ev_64 *ev, struct starpu_fxt_option
 	if (worker < 0) return;
 
 	if (out_paje_file)
-		worker_pop_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0]);
+		thread_pop_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0]);
 }
 
 static void handle_start_sleep(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
@@ -896,7 +963,7 @@ static void handle_start_sleep(struct fxt_ev_64 *ev, struct starpu_fxt_options *
 	last_sleep_start[worker] = start_sleep_time;
 
 	if (out_paje_file)
-		worker_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "Sl");
+		thread_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "Sl");
 }
 
 static void handle_end_sleep(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
@@ -908,7 +975,7 @@ static void handle_end_sleep(struct fxt_ev_64 *ev, struct starpu_fxt_options *op
 	double end_sleep_timestamp = get_event_time_stamp(ev, options);
 
 	if (out_paje_file)
-		worker_set_state(end_sleep_timestamp, options->file_prefix, ev->param[0], "B");
+		thread_set_state(end_sleep_timestamp, options->file_prefix, ev->param[0], "B");
 
 	double sleep_length = end_sleep_timestamp - last_sleep_start[worker];
 
@@ -1602,6 +1669,9 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 			case _STARPU_FUT_START_CODELET_BODY:
 				handle_start_codelet_body(&ev, options);
 				break;
+			case _STARPU_FUT_CODELET_DATA:
+				handle_codelet_data(&ev, options);
+				break;
 			case _STARPU_FUT_CODELET_DETAILS:
 				handle_codelet_details(&ev, options);
 				break;
@@ -1609,6 +1679,13 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 				handle_end_codelet_body(&ev, options);
 				break;
 
+			case _STARPU_FUT_START_EXECUTING:
+				handle_start_thread_executing(&ev, options);
+				break;
+			case _STARPU_FUT_END_EXECUTING:
+				handle_end_thread_executing(&ev, options);
+				break;
+
 			case _STARPU_FUT_START_CALLBACK:
 				handle_start_callback(&ev, options);
 				break;
@@ -2354,7 +2431,7 @@ void starpu_fxt_write_data_trace(char *filename_in)
 			break;
 
 		case _STARPU_FUT_START_CODELET_BODY:
-			workerid = find_worker_id(ev.param[2]);
+			workerid = ev.param[2];
 			tasks[workerid].exec_time = ev.time;
 			has_name = ev.param[3];
 			tasks[workerid].codelet_name = strdup(has_name ? (char *) &ev.param[4] : "unknown");
@@ -2362,7 +2439,7 @@ void starpu_fxt_write_data_trace(char *filename_in)
 			break;
 
 		case _STARPU_FUT_END_CODELET_BODY:
-			workerid = find_worker_id(ev.param[6]);
+			workerid = ev.param[6];
 			assert(workerid != -1);
 			tasks[workerid].exec_time = ev.time - tasks[workerid].exec_time;
 			write_task(tasks[workerid]);

+ 10 - 1
src/debug/traces/starpu_paje.c

@@ -137,6 +137,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 	fprintf(file, "%%	Type	string\n");
 	fprintf(file, "%%	Value	string\n");
 	fprintf(file, "%%	Size	string\n");
+	fprintf(file, "%%	Params	string\n");
 	fprintf(file, "%%	Footprint	string\n");
 	fprintf(file, "%%	Tag	string\n");
 	fprintf(file, "%%EndEventDef\n");
@@ -176,10 +177,13 @@ void _starpu_fxt_write_paje_header(FILE *file)
 	poti_DefineEntityValue("Po", "S", "PushingOutput", "0.1 1.0 1.0");
 	poti_DefineEntityValue("C", "S", "Callback", ".0 .3 .8");
 	poti_DefineEntityValue("B", "S", "Overhead", ".5 .18 .0");
+	poti_DefineEntityValue("E", "S", "Executing", ".0 .6 .5");
 	poti_DefineEntityValue("Sc", "S", "Scheduling", ".7 .36 .0");
 	poti_DefineEntityValue("Sl", "S", "Sleeping", ".9 .1 .0");
 	poti_DefineEntityValue("P", "S", "Progressing", ".4 .1 .6");
 	poti_DefineEntityValue("U", "S", "Unpartitioning", ".0 .0 1.0");
+	poti_DefineStateType("WS", "W", "Worker State");
+	poti_DefineEntityValue("I", "WS", "Idle", ".9 .1 .0");
 
 	/* Types for the MPI Communication Thread of the Memory Node */
 	poti_DefineEventType("MPIev", "MPICt", "MPI event type");
@@ -205,6 +209,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 		poti_DefineEntityValue("Po", ctx, "PushingOutput", "0.1 1.0 1.0");
 		poti_DefineEntityValue("C", ctx, "Callback", ".0 .3 .8");
 		poti_DefineEntityValue("B", ctx, "Overhead", ".5 .18 .0");
+		poti_DefineEntityValue("E", ctx, "Executing", ".0 .6 .5");
 		poti_DefineEntityValue("Sc", ctx, "Scheduling", ".7 .36 .0");
 		poti_DefineEntityValue("Sl", ctx, "Sleeping", ".9 .1 .0");
 		poti_DefineEntityValue("P", ctx, "Progressing", ".4 .1 .6");
@@ -249,10 +254,13 @@ void _starpu_fxt_write_paje_header(FILE *file)
 6       Po       S      PushingOutput       \"0.1 1.0 1.0\"            \n\
 6       C       S       Callback       \".0 .3 .8\"            \n\
 6       B       S       Overhead         \".5 .18 .0\"		\n\
+6       E       S       Executing         \".0 .6 .5\"		\n\
 6       Sc       S      Scheduling         \".7 .36 .0\"		\n\
 6       Sl       S      Sleeping         \".9 .1 .0\"		\n\
 6       P       S       Progressing         \".4 .1 .6\"		\n\
 6       U       S       Unpartitioning      \".0 .0 1.0\"		\n\
+3       WS       W       \"Worker State\"                        \n\
+6       I       WS       Idle         \".9 .1 .0\"		\n\
 6       H       S       Hypervisor      \".5 .18 .0\"		\n");
 	fprintf(file, "\
 6       P       CtS       Processing         \"0 0 0\"		\n\
@@ -271,11 +279,12 @@ void _starpu_fxt_write_paje_header(FILE *file)
 6       Po       Ctx%u      PushingOutput       \"0.1 1.0 1.0\"            \n\
 6       C       Ctx%u       Callback       \".0 .3 .8\"            \n\
 6       B       Ctx%u       Overhead         \".5 .18 .0\"		\n\
+6       E       Ctx%u       Executing         \".0 .6 .5\"		\n\
 6       Sc       Ctx%u      Scheduling         \".7 .36 .0\"		\n\
 6       Sl       Ctx%u      Sleeping         \".9 .1 .0\"		\n\
 6       P       Ctx%u       Progressing         \".4 .1 .6\"		\n\
 6       U       Ctx%u       Unpartitioning         \".0 .0 1.0\"		\n",
-		i, i, i, i, i, i, i, i, i, i);
+		i, i, i, i, i, i, i, i, i, i, i);
 	fprintf(file, "\
 6       A       MS      Allocating         \".4 .1 .0\"		\n\
 6       Ar       MS      AllocatingReuse       \".1 .1 .8\"		\n\

+ 2 - 0
src/drivers/cpu/driver_cpu.c

@@ -94,11 +94,13 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 		STARPU_ASSERT_MSG(func, "when STARPU_CPU is defined in 'where', cpu_func or cpu_funcs has to be defined");
 		if (starpu_get_env_number("STARPU_DISABLE_KERNELS") <= 0)
 		{
+			_STARPU_TRACE_START_EXECUTING();
 #ifdef STARPU_SIMGRID
 			_starpu_simgrid_execute_job(j, perf_arch, NAN);
 #else
 			func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
 #endif
+			_STARPU_TRACE_END_EXECUTING();
 		}
 		if (is_parallel_task && cl->type == STARPU_FORKJOIN)
 			/* rebind to single CPU */

+ 21 - 2
src/drivers/cuda/driver_cuda.c

@@ -408,11 +408,13 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *args)
 
 	if (starpu_get_env_number("STARPU_DISABLE_KERNELS") <= 0)
 	{
+		_STARPU_TRACE_START_EXECUTING();
 #ifdef STARPU_SIMGRID
 		_starpu_simgrid_execute_job(j, &args->perf_arch, NAN);
 #else
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
 #endif
+		_STARPU_TRACE_END_EXECUTING();
 	}
 
 	return 0;
@@ -492,8 +494,7 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 
 	for (i = 0; i < worker_set->nworkers; i++)
 	{
-		struct _starpu_worker *worker = &worker_set->workers[i];
-		_STARPU_TRACE_WORKER_INIT_END(worker->workerid);
+		_STARPU_TRACE_WORKER_INIT_END(worker_set->workers[i].workerid);
 	}
 
 	/* tell the main thread that this one is ready */
@@ -550,6 +551,15 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 			_starpu_set_local_worker_key(args);
 			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), args);
 			idle++;
+#ifdef STARPU_USE_FXT
+			int k;
+			for (k = 0; k < (int) worker_set->nworkers; k++)
+				if (worker_set->workers[k].current_task)
+					break;
+			if (k == (int) worker_set->nworkers)
+				/* Everybody busy */
+				_STARPU_TRACE_END_EXECUTING()
+#endif
 		}
 	}
 
@@ -612,6 +622,15 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		{
 			/* Record event to synchronize with task termination later */
 			cudaEventRecord(task_events[workerid], starpu_cuda_get_local_stream());
+#ifdef STARPU_USE_FXT
+			int k;
+			for (k = 0; k < (int) worker_set->nworkers; k++)
+				if (worker_set->workers[k].current_task)
+					break;
+			if (k < (int) worker_set->nworkers)
+				/* Everybody busy */
+				_STARPU_TRACE_START_EXECUTING()
+#endif
 		}
 		else
 #else

+ 11 - 10
src/drivers/driver_common/driver_common.c

@@ -33,7 +33,7 @@
 #define BACKOFF_MAX 32  /* TODO : use parameter to define them */
 #define BACKOFF_MIN 1
 
-void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, struct timespec *codelet_start, int rank, int profiling)
+void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_start, int rank, int profiling)
 {
 	struct starpu_task *task = j->task;
 	struct starpu_codelet *cl = task->cl;
@@ -73,7 +73,7 @@ void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j
 	if (starpu_top)
 		_starpu_top_task_started(task,workerid,codelet_start);
 
-	_STARPU_TRACE_START_CODELET_BODY(j, j->nimpl, perf_arch);
+	_STARPU_TRACE_START_CODELET_BODY(j, j->nimpl, perf_arch, workerid);
 }
 
 void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
@@ -85,7 +85,7 @@ void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j,
 	int workerid = args->workerid;
 	unsigned calibrate_model = 0;
 
-	_STARPU_TRACE_END_CODELET_BODY(j, j->nimpl, perf_arch);
+	_STARPU_TRACE_END_CODELET_BODY(j, j->nimpl, perf_arch, workerid);
 
 	if (cl && cl->model && cl->model->benchmarking)
 		calibrate_model = 1;
@@ -304,13 +304,13 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 		return NULL;
 	}
 
-	STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);
-
 	_starpu_worker_set_status_scheduling_done(workerid);
 
 	_starpu_worker_set_status_wakeup(workerid);
 	args->spinning_backoff = BACKOFF_MIN;
 
+	STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);
+
 
 #ifdef HAVE_AYUDAME_H
 	if (AYU_event)
@@ -341,13 +341,16 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 		/*else try to pop a task*/
 		else
 		{
-			_starpu_worker_set_status_scheduling(workers[i].workerid);
 			STARPU_PTHREAD_MUTEX_LOCK(&workers[i].sched_mutex);
+			_starpu_worker_set_status_scheduling(workers[i].workerid);
 			_starpu_set_local_worker_key(&workers[i]);
 			tasks[i] = _starpu_pop_task(&workers[i]);
-			STARPU_PTHREAD_MUTEX_UNLOCK(&workers[i].sched_mutex);
 			if(tasks[i] != NULL)
 			{
+				_starpu_worker_set_status_scheduling_done(workers[i].workerid);
+				_starpu_worker_set_status_wakeup(workers[i].workerid);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&workers[i].sched_mutex);
+
 				count ++;
 				j = _starpu_get_job_associated_to_task(tasks[i]);
 				is_parallel_task = (j->task_size > 1);
@@ -370,13 +373,11 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 					workers[i].worker_size = 1;
 					workers[i].current_rank = 0;
 				}
-
-				_starpu_worker_set_status_scheduling_done(workers[i].workerid);
-				_starpu_worker_set_status_wakeup(workers[i].workerid);
 			}
 			else
 			{
 				_starpu_worker_set_status_sleeping(workers[i].workerid);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&workers[i].sched_mutex);
 			}
 		}
 	}

+ 3 - 3
src/drivers/gordon/driver_gordon.c

@@ -99,10 +99,10 @@ static void starpu_to_gordon_buffers(struct _starpu_job *j, struct gordon_ppu_jo
 	}
 
 	/* count the number of in/inout/out buffers */
-	unsigned nbuffers = cl->nbuffers;
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
-		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(cl, buffer);
+		enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, buffer);
 
 		switch (mode)
 		{
@@ -122,7 +122,7 @@ static void starpu_to_gordon_buffers(struct _starpu_job *j, struct gordon_ppu_jo
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
 		unsigned gordon_buffer;
-		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(cl, buffer);
+		enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, buffer);
 
 		switch (mode)
 		{

+ 1 - 1
src/drivers/mp_common/source_common.c

@@ -429,7 +429,7 @@ static int _starpu_src_common_execute(struct _starpu_job *j,
 	_starpu_src_common_execute_kernel(node, kernel, worker->devid, task->cl->type,
 			(j->task_size > 1),
 			j->combined_workerid, task->handles,
-			task->interfaces, task->cl->nbuffers,
+			task->interfaces, STARPU_TASK_GET_NBUFFERS(task),
 			task->cl_arg, task->cl_arg_size);
 
 

+ 6 - 2
src/drivers/opencl/driver_opencl.c

@@ -568,7 +568,6 @@ static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker
 int _starpu_opencl_driver_init(struct _starpu_worker *args)
 {
 	int devid = args->devid;
-	int workerid = args->workerid;
 
 	_starpu_worker_start(args, _STARPU_FUT_OPENCL_KEY);
 
@@ -599,7 +598,7 @@ int _starpu_opencl_driver_init(struct _starpu_worker *args)
 
 	_STARPU_DEBUG("OpenCL (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, args->bindid);
 
-	_STARPU_TRACE_WORKER_INIT_END(workerid);
+	_STARPU_TRACE_WORKER_INIT_END(args->workerid);
 
 	/* tell the main thread that this one is ready */
 	STARPU_PTHREAD_MUTEX_LOCK(&args->mutex);
@@ -635,12 +634,14 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 
 		if (status != CL_COMPLETE)
 		{
+			_STARPU_TRACE_START_EXECUTING();
 			/* Not ready yet, no better thing to do than waiting */
 			__starpu_datawizard_progress(memnode, 1, 0);
 			return 0;
 		}
 
 		/* Asynchronous task completed! */
+		_STARPU_TRACE_END_EXECUTING();
 		_starpu_opencl_stop_job(_starpu_get_job_associated_to_task(task), args);
 	}
 #endif /* STARPU_SIMGRID */
@@ -698,6 +699,7 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 		 */
 		err = clEnqueueMarker(queue, &task_events[args->devid]);
 		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
+		_STARPU_TRACE_START_EXECUTING();
 	}
 	else
 #else
@@ -832,6 +834,7 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 
 	if (starpu_get_env_number("STARPU_DISABLE_KERNELS") <= 0)
 	{
+		_STARPU_TRACE_START_EXECUTING();
 #ifdef STARPU_SIMGRID
 		double length = NAN;
 	  #ifdef STARPU_OPENCL_SIMULATOR
@@ -851,6 +854,7 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 #else
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
 #endif
+		_STARPU_TRACE_END_EXECUTING();
 	}
 	return 0;
 }

+ 2 - 2
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -82,7 +82,7 @@ static const float idle_power_maximum=10000.0;
 static int count_non_ready_buffers(struct starpu_task *task, unsigned node)
 {
 	int cnt = 0;
-	unsigned nbuffers = task->cl->nbuffers;
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 	unsigned index;
 
 	for (index = 0; index < nbuffers; index++)
@@ -1019,7 +1019,7 @@ struct starpu_sched_policy _starpu_sched_dm_policy =
 	.remove_workers = dmda_remove_workers,
 	.push_task = dm_push_task,
 	.pop_task = dmda_pop_task,
-	.pre_exec_hook = NULL,
+	.pre_exec_hook = dmda_pre_exec_hook,
 	.post_exec_hook = dmda_post_exec_hook,
 	.pop_every_task = dmda_pop_every_task,
 	.policy_name = "dm",

+ 6 - 6
src/sched_policies/locality_work_stealing_policy.c

@@ -288,15 +288,15 @@ static void lws_add_workers(unsigned sched_ctx_id, int *workerids,unsigned nwork
 		for(;;)
 		{
 			neighbour = (struct starpu_tree*)it.value;
-			int workerids[STARPU_NMAXWORKERS];
-			int nworkers = _starpu_worker_get_workerids(neighbour->id, workerids);
+			int neigh_workerids[STARPU_NMAXWORKERS];
+			int neigh_nworkers = _starpu_worker_get_workerids(neighbour->id, neigh_workerids);
 			int w;
-			for(w = 0; w < nworkers; w++)
+			for(w = 0; w < neigh_nworkers; w++)
 			{
-				if(!it.visited[workerids[w]] && workers->present[workerids[w]])
+				if(!it.visited[neigh_workerids[w]] && workers->present[neigh_workerids[w]])
 				{
-					ws->proxlist[workerid][cnt++] = workerids[w];
-					it.visited[workerids[w]] = 1;
+					ws->proxlist[workerid][cnt++] = neigh_workerids[w];
+					it.visited[neigh_workerids[w]] = 1;
 				}
 			}
 			if(!workers->has_next(workers, &it))

+ 2 - 4
src/sched_policies/work_stealing_policy.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012  INRIA
  *
@@ -258,9 +258,7 @@ static inline unsigned select_worker(unsigned sched_ctx_id)
 }
 
 
-#ifdef STARPU_DEVEL
-#warning TODO rewrite ... this will not scale at all now
-#endif
+/* Note: this is not scalable work stealing,  use lws instead */
 static struct starpu_task *ws_pop_task(unsigned sched_ctx_id)
 {
 	struct _starpu_work_stealing_data *ws = (struct _starpu_work_stealing_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);

+ 10 - 5
src/util/starpu_task_insert.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -29,7 +29,7 @@ void starpu_codelet_pack_args(void **arg_buffer, size_t *arg_buffer_size, ...)
 
 	/* Compute the size */
 	va_start(varg_list, arg_buffer_size);
-	*arg_buffer_size = _starpu_task_insert_get_arg_size(varg_list);
+	_starpu_task_insert_get_args_size(varg_list, NULL, arg_buffer_size);
 	va_end(varg_list);
 
 	va_start(varg_list, arg_buffer_size);
@@ -71,11 +71,12 @@ struct starpu_task *_starpu_task_build_v(struct starpu_codelet *cl, const char*
 	void *arg_buffer = NULL;
 	va_list varg_list_copy;
 	size_t arg_buffer_size = 0;
+	unsigned nbuffers;
 
 	/* Compute the size */
 
 	va_copy(varg_list_copy, varg_list);
-	arg_buffer_size = _starpu_task_insert_get_arg_size(varg_list_copy);
+	_starpu_task_insert_get_args_size(varg_list_copy, &nbuffers, &arg_buffer_size);
 	va_end(varg_list_copy);
 
 	if (arg_buffer_size)
@@ -89,9 +90,13 @@ struct starpu_task *_starpu_task_build_v(struct starpu_codelet *cl, const char*
 	task->name = task_name;
 	task->cl_arg_free = cl_arg_free;
 
-	if (cl && cl->nbuffers > STARPU_NMAXBUFS)
+	if (cl && cl->nbuffers != STARPU_VARIABLE_NBUFFERS)
 	{
-		task->dyn_handles = malloc(cl->nbuffers * sizeof(starpu_data_handle_t));
+		STARPU_ASSERT_MSG(nbuffers == (unsigned) cl->nbuffers, "Incoherent number of buffers between cl (%d) and number of parameters (%u)", cl->nbuffers, nbuffers);
+	}
+	if (nbuffers > STARPU_NMAXBUFS)
+	{
+		task->dyn_handles = malloc(nbuffers * sizeof(starpu_data_handle_t));
 	}
 
 	va_copy(varg_list_copy, varg_list);

+ 17 - 6
src/util/starpu_task_insert_utils.c

@@ -41,12 +41,14 @@ void starpu_task_insert_callback_wrapper(void *_cl_arg_wrapper)
 		cl_arg_wrapper->callback_func(cl_arg_wrapper->callback_arg);
 }
 
-size_t _starpu_task_insert_get_arg_size(va_list varg_list)
+void _starpu_task_insert_get_args_size(va_list varg_list, unsigned *nbuffers, size_t *cl_arg_size)
 {
 	int arg_type;
 	size_t arg_buffer_size;
+	unsigned n;
 
 	arg_buffer_size = 0;
+	n = 0;
 
 	arg_buffer_size += sizeof(char);
 
@@ -55,11 +57,13 @@ size_t _starpu_task_insert_get_arg_size(va_list varg_list)
 		if (arg_type & STARPU_R || arg_type & STARPU_W || arg_type & STARPU_SCRATCH || arg_type & STARPU_REDUX)
 		{
 			(void)va_arg(varg_list, starpu_data_handle_t);
+			n++;
 		}
 		else if (arg_type==STARPU_DATA_ARRAY)
 		{
 			(void)va_arg(varg_list, starpu_data_handle_t*);
-			(void)va_arg(varg_list, int);
+			int nb_handles = va_arg(varg_list, int);
+			n += nb_handles;
 		}
 		else if (arg_type==STARPU_VALUE)
 		{
@@ -136,7 +140,10 @@ size_t _starpu_task_insert_get_arg_size(va_list varg_list)
 		}
 	}
 
-	return arg_buffer_size;
+	if (cl_arg_size)
+		*cl_arg_size = arg_buffer_size;
+	if (nbuffers)
+		*nbuffers = n;
 }
 
 int _starpu_codelet_pack_args(void **arg_buffer, size_t arg_buffer_size, va_list varg_list)
@@ -280,6 +287,8 @@ void _starpu_task_insert_create(void *arg_buffer, size_t arg_buffer_size, struct
 
 	prologue_pop_cl_arg_wrapper->callback_func = NULL;
 
+	(*task)->cl = cl;
+
 	while((arg_type = va_arg(varg_list, int)) != 0)
 	{
 		if (arg_type & STARPU_R || arg_type & STARPU_W || arg_type & STARPU_SCRATCH || arg_type & STARPU_REDUX)
@@ -292,7 +301,9 @@ void _starpu_task_insert_create(void *arg_buffer, size_t arg_buffer_size, struct
 			STARPU_ASSERT(cl != NULL);
 
 			STARPU_TASK_SET_HANDLE((*task), handle, current_buffer);
-			if (STARPU_CODELET_GET_MODE(cl, current_buffer))
+			if (cl->nbuffers == STARPU_VARIABLE_NBUFFERS)
+				STARPU_TASK_SET_MODE(*task, mode, current_buffer);
+			else if (STARPU_CODELET_GET_MODE(cl, current_buffer))
 			{
 				STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(cl, current_buffer) == mode,
 						   "The codelet <%s> defines the access mode %d for the buffer %d which is different from the mode %d given to starpu_task_insert\n",
@@ -426,9 +437,9 @@ void _starpu_task_insert_create(void *arg_buffer, size_t arg_buffer_size, struct
 		}
 	}
 
-	STARPU_ASSERT(cl == NULL || current_buffer == cl->nbuffers);
+	if (cl && cl->nbuffers == STARPU_VARIABLE_NBUFFERS)
+		(*task)->nbuffers = current_buffer;
 
-	(*task)->cl = cl;
 	(*task)->cl_arg = arg_buffer;
 	(*task)->cl_arg_size = arg_buffer_size;
 

+ 1 - 1
src/util/starpu_task_insert_utils.h

@@ -21,7 +21,7 @@
 #include <stdarg.h>
 #include <starpu.h>
 
-size_t _starpu_task_insert_get_arg_size(va_list varg_list);
+void _starpu_task_insert_get_args_size(va_list varg_list, unsigned *nbuffers, size_t *cl_arg_size);
 int _starpu_codelet_pack_args(void **arg_buffer, size_t arg_buffer_size, va_list varg_list);
 void _starpu_task_insert_create(void *arg_buffer, size_t arg_buffer_size, struct starpu_codelet *cl, struct starpu_task **task, va_list varg_list);
 

+ 2 - 2
src/worker_collection/worker_list.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2013  Université de Bordeaux 1
+ * Copyright (C) 2013-2014  Université de Bordeaux 1
  * Copyright (C) 2012-2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011-2013  INRIA
  *
@@ -173,7 +173,7 @@ static int list_remove(struct starpu_worker_collection *workers, int worker)
 static void _init_workers(int *workerids)
 {
 	unsigned i;
-	int nworkers = starpu_worker_get_count();
+	unsigned nworkers = starpu_worker_get_count();
 	for(i = 0; i < nworkers; i++)
 		workerids[i] = -1;
 	return;

+ 14 - 2
tests/Makefile.am

@@ -108,7 +108,6 @@ XFAIL_TESTS=	errorcheck/invalid_blocking_calls
 
 noinst_PROGRAMS =				\
 	main/deprecated_func			\
-	main/deprecated_buffer			\
 	main/driver_api/init_run_deinit         \
 	main/driver_api/run_driver              \
 	main/deploop                            \
@@ -198,6 +197,8 @@ noinst_PROGRAMS =				\
 	datawizard/in_place_partition   	\
 	datawizard/partition_lazy		\
 	datawizard/gpu_register   		\
+	datawizard/gpu_ptr_register   		\
+	datawizard/variable_parameters		\
 	datawizard/wt_host			\
 	datawizard/wt_broadcast			\
 	datawizard/readonly			\
@@ -383,6 +384,18 @@ datawizard_gpu_register_SOURCES +=	\
 	datawizard/scal_opencl.cl
 endif
 
+datawizard_gpu_ptr_register_SOURCES =	\
+	datawizard/gpu_ptr_register.c	\
+	datawizard/scal.c
+if STARPU_USE_CUDA
+datawizard_gpu_ptr_register_SOURCES +=	\
+	datawizard/scal_cuda.cu
+endif
+if STARPU_USE_OPENCL
+datawizard_gpu_ptr_register_SOURCES +=	\
+	datawizard/scal_opencl.cl
+endif
+
 datawizard_wt_host_SOURCES =			\
 	datawizard/wt_host.c
 datawizard_wt_broadcast_SOURCES =		\
@@ -400,7 +413,6 @@ datawizard_specific_node_SOURCES +=			\
 endif
 
 main_deprecated_func_CFLAGS = $(AM_CFLAGS) -Wno-deprecated-declarations
-main_deprecated_buffer_CFLAGS = $(AM_CFLAGS) -Wno-deprecated-declarations
 
 main_subgraph_repeat_SOURCES =		\
 	main/subgraph_repeat.c

+ 293 - 0
tests/datawizard/gpu_ptr_register.c

@@ -0,0 +1,293 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011-2012, 2014  Université de Bordeaux 1
+ * Copyright (C) 2012 inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "../helper.h"
+#include "scal.h"
+
+#if ! (defined(STARPU_USE_OPENCL) || defined(STARPU_USE_CUDA))
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+
+static int
+submit_tasks(starpu_data_handle_t handle, int pieces, int n)
+{
+	int i, ret;
+
+	for (i = 0; i < pieces; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->handles[0] = starpu_data_get_sub_data(handle, 1, i);
+		task->cl = &scal_codelet;
+		task->execute_on_a_specific_worker = 1;
+		task->workerid = i%n;
+
+		ret = starpu_task_submit(task);
+		if (ret == -ENODEV)
+			return -ENODEV;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+
+	return 0;
+}
+
+static int
+find_a_worker(enum starpu_worker_archtype type)
+{
+	int worker;
+	int ret = starpu_worker_get_ids_by_type(type, &worker, 1);
+	if (ret == 0)
+		return -ENODEV;
+	return worker;
+}
+
+static int
+check_result(unsigned *t, size_t size)
+{
+	unsigned i;
+	for (i = 0; i < size; i++)
+	{
+		if (t[i] != i*2)
+		{
+			FPRINTF(stderr,"t[%d] is %u instead of %u\n", i, t[i], 2*i);
+			return 1;
+		}
+	}
+	return 0;
+}
+
+#ifdef STARPU_USE_CUDA
+#if CUDART_VERSION >= 4000
+static int
+test_cuda(void)
+{
+	int ret;
+	unsigned *foo_gpu;
+	unsigned *foo;
+	int n, i, size, pieces;
+	int devid;
+	int chosen;
+	cudaError_t cures;
+	starpu_data_handle_t handle;
+
+	/* Find a CUDA worker */
+	chosen = find_a_worker(STARPU_CUDA_WORKER);
+	if (chosen == -ENODEV)
+		return -ENODEV;
+
+	n = starpu_worker_get_count();
+	size = 10 * n;
+
+	devid = starpu_worker_get_devid(chosen);
+	starpu_cuda_set_device(devid);
+	cudaMalloc((void**)&foo_gpu, size * sizeof(*foo_gpu));
+
+	foo = calloc(size, sizeof(*foo));
+	for (i = 0; i < size; i++)
+		foo[i] = i;
+
+	starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)foo, size, sizeof(*foo));
+	starpu_vector_ptr_register(handle, starpu_worker_get_memory_node(chosen), (uintptr_t)foo_gpu, (uintptr_t)foo_gpu, 0);
+
+	/* Broadcast the data to force in-place partitioning */
+	for (i = 0; i < n; i++)
+		starpu_data_prefetch_on_node(handle, starpu_worker_get_memory_node(i), 0);
+
+	/* Even with just one worker, split in at least two */
+	if (n == 1)
+		pieces = 2;
+	else
+		pieces = n;
+
+	struct starpu_data_filter f =
+	{
+		.filter_func = starpu_vector_filter_block,
+		.nchildren = pieces,
+	};
+
+	starpu_data_partition(handle, &f);
+
+	ret = submit_tasks(handle, pieces, n);
+	if (ret == -ENODEV)
+		return -ENODEV;
+
+	starpu_data_unpartition(handle, starpu_worker_get_memory_node(chosen));
+	starpu_data_prefetch_on_node(handle, starpu_worker_get_memory_node(chosen), 0);
+	starpu_data_unregister(handle);
+
+	starpu_cuda_set_device(devid);
+	cures = cudaMemcpy(foo, foo_gpu, size * sizeof(*foo_gpu), cudaMemcpyDeviceToHost);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	return check_result(foo, size);
+}
+#endif
+#endif
+
+#ifdef STARPU_USE_OPENCL
+static int
+test_opencl(void)
+{
+	int i;
+	int ret;
+	int chosen;
+	int n;
+	int size;
+	int pieces;
+	cl_mem foo_gpu;
+	starpu_data_handle_t handle;
+
+	ret = starpu_opencl_load_opencl_from_file("tests/datawizard/scal_opencl.cl", &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+
+	/* Find an OpenCL worker */
+	chosen = find_a_worker(STARPU_OPENCL_WORKER);
+	if (chosen == -ENODEV)
+		return -ENODEV;
+
+	n = starpu_worker_get_count();
+	size = 10 * n;
+
+	int devid;
+	cl_int err;
+	cl_context context;
+	cl_command_queue queue;
+
+	devid = starpu_worker_get_devid(chosen);
+
+	starpu_opencl_get_context(devid, &context);
+	starpu_opencl_get_queue(devid, &queue);
+
+	foo_gpu = clCreateBuffer(context, CL_MEM_READ_WRITE, size*sizeof(int), NULL, &err);
+	if (STARPU_UNLIKELY(err != CL_SUCCESS))
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	unsigned int *foo = malloc(size*sizeof(*foo));
+	for (i = 0; i < size; i++)
+		foo[i] = i;
+
+	starpu_vector_data_register(&handle,
+				    STARPU_MAIN_RAM,
+				    (uintptr_t)foo,
+				    size,
+				    sizeof(int));
+
+	starpu_vector_ptr_register(handle,
+				    starpu_worker_get_memory_node(chosen),
+				    (uintptr_t)foo_gpu,
+				    (uintptr_t)foo_gpu,
+				    0);
+
+	/* Broadcast the data to force in-place partitioning */
+	for (i = 0; i < n; i++)
+		starpu_data_prefetch_on_node(handle, starpu_worker_get_memory_node(i), 0);
+
+	/* Even with just one worker, split in at least two */
+	if (n == 1)
+		pieces = 2;
+	else
+		pieces = n;
+
+	struct starpu_data_filter f =
+	{
+		.filter_func = starpu_vector_filter_block,
+		.nchildren = pieces,
+	};
+
+	starpu_data_partition(handle, &f);
+
+	ret = submit_tasks(handle, pieces, n);
+	if (ret == -ENODEV)
+		return -ENODEV;
+
+	starpu_data_unpartition(handle, starpu_worker_get_memory_node(chosen));
+	starpu_data_prefetch_on_node(handle, starpu_worker_get_memory_node(chosen), 0);
+	starpu_data_unregister(handle);
+
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+	ret = starpu_opencl_unload_opencl(&opencl_program);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+
+	err = clEnqueueReadBuffer(queue,
+				  foo_gpu,
+				  CL_FALSE,
+				  0,
+				  size*sizeof(*foo),
+				  foo,
+				  0,
+				  NULL,
+				  NULL);
+	if (STARPU_UNLIKELY(err != CL_SUCCESS))
+		STARPU_OPENCL_REPORT_ERROR(err);
+	clFinish(queue);
+	return check_result(foo, size);
+}
+#endif /* !STARPU_USE_OPENCL */
+
+int main(int argc, char **argv)
+{
+	int skipped_cuda = 1, skipped_opencl = 1;
+	int ret;
+	ret = starpu_initialize(NULL, &argc, &argv);
+	if (ret == -ENODEV)
+		return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_load_opencl_from_file("tests/datawizard/scal_opencl.cl", &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+#endif
+
+#ifdef STARPU_USE_CUDA
+#if CUDART_VERSION >= 4000 /* We need thread-safety of CUDA */
+	ret = test_cuda();
+	if (ret == 1)
+		goto fail;
+	else if (ret == 0)
+		skipped_cuda = 0;
+#endif
+#endif
+
+#ifdef STARPU_USE_OPENCL
+	ret = test_opencl();
+	if (ret == 1)
+		goto fail;
+	else if (ret == 0)
+		skipped_opencl = 0;
+#endif
+
+	starpu_shutdown();
+
+	if (skipped_cuda == 1 && skipped_opencl == 1)
+		return STARPU_TEST_SKIPPED;
+
+	return EXIT_SUCCESS;
+
+fail:
+	starpu_shutdown();
+	return EXIT_FAILURE;
+}
+
+#endif /* defined(STARPU_USE_OPENCL) || defined(STARPU_USE_CUDA) */

+ 230 - 0
tests/datawizard/variable_parameters.c

@@ -0,0 +1,230 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2014  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <config.h>
+#include <starpu.h>
+#include "../helper.h"
+
+static starpu_data_handle_t handle1, handle2, handle3, handle4;
+
+/*
+ *	Increment codelet
+ */
+
+#ifdef STARPU_USE_OPENCL
+/* dummy OpenCL implementation */
+static void increment_opencl_kernel(void *descr[], void *cl_arg)
+{
+	int num = starpu_task_get_current()->nbuffers;
+	int i;
+
+	for (i = 0; i < num; i++)
+	{
+		cl_mem d_token = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[i]);
+		unsigned h_token;
+
+		cl_command_queue queue;
+		starpu_opencl_get_current_queue(&queue);
+
+		clEnqueueReadBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
+		h_token++;
+		clEnqueueWriteBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
+		clFinish(queue);
+	}
+}
+#endif
+
+
+#ifdef STARPU_USE_CUDA
+static void increment_cuda_kernel(void *descr[], void *arg)
+{
+	int num = starpu_task_get_current()->nbuffers;
+	int i;
+
+	for (i = 0; i < num; i++)
+	{
+		unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[i]);
+		unsigned host_token;
+
+		/* This is a dummy technique of course */
+		cudaMemcpyAsync(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+		cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+		host_token++;
+
+		cudaMemcpyAsync(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+	}
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+#endif
+
+static void increment_cpu_kernel(void *descr[], void *cl_arg)
+{
+	int num = starpu_task_get_current()->nbuffers;
+	int i;
+
+	for (i = 0; i < num; i++)
+	{
+		unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[i]);
+		*tokenptr = *tokenptr + 1;
+	}
+}
+
+static struct starpu_codelet increment_cl =
+{
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {increment_cuda_kernel, NULL},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {increment_opencl_kernel, NULL},
+#endif
+	.cpu_funcs = {increment_cpu_kernel, NULL},
+	.nbuffers = STARPU_VARIABLE_NBUFFERS,
+};
+
+int main(int argc, char **argv)
+{
+	unsigned *pvar = NULL;
+	int ret;
+	unsigned var1 = 0, var2 = 0, var3 = 0, var4 = 0;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_variable_data_register(&handle1, STARPU_MAIN_RAM, (uintptr_t)&var1, sizeof(unsigned));
+	starpu_variable_data_register(&handle2, STARPU_MAIN_RAM, (uintptr_t)&var2, sizeof(unsigned));
+	starpu_variable_data_register(&handle3, STARPU_MAIN_RAM, (uintptr_t)&var3, sizeof(unsigned));
+	starpu_variable_data_register(&handle4, STARPU_MAIN_RAM, (uintptr_t)&var4, sizeof(unsigned));
+
+#ifdef STARPU_QUICK_CHECK
+	unsigned nloops = 4;
+#else
+	unsigned nloops = 16;
+#endif
+
+	unsigned loop;
+	unsigned t;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		for (t = 0; t <= 4; t++)
+		{
+			struct starpu_task *task = starpu_task_create();
+			unsigned i;
+
+			task->cl = &increment_cl;
+			task->handles[0] = handle1;
+			task->handles[1] = handle2;
+			task->handles[2] = handle3;
+			task->handles[3] = handle4;
+			for (i = 0; i < t; i++)
+				task->modes[i] = STARPU_RW;
+			task->nbuffers = t;
+
+			ret = starpu_task_submit(task);
+			if (ret == -ENODEV) goto enodev;
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		}
+
+		starpu_task_insert(&increment_cl,
+				STARPU_RW, handle1,
+				0);
+		starpu_task_insert(&increment_cl,
+				STARPU_RW, handle1,
+				STARPU_RW, handle2,
+				0);
+		starpu_task_insert(&increment_cl,
+				STARPU_RW, handle1,
+				STARPU_RW, handle2,
+				STARPU_RW, handle3,
+				0);
+		starpu_task_insert(&increment_cl,
+				STARPU_RW, handle1,
+				STARPU_RW, handle2,
+				STARPU_RW, handle3,
+				STARPU_RW, handle4,
+				0);
+	}
+
+	ret = starpu_data_acquire(handle1, STARPU_R);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+	if (var1 != 8*nloops)
+	{
+		FPRINTF(stderr, "[end of loop] Value %u != Expected value %u\n", var1, 8*nloops);
+		starpu_data_release(handle1);
+		goto err;
+	}
+	starpu_data_release(handle1);
+
+	ret = starpu_data_acquire(handle2, STARPU_R);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+	if (var2 != 6*nloops)
+	{
+		FPRINTF(stderr, "[end of loop] Value %u != Expected value %u\n", var2, 6*nloops);
+		starpu_data_release(handle2);
+		goto err;
+	}
+	starpu_data_release(handle2);
+
+	ret = starpu_data_acquire(handle3, STARPU_R);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+	if (var3 != 4*nloops)
+	{
+		FPRINTF(stderr, "[end of loop] Value %u != Expected value %u\n", var3, 4*nloops);
+		starpu_data_release(handle3);
+		goto err;
+	}
+	starpu_data_release(handle3);
+
+	ret = starpu_data_acquire(handle4, STARPU_R);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+	if (var4 != 2*nloops)
+	{
+		FPRINTF(stderr, "[end of loop] Value %u != Expected value %u\n", var4, 2*nloops);
+		starpu_data_release(handle4);
+		goto err;
+	}
+	starpu_data_release(handle4);
+
+	starpu_data_unregister(handle1);
+	starpu_data_unregister(handle2);
+	starpu_data_unregister(handle3);
+	starpu_data_unregister(handle4);
+	starpu_shutdown();
+
+	return EXIT_SUCCESS;
+
+enodev:
+	starpu_data_unregister(handle1);
+	starpu_data_unregister(handle2);
+	starpu_data_unregister(handle3);
+	starpu_data_unregister(handle4);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+
+err:
+	starpu_data_unregister(handle1);
+	starpu_data_unregister(handle2);
+	starpu_data_unregister(handle3);
+	starpu_data_unregister(handle4);
+	starpu_shutdown();
+	return EXIT_FAILURE;
+}

+ 22 - 14
tests/loader.c

@@ -129,28 +129,36 @@ static void test_cleaner(int sig)
 	exit(EXIT_FAILURE);
 }
 
+static int _decode(char **src, char *motif, const char *value)
+{
+	char *found;
+
+	found = strstr(*src, motif);
+	if (found == NULL) return 0;
+
+	char *new_src = malloc((strlen(*src)+strlen(value))*sizeof(char));
+	strcpy(new_src, "");
+
+	strncat(new_src, *src, strlen(*src)-strlen(found));
+	strcat(new_src, value);
+	strcat(new_src, found+strlen(motif));
+
+	*src = strdup(new_src);
+	return 1;
+}
+
 static void decode(char **src, char *motif, const char *value)
 {
 	if (*src)
 	{
-		char *y = strstr(*src, motif);
-		if (y && value == NULL)
+		if (strstr(*src, motif) && value == NULL)
 		{
 			fprintf(stderr, "error: $%s undefined\n", motif);
 			exit(EXIT_FAILURE);
 		}
-		while (y)
-		{
-			char *neo = malloc((strlen(*src)-strlen(motif)+strlen(value)) * sizeof(char));
-			char *to = neo;
-
-			to = strncpy(to, *src, strlen(*src)-strlen(y)); to += strlen(*src)-strlen(y);
-			to = strcpy(to, value); to += strlen(value);
-			strcpy(to, y+strlen(motif));
-
-			*src = strdup(neo);
-			y = strstr(*src, motif);
-		}
+		int d = _decode(src, motif, value);
+		while (d)
+			d = _decode(src, motif, value);
 	}
 }
 

+ 0 - 178
tests/main/deprecated_buffer.c

@@ -1,178 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include <config.h>
-#include <starpu.h>
-#include "../helper.h"
-
-void cpu_codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
-{
-	int *valin = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
-	int *valout = (int *)STARPU_VARIABLE_GET_PTR(descr[1]);
-
-	*valout = *valin;
-}
-
-struct starpu_codelet cl_with_mode =
-{
-	.name = "with_mode",
-	.cpu_funcs = {cpu_codelet, NULL},
-	.cpu_funcs_name = {"cpu_codelet", NULL},
-	.nbuffers = 2,
-	.modes = {STARPU_R, STARPU_W},
-};
-
-struct starpu_codelet cl_without_mode =
-{
-	.name = "without_mode",
-	.cpu_funcs = {cpu_codelet, NULL},
-	.cpu_funcs_name = {"cpu_codelet", NULL},
-	.nbuffers = 2
-};
-
-static
-int submit_codelet_task_insert(struct starpu_codelet cl, starpu_data_handle_t handles0, starpu_data_handle_t handles1)
-{
-	int ret;
-
-	ret = starpu_task_insert(&cl,
-				 STARPU_R, handles0,
-				 STARPU_W, handles1,
-				 0);
-	if (ret == -ENODEV) return ret;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
-
-	starpu_task_wait_for_all();
-	return 0;
-}
-
-static
-int submit_codelet_with_buffers(struct starpu_codelet cl, starpu_data_handle_t handles0, starpu_data_handle_t handles1)
-{
-	int ret;
-	struct starpu_task *task;
-
-	task = starpu_task_create();
-	task->cl = &cl;
-	task->buffers[0].handle = handles0;
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = handles1;
-	task->buffers[1].mode = STARPU_W;
-
-	ret = starpu_task_submit(task);
-	if (ret == -ENODEV) return ret;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-
-	starpu_task_wait_for_all();
-	return 0;
-}
-
-static
-int submit_codelet_with_handles(struct starpu_codelet cl, starpu_data_handle_t handles0, starpu_data_handle_t handles1)
-{
-	int ret;
-	struct starpu_task *task;
-
-	task = starpu_task_create();
-	task->cl = &cl;
-	task->handles[0] = handles0;
-	task->handles[1] = handles1;
-
-	ret = starpu_task_submit(task);
-	if (ret == -ENODEV) return ret;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-
-	starpu_task_wait_for_all();
-	return 0;
-}
-
-struct submit_task_func
-{
-	int (*func)(struct starpu_codelet cl, starpu_data_handle_t handles0, starpu_data_handle_t handles1);
-	char *name;
-};
-
-static
-int submit_codelet(struct starpu_codelet cl, struct submit_task_func func)
-{
-	int *x, *y;
-	starpu_malloc((void**)&x, sizeof(*x));
-	starpu_malloc((void**)&y, sizeof(*y));
-	*x = 42;
-	*y = 14;
-	starpu_data_handle_t handles[2];
-	int ret;
-
-	starpu_variable_data_register(&handles[0], STARPU_MAIN_RAM, (uintptr_t)x, sizeof(*x));
-	starpu_variable_data_register(&handles[1], STARPU_MAIN_RAM, (uintptr_t)y, sizeof(*y));
-
-	ret = func.func(cl, handles[0], handles[1]);
-	starpu_data_unregister(handles[0]);
-	starpu_data_unregister(handles[1]);
-
-	if (!ret)
-	{
-		FPRINTF(stderr, "%s when executing codelet <%s> with func <%s>\n", *x==*y?"success":"error", cl.name, func.name);
-		ret = (*x != *y);
-	}
-
-	starpu_free(x);
-	starpu_free(y);
-
-	return ret;
-}
-
-int main(int argc, char **argv)
-{
-	int ret;
-	struct submit_task_func task_insert = { .func = submit_codelet_task_insert, .name = "task_insert" };
-	struct submit_task_func with_buffers = { .func = submit_codelet_with_buffers, .name = "with_buffers" };
-	struct submit_task_func with_handles = { .func = submit_codelet_with_handles, .name = "with_handles" };
-
-	ret = starpu_initialize(NULL, &argc, &argv);
-	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-
-	ret = submit_codelet(cl_with_mode, task_insert);
-	if (ret == -ENODEV)
-	{
-		starpu_shutdown();
-		fprintf(stderr, "WARNING: No one can execute this task\n");
-		return STARPU_TEST_SKIPPED;
-	}
-
-	if (!ret)
-	{
-		ret = submit_codelet(cl_with_mode, with_buffers);
-	}
-	if (!ret)
-	{
-		ret = submit_codelet(cl_with_mode, with_handles);
-	}
-	if (!ret)
-	{
-		ret = submit_codelet(cl_without_mode, task_insert);
-	}
-	if (!ret)
-	{
-		ret = submit_codelet(cl_without_mode, with_buffers);
-	}
-	// We do not test the combination cl_without_mode with_handles as it is not expected to work
-
-	starpu_shutdown();
-
-	STARPU_RETURN(ret);
-}

+ 3 - 3
tests/main/restart.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2013  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2013-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -30,8 +30,8 @@
   #define N	10
 #endif
 
-struct timeval start;
-struct timeval end;
+static struct timeval start;
+static struct timeval end;
 
 int main(int argc, char **argv)
 {

+ 284 - 0
tools/starpu_paje_summary.Rmd

@@ -0,0 +1,284 @@
+<div id="table-of-contents">
+<h2>Table of Contents</h2>
+<div id="text-table-of-contents">
+<ul>
+<li><a href="#sec-1">1. Introduction</a>
+<ul>
+<li>
+<ul>
+<li><a href="#sec-1-0-1">1.0.1. How to compile</a></li>
+<li><a href="#sec-1-0-2">1.0.2. Software dependencies</a></li>
+</ul>
+</li>
+</ul>
+</li>
+<li><a href="#sec-2">2. Gantt Charts of the whole Trace</a></li>
+<li><a href="#sec-3">3. Table Summary</a></li>
+<li><a href="#sec-4">4. State Duration during the Execution Time</a></li>
+<li><a href="#sec-5">5. Distribution Histograms</a></li>
+</ul>
+</div>
+</div>
+```{r Setup, echo=FALSE}
+opts_chunk$set(echo=FALSE)
+```
+
+
+
+```{r Install_R_libraries}
+InstalledPackage <- function(package) 
+{
+    available <- suppressMessages(suppressWarnings(sapply(package, require, quietly = TRUE, character.only = TRUE, warn.conflicts = FALSE)))
+    missing <- package[!available]
+    if (length(missing) > 0) return(FALSE)
+    return(TRUE)
+}
+
+CRANChoosen <- function()
+{
+    return(getOption("repos")["CRAN"] != "@CRAN@")
+}
+
+UsePackage <- function(package, defaultCRANmirror = "http://cran.at.r-project.org") 
+{
+    if(!InstalledPackage(package))
+    {
+	if(!CRANChoosen())
+	{       
+	    chooseCRANmirror()
+	    if(!CRANChoosen())
+	    {
+		options(repos = c(CRAN = defaultCRANmirror))
+	    }
+	}
+
+	suppressMessages(suppressWarnings(install.packages(package)))
+	if(!InstalledPackage(package)) return(FALSE)
+    }
+    return(TRUE)
+}
+
+# Now install desired libraries
+libraries <- c("ggplot2", "plyr", "data.table", "RColorBrewer")
+for(libr in libraries) 
+{ 
+    if(!UsePackage(libr))
+    {
+	stop("Error!", libr)
+    }
+}
+```
+
+```{r Load_R_files}
+# Load ggplot and plyr just for the following cases
+   library(ggplot2)
+   library(plyr)
+   library(data.table)
+   library(RColorBrewer) 
+
+# Defining non-computation states:
+def_states<-c("Initializing","Deinitializing","Overhead","Nothing","Sleeping","Freeing","Allocating","WritingBack","FetchingInput","PushingOutput","Callback","Progressing","Unpartitioning","AllocatingReuse","Reclaiming","DriverCopy","DriverCopyAsync","Scheduling","Executing")
+
+# Function for reading .csv file
+read_df <- function(file,range1,range2) {
+  df<-read.csv(file, header=FALSE, strip.white=TRUE)
+  names(df) <- c("Nature","ResourceId","Type","Start","End","Duration", "Depth", "Value")
+  df = df[!(names(df) %in% c("Nature","Type", "Depth"))]
+  df$Origin<-as.factor(as.character(file))
+
+# Changing names if needed:
+  df$Value <- as.character(df$Value)
+  df$Value <- ifelse(df$Value == "F", "Freeing", as.character(df$Value))
+  df$Value <- ifelse(df$Value == "A", "Allocating", as.character(df$Value))
+  df$Value <- ifelse(df$Value == "W", "WritingBack", as.character(df$Value))
+  df$Value <- ifelse(df$Value == "No", "Nothing", as.character(df$Value))
+  df$Value <- ifelse(df$Value == "I", "Initializing", as.character(df$Value))
+  df$Value <- ifelse(df$Value == "D", "Deinitializing", as.character(df$Value))
+  df$Value <- ifelse(df$Value == "Fi", "FetchingInput", as.character(df$Value))
+  df$Value <- ifelse(df$Value == "Po", "PushingOutput", as.character(df$Value))
+  df$Value <- ifelse(df$Value == "C", "Callback", as.character(df$Value))
+  df$Value <- ifelse(df$Value == "B", "Overhead", as.character(df$Value))
+  df$Value <- ifelse(df$Value == "Sc", "Scheduling", as.character(df$Value))
+  df$Value <- ifelse(df$Value == "E", "Executing", as.character(df$Value))
+  df$Value <- ifelse(df$Value == "Sl", "Sleeping", as.character(df$Value))
+  df$Value <- ifelse(df$Value == "P", "Progressing", as.character(df$Value))
+  df$Value <- ifelse(df$Value == "U", "Unpartitioning", as.character(df$Value))
+  df$Value <- ifelse(df$Value == "Ar", "AllocatingReuse", as.character(df$Value))
+  df$Value <- ifelse(df$Value == "R", "Reclaiming", as.character(df$Value))
+  df$Value <- ifelse(df$Value == "Co", "DriverCopy", as.character(df$Value))
+  df$Value <- ifelse(df$Value == "CoA", "DriverCopyAsync", as.character(df$Value))
+
+# Small cleanup
+df$Start<-round(df$Start,digit=1)
+df$End<-round(df$End,digit=1)
+df$ResourceId<-as.factor(df$ResourceId)
+df$Value<-as.factor(df$Value)
+
+# Start from zero
+  m <- min(df$Start)
+  df$Start <- df$Start - m
+  df$End <- df$Start+df$Duration
+
+# Return data frame
+  df
+}
+```
+
+```{r Load_traces}
+df<-data.frame()
+if( !exists("input_traces") )
+  input_traces<-c("example.native.trace.csv", "example.simgrid.trace.csv")
+
+for (i in 1:length(input_traces)){
+  dfs<-read_df(input_traces[i])
+  df<-rbindlist(list(df,dfs))
+}
+
+# Color palettes
+colourCount = length(unique(df$Value))
+getPalette = colorRampPalette(brewer.pal(9, "Set1"))
+
+# Order of Value so we can have good colors
+ker_states<-as.character(unique(df[!(df$Value %in% def_states),Value]))
+ordered_states<-append(sort(ker_states), def_states)
+df$Value <- factor(df$Value, levels=ordered_states)
+
+# Order of ResourceId so we can have y-axis
+df$ResourceId <- factor(df$ResourceId, levels=sort(as.character(unique(df$ResourceId))))
+```
+
+# Introduction
+
+This document presents a basic analysis of multiple StarPU
+traces. First, paje *traces* will be transferred into *.csv* files and
+then we analyze them with **R**. This summary is a first step that
+should help researchers verify their hypothesis or find problematic
+areas that require more exhaustive investigation.
+
+Be cautious, as the following results are only a brief analysis of
+the traces and many important phenomena could still be hidden. Also,
+be very careful when comparing different states or traces. Even
+though some large discrepancies can be irrelevant, in other cases
+even the smallest differences can be essential in understanding what
+exactly happened during the StarPU execution.
+
+### How to compile
+
+    ./starpu_summary.sh example.native.trace example.simgrid.trace
+
+### Software dependencies
+
+In order to run this analysis you need to have R installed:
+
+    sudo apt-get install r-base 
+
+Easiest way to transform *paje* traces generated by StarPU to *.csv* is to use *pjdump* program (<https://github.com/schnorr/pajeng>), so we encourage users to install it.
+
+When R is installed, one will need to start R (e.g., from terminal) and install *knitr* package:
+
+    R> install.packages("knitr")
+
+Additional R packages used in this analysis (*ggplot2, plyr, data.table, RColorBrewer*) will be installed automatically when the document is compiled for the first time. If there is any trouble, install them by hand directly from R (the same way as *knitr*)
+
+# Gantt Charts of the whole Trace
+
+First, we show a simple gantt chart of every trace. X-axis is a
+simple timeline of the execution, *Resources* on y-axis correspond
+to different CPUs/GPUs that were used and finally different colors
+represent different *States* of the application.
+
+This kind of figures can often point to the idle time or
+synchronization problems. Small disadvantage is that in most cases
+there are too many states, thus it is impossible to display them all
+on a single plot without aggregation. Therefore for any strange
+behavior at a certain part of the trace, we strongly suggest to zoom
+on the interval it occurred.
+
+```{r Gantt1}
+ggplot(df,aes(x=Start,xend=End, y=factor(ResourceId), yend=factor(ResourceId),color=Value)) + 
+ theme_bw() + scale_color_manual(name="State",values=getPalette(colourCount)) + 
+ geom_segment(size=8) + ylab("Resource") + xlab("Time [ms]") + 
+ facet_wrap(~Origin,ncol=1,scale="free_y")
+```
+
+Second, we will concentrate only on computation kernel states, to
+get rid of visualization artifacts that can be introduced by other
+(sometimes irrelevant) states. Normally, this plot should not be too
+different from the previous one.
+
+```{r Gantt2}
+# Select only computation kernels
+ df1 <- df[!(df$Value %in% c("Initializing","Deinitializing","Overhead","Nothing","Sleeping","Freeing","Allocating","WritingBack","FetchingInput","PushingOutput","Callback","Progressing","Unpartitioning","AllocatingReuse","Reclaiming","DriverCopy","DriverCopyAsync","Scheduling","Executing")),]
+
+# Start from zero
+  m <- min(df1$Start)
+  df1$Start <- df1$Start - m
+  df1$End <- df1$Start+df1$Duration
+
+# Plot
+ ggplot(df1,aes(x=Start,xend=End, y=factor(ResourceId), yend=factor(ResourceId),color=Value)) + 
+  theme_bw() + scale_color_manual(name="State",values=getPalette(colourCount)) + 
+  geom_segment(size=8) + ylab("Resource") + xlab("Time [ms]") + 
+  facet_wrap(~Origin,ncol=1,scale="free_y")
+```
+
+# Table Summary
+
+Here we present how much time application spent in each state
+(OverallDuration), how many times it was in that state (Count),
+mean and median values of duration (Mean and Median), and finally
+what is a standard deviation (StandDev).
+
+General information provided by this table can sometimes give an
+idea to application experts which parts of code are not working as
+desired. Be aware that this kind of tables hide many important
+things, such as outliers, multiple modes, etc.
+
+```{r Table}
+options(width=120)
+ddply(df,.(Value,Origin), summarize, OverallDuration=sum(Duration), Count=length(Duration), Mean=mean(Duration), Median=median(Duration), StandDev=sd(Duration))
+```
+
+# State Duration during the Execution Time
+
+Now, we show how duration of each state was changing during the
+execution. This can display a general behavior of a state; show if
+there are outliers or multiple modes; are some events occurring in
+groups, etc. . It can also suggest a strange behavior of a state
+during a certain time interval, which should be later investigated
+more carefully.
+
+  However, since each event is represented by a single point (and
+there is no "alpha" factor), those events that happen almost
+simultaneously are overplotted. Therefore density of events along
+execution time may not be easy to read.
+
+```{r Dur}
+ggplot(df,aes(x=Start,y=Duration)) + geom_point(aes(color=Value)) + theme_bw() + scale_color_manual(name="State",values=getPalette(colourCount)) + ggtitle("State Duration during the Execution Time") + theme(legend.position="none") + ylab("Duration [ms]") + xlab("Time [ms]") + facet_grid(Value~Origin, scale="free_y")
+```
+
+# Distribution Histograms
+
+Finally, we show a distribution of *Duration* for each state in form
+of histograms. X-axis is partitioned into bins with equidistant time
+intervals in milliseconds, while y-axis represents the number of
+occurrences inside such intervals for a certain state. Note that for
+the first plot y-axis is not fixed, meaning that the scale changes
+from one row to another. This plot allows to not only to see what
+was the most frequent duration of a state, but also to compare
+duration between different states.
+
+```{r Hist1}
+ggplot(df, aes(x=Duration)) + geom_histogram(aes(y=..count..,fill=factor(Value)),binwidth = diff(range(df$Duration))/30) + theme_bw() + scale_fill_manual(name="State",values=getPalette(colourCount)) + ggtitle("Histograms for State Distribution") + ylab("Count") + xlab("Duration [ms]") + theme(legend.position="none") + facet_grid(Value~Origin,scales = "free_y")
+```
+
+Similar to the previous figure, only now traces are showed vertically
+instead of horizontally. Note that for this plot x-axis is not fixed,
+meaning that the scale changes from one column to another. This plot
+allows to compare frequency of different states and in case of
+multiple traces to easily compare duration distribution for each
+state.
+
+```{r Hist2}
+ggplot(df, aes(x=Duration)) + geom_histogram(aes(y=..count..,fill=factor(Value)),binwidth = diff(range(df$Duration))/30) + theme_bw() + scale_fill_manual(name="State",values=getPalette(colourCount)) + ggtitle("Histograms for State Distribution") + ylab("Count") + xlab("Duration [ms]") + theme(legend.position="none") + facet_grid(Origin~Value,scales = "free_x")
+```

+ 109 - 0
tools/starpu_paje_summary.in

@@ -0,0 +1,109 @@
+#!/bin/bash
+
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+# 
+# Copyright (C) 2014  Université Joseph Fourier
+# Copyright (C) 2014  Université Bordeaux
+# 
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+# 
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# 
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+# Script for giving statistical analysis of the paje trace
+
+set -e # fail fast
+
+# File names
+basename="$PWD"
+outputfile="summary.html"
+analysis_script="$(dirname $(which $0))/starpu_paje_summary.Rmd"
+analysis_input=""
+
+# Command line arguments
+inputfiles=""
+
+help_script()
+{
+cat << EOF
+Give statistical analysis of the paje trace
+
+Options:
+   -h      Show this message
+
+Examples:
+$0 example.native.trace
+$0 example.native.trace example.simgrid.trace
+
+Report bugs to <@PACKAGE_BUGREPORT@>
+EOF
+}
+
+if [ "$1" = "--version" ] ; then
+    echo "$PROGNAME (@PACKAGE_NAME@) @PACKAGE_VERSION@"
+    exit 0
+fi
+
+if [ "$1" = "-h" ] || [ "$1" = "--help" ] || [ "$1" = "" ] ; then
+    help_script
+    exit 0
+fi
+
+while getopts "h" opt; do
+  case $opt in
+    \?)
+      echo "Invalid option: -$OPTARG"
+      help_script
+      exit 3
+      ;;
+  esac
+done
+
+# Reading files that need to be analyzed
+shift $((OPTIND - 1))
+inputfiles=$@
+# Error if there is no input files specified
+if [[ $# < 1 ]]; then
+    echo "Error!"
+    help_script
+    exit 2
+fi
+
+#####################################
+# Transforming input files into .csv
+for file in $inputfiles; do
+    if [ ! -s $file ]
+	then
+	echo "Error: file $file does not exist!"
+	exit 5
+    fi
+# Sorting traces
+    grep -e '^\(\(%\)\|\(\(1\|2\|3\|4\|5\|6\|7\|9\)\>\)\)' $file > start.trace
+    grep -e '^\(\(%\)\|\(\(1\|2\|3\|4\|5\|6\|7\|9\|18\|19\)\>\)\)' -v  $file > end.trace
+    sort -s -V --key=2,2 end.trace > endSorted.trace
+    cat start.trace endSorted.trace > outputSorted.trace
+
+# Transferring to .csv
+    pj_dump -n outputSorted.trace > $file.csv
+    perl -i -ne 'print if /^State/' $file.csv
+done
+
+analysis_input=`echo \"$inputfiles".csv\"" | sed 's/  */.csv", "/g'`
+
+#####################################
+# Running analysis file to get actual results
+Rscript -e "library(knitr); input_traces = c($analysis_input) ; outputhtml='$outputfile';\
+            outputRmd = gsub('.html\$','.Rmd',outputhtml);\
+            knit('$analysis_script',output=outputRmd); knitr::knit2html(outputRmd)"
+
+# Cleanup: delete temporary files
+rm -f outputSorted.trace
+rm -f start.trace
+rm -f end.trace
+rm -f endSorted.trace