Browse Source

- merge trunk

Olivier Aumage 11 years ago
parent
commit
3a272669cb
79 changed files with 1086 additions and 858 deletions
  1. 11 1
      ChangeLog
  2. 13 3
      configure.ac
  3. 4 0
      doc/doxygen/chapters/08scheduling.doxy
  4. 63 7
      doc/doxygen/chapters/api/codelet_and_tasks.doxy
  5. 0 4
      doc/doxygen/chapters/api/performance_model.doxy
  6. 5 5
      examples/audio/starpu_audio_processing.c
  7. 5 6
      examples/axpy/axpy.c
  8. 6 6
      examples/cg/cg.c
  9. 11 11
      examples/heat/dw_factolu.c
  10. 6 6
      examples/heat/dw_factolu_grain.c
  11. 6 6
      examples/heat/dw_factolu_tag.c
  12. 5 6
      examples/incrementer/incrementer.c
  13. 6 6
      examples/lu/lu_example.c
  14. 5 5
      examples/lu/xlu.c
  15. 5 5
      examples/lu/xlu_implicit.c
  16. 7 6
      examples/lu/xlu_implicit_pivot.c
  17. 5 5
      examples/lu/xlu_pivot.c
  18. 5 5
      examples/mandelbrot/mandelbrot.c
  19. 6 6
      examples/pi/pi.c
  20. 5 5
      examples/pi/pi_redux.c
  21. 10 10
      examples/ppm_downscaler/yuv_downscaler.c
  22. 18 18
      examples/spmv/dw_block_spmv.c
  23. 4 4
      examples/spmv/spmv.c
  24. 0 2
      include/starpu_perfmodel.h
  25. 2 0
      include/starpu_sched_ctx.h
  26. 27 11
      include/starpu_task.h
  27. 5 5
      mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
  28. 6 6
      mpi/examples/mpi_lu/pxlu.c
  29. 5 5
      mpi/examples/mpi_lu/pxlu_implicit.c
  30. 1 1
      mpi/src/starpu_mpi_task_insert.c
  31. 3 3
      src/common/fxt.h
  32. 44 0
      src/common/utils.c
  33. 5 0
      src/common/utils.h
  34. 6 6
      src/core/dependencies/data_concurrency.c
  35. 67 80
      src/core/dependencies/implicit_data_deps.c
  36. 2 2
      src/core/dependencies/implicit_data_deps.h
  37. 15 6
      src/core/jobs.c
  38. 5 0
      src/core/jobs.h
  39. 14 30
      src/core/perfmodel/perfmodel.c
  40. 26 2
      src/core/perfmodel/perfmodel_bus.c
  41. 14 4
      src/core/perfmodel/perfmodel_history.c
  42. 32 11
      src/core/sched_ctx.c
  43. 6 4
      src/core/sched_policy.c
  44. 3 0
      src/core/simgrid.c
  45. 21 30
      src/core/task.c
  46. 5 5
      src/datawizard/coherency.c
  47. 8 1
      src/datawizard/coherency.h
  48. 3 1
      src/datawizard/filters.c
  49. 2 1
      src/datawizard/footprint.c
  50. 2 0
      src/datawizard/interfaces/block_interface.c
  51. 3 1
      src/datawizard/interfaces/data_interface.c
  52. 7 119
      src/datawizard/interfaces/matrix_interface.c
  53. 1 1
      src/datawizard/interfaces/void_interface.c
  54. 3 1
      src/datawizard/memory_nodes.c
  55. 2 2
      src/datawizard/user_interactions.c
  56. 11 7
      src/debug/traces/starpu_fxt.c
  57. 46 47
      src/drivers/cuda/driver_cuda.c
  58. 30 31
      src/drivers/driver_common/driver_common.c
  59. 3 3
      src/drivers/gordon/driver_gordon.c
  60. 1 1
      src/drivers/mp_common/source_common.c
  61. 44 46
      src/drivers/opencl/driver_opencl.c
  62. 1 1
      src/sched_policies/deque_modeling_policy_data_aware.c
  63. 6 6
      src/sched_policies/locality_work_stealing_policy.c
  64. 2 4
      src/sched_policies/work_stealing_policy.c
  65. 10 5
      src/util/starpu_task_insert.c
  66. 17 6
      src/util/starpu_task_insert_utils.c
  67. 1 1
      src/util/starpu_task_insert_utils.h
  68. 2 2
      src/worker_collection/worker_list.c
  69. 14 18
      src/worker_collection/worker_tree.c
  70. 1 2
      tests/Makefile.am
  71. 230 0
      tests/datawizard/variable_parameters.c
  72. 0 178
      tests/main/deprecated_buffer.c
  73. 3 3
      tests/main/restart.c
  74. 41 0
      tests/main/subgraph_repeat_regenerate_tag.c
  75. 28 0
      tests/main/subgraph_repeat_tag.c
  76. 2 2
      tests/microbenchs/matrix_as_vector.c
  77. 4 4
      tools/starpu_fxt_stats.c
  78. 18 12
      tools/starpu_fxt_tool.c
  79. 15 3
      tools/starpu_perfmodel_plot.c

+ 11 - 1
ChangeLog

@@ -48,6 +48,9 @@ New features:
   * Add CUDA concurrent kernel execution support through
     the STARPU_NWORKER_PER_CUDA environment variable.
   * New locality work stealing scheduler (lws).
+  * Add STARPU_VARIABLE_NBUFFERS to be set in cl.nbuffers, and nbuffers and
+    modes field to the task structure, which permit to define codelets taking a
+    variable number of data.
 
 Small features:
   * Tasks can now have a name (via the field const char *name of
@@ -83,6 +86,8 @@ Changes:
     starpu_data_set_tag(), data are received as a raw memory)
   * StarPU-MPI: Fix for being able to receive data with the same tag
     from several nodes (see mpi/tests/gather.c)
+  * Remove the long-deprecated cost_model fields and task->buffers field.
+  * Fix complexity of implicit task/data dependency, from quadratic to linear.
 
 Small changes:
   * Rename function starpu_trace_user_event() as
@@ -95,8 +100,13 @@ The scheduling context release
 New features:
   * One can register an existing on-GPU buffer to be used by a handle.
   * Add the starpu_paje_summary statistics tool.
+  * Enable gpu-gpu transfers for matrices.
 
-StarPU 1.1.2 (svn revision xxx)
+Small changes:
+  * Lock performance model files while writing and reading them to avoid
+    issues on parallel launches, MPI runs notably.
+
+StarPU 1.1.2 (svn revision 13011)
 ==============================================
 The scheduling context release
 

+ 13 - 3
configure.ac

@@ -54,8 +54,8 @@ AC_CANONICAL_SYSTEM
 dnl Automake 1.11 introduced `silent-rules' and `color-tests'.  Use them
 dnl when they're available.
 m4_ifdef([AM_SILENT_RULES],
-  [AM_INIT_AUTOMAKE([1.11 -Wall foreign silent-rules color-tests parallel-tests subdir-objects])],
-  [AM_INIT_AUTOMAKE([1.10 -Wall foreign subdir-objects])])
+  [AM_INIT_AUTOMAKE([1.11 -Wall foreign silent-rules color-tests parallel-tests])],
+  [AM_INIT_AUTOMAKE([1.10 -Wall foreign])])
 
 m4_ifdef([AM_SILENT_RULES],
   [AM_SILENT_RULES(yes)])
@@ -88,8 +88,18 @@ AC_C_RESTRICT
 AC_CHECK_PROGS([BASH], [bash])
 
 # Check whether subversion is installed
+AC_PATH_PROG(svncommand, svn)
 AC_PATH_PROG(svnversioncommand, svnversion)
 
+# find out if we are are in a subversion directory
+svndir=0
+if test "$svncommand" != "" ; then
+   $svncommand info $srcdir >/dev/null 2>&1
+   if test $? -eq 0; then
+      svndir=1
+   fi
+fi
+
 # use svnversion to record the current repository revision only if
 # subversion is installed and we are in a working copy
 if test "$svnversioncommand" = "" || test "`LC_ALL=C $svnversioncommand -n $srcdir`" = "exported" ; then
@@ -1761,7 +1771,7 @@ AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
 # If the user specifically asks for it, or if we are in a developer checkout, we enable mpi check
 AC_ARG_ENABLE(mpi-check, AC_HELP_STRING([--enable-mpi-check], [Enable execution of MPI testcases]))
 running_mpi_check=no
-if test -d "$srcdir/.svn" -o -d "$srcdir/.git" ; then
+if test $svndir = 1 -o -d "$srcdir/.git" ; then
     running_mpi_check=yes
 fi
 if test x$enable_mpi_check = xyes ; then

+ 4 - 0
doc/doxygen/chapters/08scheduling.doxy

@@ -128,6 +128,10 @@ the user can for instance run a given task a thousand times, measure the global
 consumption for that series of tasks, divide it by a thousand, repeat for
 varying kinds of tasks and task sizes, and eventually feed StarPU
 with these manual measurements through starpu_perfmodel_update_history().
+For instance, for CUDA devices, <c>nvidia-smi -q -d POWER</c> can be used to get
+the current consumption in Watt. Multiplying that value by the average duration
+of a single task gives the consumption of the task in Joules, which can be given
+to starpu_perfmodel_update_history().
 
 \section StaticScheduling Static Scheduling
 

+ 63 - 7
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -115,6 +115,11 @@ Defines the maximum number of buffers that tasks will be able to take
 as parameters. The default value is 8, it can be changed by using the
 configure option \ref enable-maxbuffers "--enable-maxbuffers".
 
+\def STARPU_VARIABLE_NBUFFERS
+\ingroup API_Codelet_And_Tasks
+Value to set in starpu_codelet::nbuffers to specify that the codelet can accept
+a variable number of buffers, specified in starpu_task::nbuffers.
+
 \typedef starpu_cpu_func_t
 \ingroup API_Codelet_And_Tasks
 CPU implementation of a codelet.
@@ -270,11 +275,14 @@ starpu_codelet::cpu_funcs_name is non-NULL, in which case StarPU will
 simply make a symbol lookup to get the implementation.
 
 \var starpu_codelet::nbuffers
-Specify the number of arguments taken by the codelet. These arguments
-are managed by the DSM and are accessed from the <c>void *buffers[]</c>
-array. The constant argument passed with the field starpu_task::cl_arg
-is not counted in this number. This value should not be above
-\ref STARPU_NMAXBUFS.
+Specify the number of arguments taken by the codelet. These arguments are
+managed by the DSM and are accessed from the <c>void *buffers[]</c> array. The
+constant argument passed with the field starpu_task::cl_arg is not counted in
+this number. This value should not be above \ref STARPU_NMAXBUFS. It may be set
+to STARPU_VARIABLE_NBUFFERS to specify that the number of buffers and their
+access modes will be set in starpu_task::nbuffers and starpu_task::modes or
+starpu_task::dyn_modes, which thus permits to define codelets with a varying
+number of data.
 
 \var starpu_codelet::modes
 Is an array of ::starpu_data_access_mode. It describes the required
@@ -362,27 +370,33 @@ starpu_task_create(), or declared statically. In the latter case, the
 programmer has to zero the structure starpu_task and to fill the
 different fields properly. The indicated default values correspond to
 the configuration of a task allocated with starpu_task_create().
+
 \var starpu_task::name
 Optional name of the task. This can be useful for debugging
 purposes.
+
 \var starpu_task::cl
 Is a pointer to the corresponding structure starpu_codelet. This
 describes where the kernel should be executed, and supplies the
 appropriate implementations. When set to NULL, no code is executed
 during the tasks, such empty tasks can be useful for synchronization
 purposes.
-\var starpu_task::buffers
-\deprecated
 This field has been made deprecated. One should use instead the
 field starpu_task::handles to specify the data handles accessed
 by the task. The access modes are now defined in the field
 starpu_codelet::modes.
+
+\var starpu_task::nbuffers
+Specifies the number of buffers. This is only used when starpu_codelet::nbuffers
+is STARPU_VARIABLE_NBUFFERS.
+
 \var starpu_task::handles
 Is an array of ::starpu_data_handle_t. It specifies the handles to the
 different pieces of data accessed by the task. The number of entries
 in this array must be specified in the field starpu_codelet::nbuffers,
 and should not exceed \ref STARPU_NMAXBUFS. If unsufficient, this value can
 be set with the configure option \ref enable-maxbuffers "--enable-maxbuffers".
+
 \var starpu_task::dyn_handles
 Is an array of ::starpu_data_handle_t. It specifies the handles to the
 different pieces of data accessed by the task. The number of entries
@@ -401,6 +415,25 @@ The actual data pointers to the memory node where execution will
 happen, managed by the DSM. Is used when the field
 starpu_task::dyn_handles is defined.
 
+\var starpu_task::modes
+Is used only when starpu_codelet::nbuffers is STARPU_VARIABLE_NBUFFERS.
+It is an array of ::starpu_data_access_mode. It describes the required
+access modes to the data neeeded by the codelet (e.g. ::STARPU_RW). The
+number of entries in this array must be specified in the field
+starpu_task::nbuffers, and should not exceed \ref STARPU_NMAXBUFS. If
+unsufficient, this value can be set with the configure option
+\ref enable-maxbuffers "--enable-maxbuffers".
+
+\var starpu_task::dyn_modes
+Is used only when starpu_codelet::nbuffers is STARPU_VARIABLE_NBUFFERS.
+It is an array of ::starpu_data_access_mode. It describes the required
+access modes to the data needed by the codelet (e.g. ::STARPU_RW).
+The number of entries in this array must be specified in the field
+starpu_codelet::nbuffers. This field should be used for codelets having a
+number of datas greater than \ref STARPU_NMAXBUFS (see \ref
+SettingTheDataHandlesForATask). When defining a codelet, one
+should either define this field or the field starpu_task::modes defined above.
+
 \var starpu_task::cl_arg
 Optional pointer which is passed to the codelet through the second
 argument of the codelet implementation (e.g. starpu_codelet::cpu_func
@@ -612,6 +645,11 @@ It is possible to initialize statically allocated tasks with
 this value. This is equivalent to initializing a structure starpu_task
 with the function starpu_task_init() function.
 
+\def STARPU_TASK_GET_NBUFFERS(task)
+\ingroup API_Codelet_And_Tasks
+Return the number of buffers for this task, i.e. starpu_codelet::nbuffers, or
+starpu_task::nbuffers if the former is STARPU_VARIABLE_BUFFERS.
+
 \def STARPU_TASK_GET_HANDLE(task, i)
 \ingroup API_Codelet_And_Tasks
 Return the \p i th data handle of the given task. If the task
@@ -647,6 +685,24 @@ starpu_codelet::modes or the \p i th element of the field
 starpu_codelet::dyn_modes (see \ref
 SettingTheDataHandlesForATask)
 
+\def STARPU_TASK_GET_MODE(codelet, i)
+\ingroup API_Codelet_And_Tasks
+Return the access mode of the \p i th data handle of the given
+task. If the task is defined with a static or dynamic number of
+handles, will either return the \p i th element of the field
+starpu_task::modes or the \p i th element of the field
+starpu_task::dyn_modes (see \ref
+SettingTheDataHandlesForATask)
+
+\def STARPU_TASK_SET_MODE(task, mode, i)
+\ingroup API_Codelet_And_Tasks
+Set the access mode of the \p i th data handle of the given
+task. If the task is defined with a static or dynamic number of
+handles, will either set the \p i th element of the field
+starpu_task::modes or the \p i th element of the field
+starpu_task::dyn_modes (see \ref
+SettingTheDataHandlesForATask)
+
 \fn struct starpu_task *starpu_task_create(void)
 \ingroup API_Codelet_And_Tasks
 Allocate a task structure and initialize it with default

+ 0 - 4
doc/doxygen/chapters/api/performance_model.doxy

@@ -94,8 +94,6 @@ arch-specific factor.
 is the symbol name for the performance model, which will be used as
 file name to store the model. It must be set otherwise the model will
 be ignored.
-\var starpu_perfmodel::cost_model
-\deprecated
 This field is deprecated. Use instead the field starpu_perfmodel::cost_function field.
 \var starpu_perfmodel::cost_function
 Used by ::STARPU_COMMON: takes a task and implementation number, and
@@ -158,8 +156,6 @@ number of sample values for non-linear regression
 contains information about the performance model of a given
 arch.
 \ingroup API_Performance_Model
-\var starpu_perfmodel_per_arch::cost_model
-\deprecated
 This field is deprecated. Use instead the field
 starpu_perfmodel_per_arch::cost_function.
 \var starpu_perfmodel_per_arch::cost_function

+ 5 - 5
examples/audio/starpu_audio_processing.c

@@ -59,8 +59,8 @@ float *A;
 starpu_data_handle_t A_handle;
 
 /* For performance evaluation */
-static struct timeval start;
-static struct timeval end;
+static double start;
+static double end;
 static unsigned task_per_worker[STARPU_NMAXWORKERS] = {0};
 
 /* 
@@ -426,7 +426,7 @@ int main(int argc, char **argv)
 	for (iter = 0; iter < niter; iter++)
 		starpu_data_set_wt_mask(starpu_data_get_sub_data(A_handle, 1, iter), 1<<0);
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	for (iter = 0; iter < niter; iter++)
 	{
@@ -435,9 +435,9 @@ int main(int argc, char **argv)
 
 	starpu_task_wait_for_all();
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	fprintf(stderr, "Computation took %2.2f ms\n", timing/1000);
 
 	int worker;

+ 5 - 6
examples/axpy/axpy.c

@@ -166,10 +166,10 @@ int main(int argc, char **argv)
 	starpu_data_partition(_handle_x, &block_filter);
 	starpu_data_partition(_handle_y, &block_filter);
 
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	unsigned b;
 	for (b = 0; b < NBLOCKS; b++)
@@ -202,9 +202,8 @@ enodev:
 	starpu_data_unregister(_handle_x);
 	starpu_data_unregister(_handle_y);
 
-	gettimeofday(&end, NULL);
-        double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +
-                                        (end.tv_usec - start.tv_usec));
+	end = starpu_timing_now();
+        double timing = end - start;
 
 	FPRINTF(stderr, "timing -> %2.2f us %2.2f MB/s\n", timing, 3*N*sizeof(TYPE)/timing);
 

+ 6 - 6
examples/cg/cg.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2012, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -294,9 +294,9 @@ static int cg(void)
 	FPRINTF(stderr, "*************** INITIAL ************ \n");
 	FPRINTF(stderr, "Delta 0: %e\n", delta_new);
 
-	struct timeval start;
-	struct timeval end;
-	gettimeofday(&start, NULL);
+	double start;
+	double end;
+	start = starpu_timing_now();
 
 	while ((i < i_max) && ((double)delta_new > (double)(eps*eps*delta_0)))
 	{
@@ -351,9 +351,9 @@ static int cg(void)
 		i++;
 	}
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	double timing = (double)(((double)end.tv_sec - (double)start.tv_sec)*10e6 + ((double)end.tv_usec - (double)start.tv_usec));
+	double timing = end - start;
 	FPRINTF(stderr, "Total timing : %2.2f seconds\n", timing/10e6);
 	FPRINTF(stderr, "Seconds per iteration : %2.2e\n", timing/10e6/i);
 	return 0;

+ 11 - 11
examples/heat/dw_factolu.c

@@ -30,12 +30,12 @@ struct starpu_perfmodel model_12;
 struct starpu_perfmodel model_21;
 struct starpu_perfmodel model_22;
 
-unsigned *advance_11; /* size nblocks, whether the 11 task is done */
-unsigned *advance_12_21; /* size nblocks*nblocks */
-unsigned *advance_22; /* array of nblocks *nblocks*nblocks */
+static unsigned *advance_11; /* size nblocks, whether the 11 task is done */
+static unsigned *advance_12_21; /* size nblocks*nblocks */
+static unsigned *advance_22; /* array of nblocks *nblocks*nblocks */
 
-struct timeval start;
-struct timeval end;
+static double start;
+static double end;
 
 static unsigned no_prio = 0;
 
@@ -618,7 +618,7 @@ void dw_codelet_facto(starpu_data_handle_t dataA, unsigned nblocks)
 	args->nblocks = nblocks;
 	args->dataA = dataA;
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	/* inject a new task with this codelet into the system */ 
 	struct starpu_task *task = starpu_task_create();
@@ -635,9 +635,9 @@ void dw_codelet_facto(starpu_data_handle_t dataA, unsigned nblocks)
 
 	starpu_task_wait_for_all();
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 
@@ -664,7 +664,7 @@ void dw_codelet_facto_v2(starpu_data_handle_t dataA, unsigned nblocks)
 	args->nblocks = nblocks;
 	args->dataA = dataA;
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	/* inject a new task with this codelet into the system */ 
 	struct starpu_task *task = starpu_task_create();
@@ -685,9 +685,9 @@ void dw_codelet_facto_v2(starpu_data_handle_t dataA, unsigned nblocks)
 
 	starpu_task_wait_for_all();
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 

+ 6 - 6
examples/heat/dw_factolu_grain.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011, 2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
@@ -345,18 +345,18 @@ void dw_factoLU_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks,
 	memcpy(Asaved, matA, ld*ld*sizeof(float));
 #endif
 
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 	/* schedule the codelet */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	/* that's only ok for powers of 2 yet ! */
 	dw_factoLU_grain_inner(matA, size, (size/nblocks) * nbigblocks, ld, size/nblocks, 0);
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 

+ 6 - 6
examples/heat/dw_factolu_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011, 2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
@@ -222,8 +222,8 @@ static void dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 {
 	int ret;
 
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 	struct starpu_task *entry_task = NULL;
 
@@ -261,7 +261,7 @@ static void dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 	}
 
 	/* schedule the codelet */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 	ret = starpu_task_submit(entry_task);
 	if (STARPU_UNLIKELY(ret == -ENODEV))
 	{
@@ -274,9 +274,9 @@ static void dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 	/* stall the application until the end of computations */
 	starpu_tag_wait(TAG11(nblocks-1));
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	printf("%2.2f\n", timing/1000);
 

+ 5 - 6
examples/incrementer/incrementer.c

@@ -80,10 +80,10 @@ int main(int argc, char **argv)
 		.name = "increment"
 	};
 
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	unsigned i;
 	for (i = 0; i < niter; i++)
@@ -109,7 +109,7 @@ int main(int argc, char **argv)
 	/* update the array in RAM */
 	starpu_data_unregister(float_array_handle);
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
 	FPRINTF(stderr, "array -> %f, %f, %f, %f\n", float_array[0],
                 float_array[1], float_array[2], float_array[3]);
@@ -120,8 +120,7 @@ int main(int argc, char **argv)
 		ret = 1;
 	}
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +
-					(end.tv_usec - start.tv_usec));
+	double timing = end - start;
 
 	FPRINTF(stderr, "%u elems took %f ms\n", niter, timing/1000);
 

+ 6 - 6
examples/lu/lu_example.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -346,16 +346,16 @@ int main(int argc, char **argv)
 		}
 		else
 		{
-			struct timeval start;
-			struct timeval end;
+			double start;
+			double end;
 
-			gettimeofday(&start, NULL);
+			start = starpu_timing_now();
 
 			ret = STARPU_LU(lu_decomposition_pivot)(A, ipiv, size, size, nblocks);
 
-			gettimeofday(&end, NULL);
+			end = starpu_timing_now();
 
-			double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+			double timing = end - start;
 
 			unsigned n = size;
 			double flop = (2.0f*n*n*n)/3.0f;

+ 5 - 5
examples/lu/xlu.c

@@ -170,8 +170,8 @@ static int create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, un
 static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 {
 	int ret;
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 	struct starpu_task *entry_task = NULL;
 
@@ -213,7 +213,7 @@ static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 	}
 
 	/* schedule the codelet */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 	ret = starpu_task_submit(entry_task);
 	if (ret == -ENODEV) return ret;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
@@ -221,9 +221,9 @@ static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 	/* stall the application until the end of computations */
 	starpu_tag_wait(TAG11(nblocks-1));
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 

+ 5 - 5
examples/lu/xlu_implicit.c

@@ -110,14 +110,14 @@ static int create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, un
 
 static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 {
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 	int ret;
 
 	/* create all the DAG nodes */
 	unsigned i,j,k;
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	for (k = 0; k < nblocks; k++)
 	{
@@ -142,9 +142,9 @@ static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 	/* stall the application until the end of computations */
 	starpu_task_wait_for_all();
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 

+ 7 - 6
examples/lu/xlu_implicit_pivot.c

@@ -155,14 +155,15 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 				  starpu_data_handle_t (* get_block)(starpu_data_handle_t *, unsigned, unsigned, unsigned),
 				  double *timing)
 {
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 	int ret;
 
-	gettimeofday(&start, NULL);
-
 	/* create all the DAG nodes */
 	unsigned i,j,k;
+
+	start = starpu_timing_now();
+
 	for (k = 0; k < nblocks; k++)
 	{
 	     ret = create_task_11_pivot(dataAp, nblocks, k, piv_description, get_block);
@@ -196,9 +197,9 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 	/* stall the application until the end of computations */
 	starpu_task_wait_for_all();
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	*timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	*timing = end - start;
 	return 0;
 }
 

+ 5 - 5
examples/lu/xlu_pivot.c

@@ -232,8 +232,8 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 {
 	int ret;
 
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 	struct starpu_task *entry_task = NULL;
 
@@ -298,7 +298,7 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 	}
 
 	/* schedule the codelet */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 	ret = starpu_task_submit(entry_task);
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
@@ -307,9 +307,9 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 /*	starpu_task_wait_for_all(); */
 	free(tags);
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	*timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	*timing = end - start;
 	return 0;
 }
 

+ 5 - 5
examples/mandelbrot/mandelbrot.c

@@ -506,10 +506,10 @@ int main(int argc, char **argv)
 
 	unsigned iter = 0;
 
-	struct timeval start, end;
+	double start, end;
 
 	if (demo)
-		gettimeofday(&start, NULL);
+		start = starpu_timing_now();
 
 	while (niter-- != 0)
 	{
@@ -573,15 +573,15 @@ int main(int argc, char **argv)
 				topY = -49.35016705749115;
 				bottomY = 49.64891691946615;
 
-				gettimeofday(&end, NULL);
-				double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+				end = starpu_timing_now();
+				double timing = end - start;
 
 				fprintf(stderr, "Time to generate %u frames : %f s\n", iter, timing/1000000.0);
 				fprintf(stderr, "Average FPS: %f\n", ((double)iter*1e+6)/timing);
 
 				/* Reset counters */
 				iter = 0;
-				gettimeofday(&start, NULL);
+				start = starpu_timing_now();
 			}
 			else
 			{

+ 6 - 6
examples/pi/pi.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2011, 2013-2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
@@ -156,10 +156,10 @@ int main(int argc, char **argv)
 	
 	starpu_data_partition(cnt_array_handle, &f);
 
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	for (i = 0; i < ntasks; i++)
 	{
@@ -188,9 +188,9 @@ int main(int argc, char **argv)
 	for (i = 0; i < ntasks; i++)
 		total_cnt += cnt_array[i];
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 
 	unsigned long total_shot_cnt = ntasks * nshot_per_task;
 

+ 5 - 5
examples/pi/pi_redux.c

@@ -340,8 +340,8 @@ int main(int argc, char **argv)
 	starpu_data_set_reduction_methods(shot_cnt_handle,
 					&redux_codelet, &init_codelet);
 
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 	for (i = 0; i < ntasks_warmup; i++)
 	{
@@ -357,7 +357,7 @@ int main(int argc, char **argv)
 	}
 
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	for (i = 0; i < ntasks; i++)
 	{
@@ -375,8 +375,8 @@ int main(int argc, char **argv)
 	starpu_data_unregister(shot_cnt_handle);
 	starpu_data_unregister(xy_scratchpad_handle);
 
-	gettimeofday(&end, NULL);
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	end = starpu_timing_now();
+	double timing = end - start;
 	/* Total surface : Pi * r^ 2 = Pi*1^2, total square surface : 2^2 = 4,
 	 * probability to impact the disk: pi/4 */
 	unsigned long total = (ntasks + ntasks_warmup)*nshot_per_task;

+ 10 - 10
examples/ppm_downscaler/yuv_downscaler.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2011, 2013-2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
@@ -27,13 +27,13 @@
 
 #include "yuv_downscaler.h"
 
-struct timeval start;
-struct timeval end;
+static double start;
+static double end;
 
-const char *filename_in_default = "hugefile.2s.yuv";
-const char *filename_out_default = "hugefile.2s.out.yuv";
-char filename_in[1024];
-char filename_out[1024];
+static const char *filename_in_default = "hugefile.2s.yuv";
+static const char *filename_out_default = "hugefile.2s.out.yuv";
+static char filename_in[1024];
+static char filename_out[1024];
 
 void parse_args(int argc, char **argv)
 {
@@ -206,7 +206,7 @@ int main(int argc, char **argv)
 	unsigned ntasks = (nblocks_y + 2*nblocks_uv)*nframes;
 
 	fprintf(stderr, "Start computation: there will be %u tasks for %u frames\n", ntasks, nframes);
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	/* do the computation */
 	for (frame = 0; frame < nframes; frame++)
@@ -275,9 +275,9 @@ int main(int argc, char **argv)
 	/* There is an implicit barrier: the unregister methods will block
 	 * until the computation is done and that the result was put back into
 	 * memory. */
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	fprintf(stderr, "Computation took %f seconds\n", timing/1000000);
 	fprintf(stderr, "FPS %f\n", (1000000*nframes)/timing);
 

+ 18 - 18
examples/spmv/dw_block_spmv.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,25 +21,25 @@
 #include "matrix_market/mm_to_bcsr.h"
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
 
-struct timeval start;
-struct timeval end;
+static double start;
+static double end;
 
-sem_t sem;
+static sem_t sem;
 
-unsigned c = 256;
-unsigned r = 256;
+static unsigned c = 256;
+static unsigned r = 256;
 
-unsigned remainingtasks = -1;
+static unsigned remainingtasks = -1;
 
-starpu_data_handle_t sparse_matrix;
-starpu_data_handle_t vector_in, vector_out;
+static starpu_data_handle_t sparse_matrix;
+static starpu_data_handle_t vector_in, vector_out;
 
-uint32_t size;
-char *inputfile;
-bcsr_t *bcsr_matrix;
+static uint32_t size;
+static char *inputfile;
+static bcsr_t *bcsr_matrix;
 
-float *vector_in_ptr;
-float *vector_out_ptr;
+static float *vector_in_ptr;
+static float *vector_out_ptr;
 
 void create_data(void)
 {
@@ -96,7 +96,7 @@ void init_problem_callback(void *arg)
 	if ( val == 0 )
 	{
 		printf("DONE ...\n");
-		gettimeofday(&end, NULL);
+		end = starpu_timing_now();
 
 /*		starpu_data_unpartition(sparse_matrix, STARPU_MAIN_RAM); */
 		starpu_data_unpartition(vector_out, STARPU_MAIN_RAM);
@@ -181,7 +181,7 @@ void launch_spmv_codelets(void)
 	uint32_t *rowptr = starpu_bcsr_get_local_rowptr(sparse_matrix);
 	uint32_t *colind = starpu_bcsr_get_local_colind(sparse_matrix);
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	unsigned loop;
 	for (loop = 0; loop < NSPMV; loop++)
@@ -318,7 +318,7 @@ int main(STARPU_ATTRIBUTE_UNUSED int argc,
 
 	double totalflop = 2.0*c*r*totaltasks;
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 	FPRINTF(stderr, "Flop %e\n", totalflop);

+ 4 - 4
examples/spmv/spmv.c

@@ -115,7 +115,7 @@ int main(int argc, char **argv)
 	int ret;
 	unsigned part;
 	double timing;
-	struct timeval start, end;
+	double start, end;
 	unsigned row, pos;
 	unsigned ind;
 
@@ -213,7 +213,7 @@ int main(int argc, char **argv)
 	compile_spmv_opencl_kernel();
 #endif
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	/*
 	 *	Create and submit StarPU tasks
@@ -236,7 +236,7 @@ int main(int argc, char **argv)
 	}
 
 	starpu_task_wait_for_all();
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
 	/*
 	 *	Unregister the CSR matrix and the output vector
@@ -270,7 +270,7 @@ int main(int argc, char **argv)
 	 */
 	starpu_shutdown();
 
-	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	timing = end - start;
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 

+ 0 - 2
include/starpu_perfmodel.h

@@ -93,7 +93,6 @@ struct starpu_perfmodel_history_table;
 
 struct starpu_perfmodel_per_arch
 {
-	double (*cost_model)(struct starpu_data_descr *t) STARPU_DEPRECATED;
 	double (*cost_function)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 	size_t (*size_base)(struct starpu_task *, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 
@@ -119,7 +118,6 @@ struct starpu_perfmodel
 {
 	enum starpu_perfmodel_type type;
 
-	double (*cost_model)(struct starpu_data_descr *) STARPU_DEPRECATED;
 	double (*cost_function)(struct starpu_task *, unsigned nimpl);
 
 	size_t (*size_base)(struct starpu_task *, unsigned nimpl);

+ 2 - 0
include/starpu_sched_ctx.h

@@ -135,6 +135,8 @@ void starpu_sched_ctx_revert_task_counters(unsigned sched_ctx_id, double flops);
 
 void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_ctx);
 
+int starpu_sched_ctx_get_worker_rank(unsigned sched_ctx_id);
+
 #ifdef STARPU_USE_SC_HYPERVISOR
 void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
 #endif /* STARPU_USE_SC_HYPERVISOR */

+ 27 - 11
include/starpu_task.h

@@ -82,6 +82,8 @@ typedef starpu_scc_kernel_t (*starpu_scc_func_t)(void);
 #define STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS   ((starpu_cuda_func_t) -1)
 #define STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS ((starpu_opencl_func_t) -1)
 
+#define STARPU_VARIABLE_NBUFFERS (-1)
+
 struct starpu_task;
 struct starpu_codelet
 {
@@ -104,7 +106,7 @@ struct starpu_codelet
 
 	char *cpu_funcs_name[STARPU_MAXIMPLEMENTATIONS];
 
-	unsigned nbuffers;
+	int nbuffers;
 	enum starpu_data_access_mode modes[STARPU_NMAXBUFS];
 	enum starpu_data_access_mode *dyn_modes;
 
@@ -126,14 +128,15 @@ struct starpu_task
 
 	struct starpu_codelet *cl;
 
-	/* TODO: remove someday, this is costly */
-	struct starpu_data_descr buffers[STARPU_NMAXBUFS] STARPU_DEPRECATED;
+	int nbuffers;
 
 	starpu_data_handle_t handles[STARPU_NMAXBUFS];
 	void *interfaces[STARPU_NMAXBUFS];
+	enum starpu_data_access_mode modes[STARPU_NMAXBUFS];
 
 	starpu_data_handle_t *dyn_handles;
 	void **dyn_interfaces;
+	enum starpu_data_access_mode *dyn_modes;
 
 	void *cl_arg;
 	size_t cl_arg_size;
@@ -166,12 +169,12 @@ struct starpu_task
 	unsigned destroy:1;
 	unsigned regenerate:1;
 
+	unsigned workerid;
+
 	unsigned scheduled:1;
 
 	unsigned int mf_skip:1;
 
-	unsigned workerid;
-
 	int priority;
 
 	enum starpu_task_status status;
@@ -224,17 +227,30 @@ struct starpu_task
 	.scheduled = 0,					\
 	.dyn_handles = NULL,				\
 	.dyn_interfaces = NULL,				\
+	.dyn_modes = NULL,				\
 	.name = NULL                        		\
 }
 
-#define STARPU_TASK_GET_HANDLE(task, i) ((task->dyn_handles) ? task->dyn_handles[i] : task->handles[i])
-#define STARPU_TASK_SET_HANDLE(task, handle, i) do { if (task->dyn_handles) task->dyn_handles[i] = handle; else task->handles[i] = handle; } while(0)
+#define STARPU_TASK_GET_NBUFFERS(task) ((unsigned)((task)->cl->nbuffers == STARPU_VARIABLE_NBUFFERS ? ((task)->nbuffers) : ((task)->cl->nbuffers)))
+
+#define STARPU_TASK_GET_HANDLE(task, i) (((task)->dyn_handles) ? (task)->dyn_handles[i] : (task)->handles[i])
+#define STARPU_TASK_SET_HANDLE(task, handle, i) do { if ((task)->dyn_handles) (task)->dyn_handles[i] = handle; else (task)->handles[i] = handle; } while(0)
+
+#define STARPU_CODELET_GET_MODE(codelet, i) (((codelet)->dyn_modes) ? (codelet)->dyn_modes[i] : (codelet)->modes[i])
+#define STARPU_CODELET_SET_MODE(codelet, mode, i) do { if ((codelet)->dyn_modes) (codelet)->dyn_modes[i] = mode; else (codelet)->modes[i] = mode; } while(0)
 
-#define STARPU_CODELET_GET_MODE(codelet, i) ((codelet->dyn_modes) ? codelet->dyn_modes[i] : codelet->modes[i])
-#define STARPU_CODELET_SET_MODE(codelet, mode, i) do { if (codelet->dyn_modes) codelet->dyn_modes[i] = mode; else codelet->modes[i] = mode; } while(0)
+#define STARPU_TASK_GET_MODE(task, i) ((task)->cl->nbuffers == STARPU_VARIABLE_NBUFFERS ? \
+						(((task)->dyn_modes) ? (task)->dyn_modes[i] : (task)->modes[i]) : \
+						STARPU_CODELET_GET_MODE((task)->cl, i) )
+#define STARPU_TASK_SET_MODE(task, mode, i) do { \
+					if ((task)->cl->nbuffers == STARPU_VARIABLE_NBUFFERS) \
+						if ((task)->dyn_modes) (task)->dyn_modes[i] = mode; else (task)->modes[i] = mode; \
+					else \
+						STARPU_CODELET_SET_MODE((task)->cl, mode, i); \
+					} while(0)
 
-#define STARPU_CODELET_GET_NODE(codelet, i) ((codelet->dyn_nodes) ? codelet->dyn_nodes[i] : codelet->nodes[i])
-#define STARPU_CODELET_SET_NODE(codelet, __node, i) do { if (codelet->dyn_nodes) codelet->dyn_nodes[i] = __node; else codelet->nodes[i] = __node; } while(0)
+#define STARPU_CODELET_GET_NODE(codelet, i) (((codelet)->dyn_nodes) ? (codelet)->dyn_nodes[i] : (codelet)->nodes[i])
+#define STARPU_CODELET_SET_NODE(codelet, __node, i) do { if ((codelet)->dyn_nodes) (codelet)->dyn_nodes[i] = __node; else (codelet)->nodes[i] = __node; } while(0)
 
 void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...);
 void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t *array);

+ 5 - 5
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -67,8 +67,8 @@ static struct starpu_codelet cl22 =
  */
 void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing, double *flops)
 {
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 	starpu_data_handle_t **data_handles;
 	unsigned x,y,i,j,k;
 
@@ -104,7 +104,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	}
 
 	starpu_mpi_barrier(MPI_COMM_WORLD);
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	for (k = 0; k < nblocks; k++)
 	{
@@ -161,11 +161,11 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	free(data_handles);
 
 	starpu_mpi_barrier(MPI_COMM_WORLD);
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
 	if (rank == 0)
 	{
-		*timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+		*timing = end - start;
 		*flops = (1.0f*size*size*size)/3.0f;
 	}
 }

+ 6 - 6
mpi/examples/mpi_lu/pxlu.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -821,8 +821,8 @@ static void wait_termination(void)
 
 double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 {
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 	nblocks = _nblocks;
 	rank = _rank;
@@ -854,15 +854,15 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
 
 	/* schedule the codelet */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	starpu_tag_notify_from_apps(STARPU_TAG_INIT);
 
 	wait_termination();
 	
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	
 //	fprintf(stderr, "RANK %d -> took %f ms\n", rank, timing/1000);
 	

+ 5 - 5
mpi/examples/mpi_lu/pxlu_implicit.c

@@ -115,8 +115,8 @@ static void create_task_22(unsigned k, unsigned i, unsigned j)
 
 double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 {
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 	nblocks = _nblocks;
 	rank = _rank;
@@ -127,7 +127,7 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	for (k = 0; k < nblocks; k++)
 	{
@@ -160,9 +160,9 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	
 //	fprintf(stderr, "RANK %d -> took %f ms\n", rank, timing/1000);
 	

+ 1 - 1
mpi/src/starpu_mpi_task_insert.c

@@ -494,7 +494,7 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru
 	{
 		/* Get the number of buffers and the size of the arguments */
 		va_copy(varg_list_copy, varg_list);
-		arg_buffer_size = _starpu_task_insert_get_arg_size(varg_list_copy);
+		_starpu_task_insert_get_args_size(varg_list_copy, NULL, &arg_buffer_size);
 		va_end(varg_list_copy);
 
 		/* Pack arguments if needed */

+ 3 - 3
src/common/fxt.h

@@ -407,8 +407,8 @@ do {									\
 #define _STARPU_TRACE_WORKER_INIT_START(workerkind, workerid, devid, memnode)	\
 	FUT_DO_PROBE5(_STARPU_FUT_WORKER_INIT_START, workerkind, workerid, devid, memnode, _starpu_gettid());
 
-#define _STARPU_TRACE_WORKER_INIT_END(workerid)				\
-	FUT_DO_PROBE2(_STARPU_FUT_WORKER_INIT_END, _starpu_gettid(), (workerid));
+#define _STARPU_TRACE_WORKER_INIT_END(__workerid)				\
+	FUT_DO_PROBE2(_STARPU_FUT_WORKER_INIT_END, _starpu_gettid(), (__workerid));
 
 #define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, archtype, workerid)				\
 do {									\
@@ -424,7 +424,7 @@ do {									\
 	{								\
 		if ((job)->task->cl)					\
 		{							\
-			const int __nbuffers = (job)->task->cl->nbuffers;	\
+			const int __nbuffers = STARPU_TASK_GET_NBUFFERS((job)->task);	\
 			char __buf[FXT_MAX_PARAMS*sizeof(long)];	\
 			int __i;					\
 			for (__i = 0; __i < __nbuffers; __i++)		\

+ 44 - 0
src/common/utils.c

@@ -21,6 +21,7 @@
 #include <libgen.h>
 #include <errno.h>
 #include <unistd.h>
+#include <fcntl.h>
 
 #ifdef __MINGW32__
 #include <io.h>
@@ -102,6 +103,49 @@ void _starpu_mkpath_and_check(const char *path, mode_t mode)
 	}
 }
 
+int _starpu_ftruncate(FILE *file)
+{
+	return ftruncate(fileno(file), 0);
+}
+
+int _starpu_frdlock(FILE *file)
+{
+	struct flock lock = {
+		.l_type = F_RDLCK,
+		.l_whence = SEEK_SET,
+		.l_start = 0,
+		.l_len = 0
+	};
+	return fcntl(fileno(file), F_SETLKW, &lock);
+}
+
+int _starpu_frdunlock(FILE *file)
+{
+	struct flock lock = {
+		.l_type = F_UNLCK,
+		.l_whence = SEEK_SET,
+		.l_start = 0,
+		.l_len = 0
+	};
+	return fcntl(fileno(file), F_SETLKW, &lock);
+}
+
+int _starpu_fwrlock(FILE *file)
+{
+	struct flock lock = {
+		.l_type = F_WRLCK,
+		.l_whence = SEEK_SET,
+		.l_start = 0,
+		.l_len = 0
+	};
+	return fcntl(fileno(file), F_SETLKW, &lock);
+}
+
+int _starpu_fwrunlock(FILE *file)
+{
+	return _starpu_frdunlock(file);
+}
+
 int _starpu_check_mutex_deadlock(starpu_pthread_mutex_t *mutex)
 {
 	int ret;

+ 5 - 0
src/common/utils.h

@@ -105,6 +105,11 @@
 
 int _starpu_mkpath(const char *s, mode_t mode);
 void _starpu_mkpath_and_check(const char *s, mode_t mode);
+int _starpu_ftruncate(FILE *file);
+int _starpu_frdlock(FILE *file);
+int _starpu_frdunlock(FILE *file);
+int _starpu_fwrlock(FILE *file);
+int _starpu_fwrunlock(FILE *file);
 char *_starpu_get_home_path(void);
 void _starpu_gethostname(char *hostname, size_t size);
 

+ 6 - 6
src/core/dependencies/data_concurrency.c

@@ -209,7 +209,7 @@ static unsigned _submit_job_enforce_data_deps(struct _starpu_job *j, unsigned st
 {
 	unsigned buf;
 
-	unsigned nbuffers = j->task->cl->nbuffers;
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(j->task);
 	for (buf = start_buffer_index; buf < nbuffers; buf++)
 	{
 		if (buf)
@@ -241,18 +241,18 @@ unsigned _starpu_submit_job_enforce_data_deps(struct _starpu_job *j)
 {
 	struct starpu_codelet *cl = j->task->cl;
 
-	if ((cl == NULL) || (cl->nbuffers == 0))
+	if ((cl == NULL) || (STARPU_TASK_GET_NBUFFERS(j->task) == 0))
 		return 0;
 
 	/* Compute an ordered list of the different pieces of data so that we
 	 * grab then according to a total order, thus avoiding a deadlock
 	 * condition */
 	unsigned i;
-	for (i=0 ; i<cl->nbuffers ; i++)
+	for (i=0 ; i<STARPU_TASK_GET_NBUFFERS(j->task); i++)
 	{
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(j->task, i);
 		_STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(j, handle, i);
-		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(j->task->cl, i);
+		enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(j->task, i);
 		_STARPU_JOB_SET_ORDERED_BUFFER_MODE(j, mode, i);
 		int node = -1;
 		if (j->task->cl->specific_nodes)
@@ -260,7 +260,7 @@ unsigned _starpu_submit_job_enforce_data_deps(struct _starpu_job *j)
 		_STARPU_JOB_SET_ORDERED_BUFFER_NODE(j, node, i);
 	}
 
-	_starpu_sort_task_handles(_STARPU_JOB_GET_ORDERED_BUFFERS(j), cl->nbuffers);
+	_starpu_sort_task_handles(_STARPU_JOB_GET_ORDERED_BUFFERS(j), STARPU_TASK_GET_NBUFFERS(j->task));
 
 	return _submit_job_enforce_data_deps(j, 0);
 }
@@ -268,7 +268,7 @@ unsigned _starpu_submit_job_enforce_data_deps(struct _starpu_job *j)
 static unsigned unlock_one_requester(struct _starpu_data_requester *r)
 {
 	struct _starpu_job *j = r->j;
-	unsigned nbuffers = j->task->cl->nbuffers;
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(j->task);
 	unsigned buffer_index = r->buffer_index;
 
 	if (buffer_index + 1 < nbuffers)

+ 67 - 80
src/core/dependencies/implicit_data_deps.c

@@ -47,13 +47,14 @@ static void _starpu_add_dependency(starpu_data_handle_t handle STARPU_ATTRIBUTE_
 }
 
 /* Add pre_sync_task as new accessor among the existing ones, making it depend on the last synchronization task if any.  */
-static void _starpu_add_accessor(starpu_data_handle_t handle, struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task)
+static void _starpu_add_accessor(starpu_data_handle_t handle, struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task, struct _starpu_task_wrapper_dlist *post_sync_task_dependency_slot)
 {
 	/* Add this task to the list of readers */
-	struct _starpu_task_wrapper_list *link = (struct _starpu_task_wrapper_list *) malloc(sizeof(struct _starpu_task_wrapper_list));
-	link->task = post_sync_task;
-	link->next = handle->last_submitted_accessors;
-	handle->last_submitted_accessors = link;
+	post_sync_task_dependency_slot->task = post_sync_task;
+	post_sync_task_dependency_slot->next = handle->last_submitted_accessors.next;
+	post_sync_task_dependency_slot->prev = &handle->last_submitted_accessors;
+	post_sync_task_dependency_slot->next->prev = post_sync_task_dependency_slot;
+	handle->last_submitted_accessors.next = post_sync_task_dependency_slot;
 
 	/* This task depends on the previous synchronization task if any */
 	if (handle->last_sync_task && handle->last_sync_task != post_sync_task)
@@ -103,9 +104,9 @@ static void _starpu_add_sync_task(starpu_data_handle_t handle, struct starpu_tas
 {
 	/* Count the existing accessors */
 	unsigned naccessors = 0;
-	struct _starpu_task_wrapper_list *l;
-	l = handle->last_submitted_accessors;
-	while (l)
+	struct _starpu_task_wrapper_dlist *l;
+	l = handle->last_submitted_accessors.next;
+	while (l != &handle->last_submitted_accessors)
 	{
 		if (l->task != post_sync_task)
 			naccessors++;
@@ -118,8 +119,8 @@ static void _starpu_add_sync_task(starpu_data_handle_t handle, struct starpu_tas
 		/* Put all tasks in the list into task_array */
 		struct starpu_task *task_array[naccessors];
 		unsigned i = 0;
-		l = handle->last_submitted_accessors;
-		while (l)
+		l = handle->last_submitted_accessors.next;
+		while (l != &handle->last_submitted_accessors)
 		{
 			STARPU_ASSERT(l->task);
 			if (l->task != post_sync_task)
@@ -129,9 +130,10 @@ static void _starpu_add_sync_task(starpu_data_handle_t handle, struct starpu_tas
 				_STARPU_DEP_DEBUG("dep %p -> %p\n", l->task, pre_sync_task);
 			}
 
-			struct _starpu_task_wrapper_list *prev = l;
+			struct _starpu_task_wrapper_dlist *prev = l;
 			l = l->next;
-			free(prev);
+			prev->next = NULL;
+			prev->prev = NULL;
 		}
 		_starpu_task_declare_deps_array(pre_sync_task, naccessors, task_array, 0);
 	}
@@ -156,7 +158,8 @@ static void _starpu_add_sync_task(starpu_data_handle_t handle, struct starpu_tas
 		handle->last_submitted_ghost_accessors_id = NULL;
 	}
 
-	handle->last_submitted_accessors = NULL;
+	handle->last_submitted_accessors.next = &handle->last_submitted_accessors;
+	handle->last_submitted_accessors.prev = &handle->last_submitted_accessors;
 	handle->last_sync_task = post_sync_task;
 
 	if (!post_sync_task->cl) {
@@ -177,7 +180,7 @@ static void _starpu_add_sync_task(starpu_data_handle_t handle, struct starpu_tas
  * */
 /* NB : handle->sequential_consistency_mutex must be hold by the caller;
  * returns a task, to be submitted after releasing that mutex. */
-struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task,
+struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task, struct _starpu_task_wrapper_dlist *post_sync_task_dependency_slot,
 						   starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 {
 	struct starpu_task *task = NULL;
@@ -228,15 +231,16 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 		{
 			_STARPU_DEP_DEBUG("concurrently\n");
 			/* Can access concurrently with current tasks */
-			_starpu_add_accessor(handle, pre_sync_task, post_sync_task);
+			_starpu_add_accessor(handle, pre_sync_task, post_sync_task, post_sync_task_dependency_slot);
 		}
 		else
 		{
 			/* Can not access concurrently, have to wait for existing accessors */
-			struct _starpu_task_wrapper_list *l = handle->last_submitted_accessors;
+			struct _starpu_task_wrapper_dlist *l = handle->last_submitted_accessors.next;
 			_STARPU_DEP_DEBUG("dependency\n");
 
-			if ((l && l->next) || (handle->last_submitted_ghost_accessors_id && handle->last_submitted_ghost_accessors_id->next))
+			if ((l != &handle->last_submitted_accessors && l->next != &handle->last_submitted_accessors)
+					|| (handle->last_submitted_ghost_accessors_id && handle->last_submitted_ghost_accessors_id->next))
 			{
 				/* Several previous accessors */
 
@@ -261,7 +265,7 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 					/* Make this task wait for the previous ones */
 					_starpu_add_sync_task(handle, sync_task, sync_task);
 					/* And the requested task wait for this one */
-					_starpu_add_accessor(handle, pre_sync_task, post_sync_task);
+					_starpu_add_accessor(handle, pre_sync_task, post_sync_task, post_sync_task_dependency_slot);
 
 					task = sync_task;
 				}
@@ -270,11 +274,13 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 			{
 				/* One previous accessor, make it the sync
 				 * task, and start depending on it. */
-				if (l)
+				if (l != &handle->last_submitted_accessors)
 				{
 					handle->last_sync_task = l->task;
-					handle->last_submitted_accessors = NULL;
-					free(l);
+					l->next = NULL;
+					l->prev = NULL;
+					handle->last_submitted_accessors.next = &handle->last_submitted_accessors;
+					handle->last_submitted_accessors.prev = &handle->last_submitted_accessors;
 				}
 				else if (handle->last_submitted_ghost_accessors_id)
 				{
@@ -283,7 +289,7 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 					free(handle->last_submitted_ghost_accessors_id);
 					handle->last_submitted_ghost_accessors_id = NULL;
 				}
-				_starpu_add_accessor(handle, pre_sync_task, post_sync_task);
+				_starpu_add_accessor(handle, pre_sync_task, post_sync_task, post_sync_task_dependency_slot);
 			}
 		}
 		handle->last_submitted_mode = mode;
@@ -307,13 +313,14 @@ void _starpu_detect_implicit_data_deps(struct starpu_task *task)
 	if (j->reduction_task)
 		return;
 
-	unsigned nbuffers = task->cl->nbuffers;
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
+	struct _starpu_task_wrapper_dlist *dep_slots = _STARPU_JOB_GET_DEP_SLOTS(j);
 
 	unsigned buffer;
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, buffer);
-		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, buffer);
+		enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, buffer);
 		struct starpu_task *new_task;
 
 		/* Scratch memory does not introduce any deps */
@@ -321,7 +328,7 @@ void _starpu_detect_implicit_data_deps(struct starpu_task *task)
 			continue;
 
 		STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
-		new_task = _starpu_detect_implicit_data_deps_with_handle(task, task, handle, mode);
+		new_task = _starpu_detect_implicit_data_deps_with_handle(task, task, &dep_slots[buffer], handle, mode);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 		if (new_task)
 		{
@@ -341,7 +348,7 @@ void _starpu_detect_implicit_data_deps(struct starpu_task *task)
  * if h is submitted after the termination of f or g, StarPU will not create a
  * dependency as this is not needed anymore. */
 /* the sequential_consistency_mutex of the handle has to be already held */
-void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *task, starpu_data_handle_t handle)
+void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *task, struct _starpu_task_wrapper_dlist *task_dependency_slot, starpu_data_handle_t handle)
 {
 	STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 
@@ -365,63 +372,35 @@ void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *tas
 			}
 		}
 
-		/* XXX can a task be both the last writer associated to a data
-		 * and be in its list of readers ? If not, we should not go
-		 * through the entire list once we have detected it was the
-		 * last writer. */
-
 		/* Same if this is one of the readers: we go through the list
 		 * of readers and remove the task if it is found. */
-		struct _starpu_task_wrapper_list *l;
-		l = handle->last_submitted_accessors;
-		struct _starpu_task_wrapper_list *prev = NULL;
-#ifdef STARPU_DEVEL
-#warning TODO: use double-linked list to make finding ourself fast
-#endif
-		while (l)
+		if (task_dependency_slot && task_dependency_slot->next)
 		{
-			struct _starpu_task_wrapper_list *next = l->next;
-
-			if (l->task == task)
-			{
-				/* If we found the task in the reader list */
-				free(l);
+#ifdef STARPU_DEBUG
+			/* Make sure we are removing ourself from the proper handle */
+			struct _starpu_task_wrapper_dlist *l;
+			for (l = task_dependency_slot->prev; l->task; l = l->prev)
+				;
+			STARPU_ASSERT(l == &handle->last_submitted_accessors);
+			for (l = task_dependency_slot->next; l->task; l = l->next)
+				;
+			STARPU_ASSERT(l == &handle->last_submitted_accessors);
+#endif
 
+			task_dependency_slot->next->prev = task_dependency_slot->prev;
+			task_dependency_slot->prev->next = task_dependency_slot->next;
 #ifndef STARPU_USE_FXT
-				if (_starpu_bound_recording)
+			if (_starpu_bound_recording)
 #endif
-				{
-					/* Save the job id of the reader task in the ghost reader linked list list */
-					struct _starpu_job *ghost_reader_job = _starpu_get_job_associated_to_task(task);
-					struct _starpu_jobid_list *link = (struct _starpu_jobid_list *) malloc(sizeof(struct _starpu_jobid_list));
-					STARPU_ASSERT(link);
-					link->next = handle->last_submitted_ghost_accessors_id;
-					link->id = ghost_reader_job->job_id;
-					handle->last_submitted_ghost_accessors_id = link;
-				}
-
-				if (prev)
-				{
-					prev->next = next;
-				}
-				else
-				{
-					/* This is the first element of the list */
-					handle->last_submitted_accessors = next;
-				}
-
-				/* XXX can we really find the same task again
-				 * once we have found it ? Otherwise, we should
-				 * avoid going through the entire list and stop
-				 * as soon as we find the task. TODO: check how
-				 * duplicate dependencies are treated. */
-			}
-			else
 			{
-				prev = l;
+				/* Save the job id of the reader task in the ghost reader linked list list */
+				struct _starpu_job *ghost_reader_job = _starpu_get_job_associated_to_task(task);
+				struct _starpu_jobid_list *link = (struct _starpu_jobid_list *) malloc(sizeof(struct _starpu_jobid_list));
+				STARPU_ASSERT(link);
+				link->next = handle->last_submitted_ghost_accessors_id;
+				link->id = ghost_reader_job->job_id;
+				handle->last_submitted_ghost_accessors_id = link;
 			}
-
-			l = next;
 		}
 	}
 
@@ -434,13 +413,22 @@ void _starpu_release_task_enforce_sequential_consistency(struct _starpu_job *j)
 {
 	struct starpu_task *task = j->task;
         struct _starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
+	struct _starpu_task_wrapper_dlist *slots = _STARPU_JOB_GET_DEP_SLOTS(j);
 
 	if (!task->cl)
 		return;
 
-        unsigned nbuffers = task->cl->nbuffers;
-
+        unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 	unsigned index;
+
+	/* Release all implicit dependencies */
+	for (index = 0; index < nbuffers; index++)
+	{
+		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, index);
+
+		_starpu_release_data_enforce_sequential_consistency(task, &slots[index], handle);
+	}
+
 	for (index = 0; index < nbuffers; index++)
 	{
 		starpu_data_handle_t handle = descrs[index].handle;
@@ -451,7 +439,6 @@ void _starpu_release_task_enforce_sequential_consistency(struct _starpu_job *j)
 			 * _starpu_compar_handles */
 			continue;
 
-		_starpu_release_data_enforce_sequential_consistency(task, handle);
 		/* Release the reference acquired in _starpu_push_task_output */
 		_starpu_spin_lock(&handle->header_lock);
 		STARPU_ASSERT(handle->busy_count > 0);
@@ -512,7 +499,7 @@ void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle)
 		while (link)
 		{
 			/* There is no need to depend on that task now, since it was already unlocked */
-			_starpu_release_data_enforce_sequential_consistency(link->task, handle);
+			_starpu_release_data_enforce_sequential_consistency(link->task, &_starpu_get_job_associated_to_task(link->task)->implicit_dep_slot, handle);
 
 			int ret = _starpu_task_submit_internally(link->task);
 			STARPU_ASSERT(!ret);
@@ -540,7 +527,7 @@ int _starpu_data_wait_until_available(starpu_data_handle_t handle, enum starpu_d
 
 		/* It is not really a RW access, but we want to make sure that
 		 * all previous accesses are done */
-		new_task = _starpu_detect_implicit_data_deps_with_handle(sync_task, sync_task, handle, mode);
+		new_task = _starpu_detect_implicit_data_deps_with_handle(sync_task, sync_task, &_starpu_get_job_associated_to_task(sync_task)->implicit_dep_slot, handle, mode);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 
 		if (new_task)

+ 2 - 2
src/core/dependencies/implicit_data_deps.h

@@ -21,10 +21,10 @@
 #include <starpu.h>
 #include <common/config.h>
 
-struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task,
+struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task, struct _starpu_task_wrapper_dlist *post_sync_task_dependency_slot,
 						   starpu_data_handle_t handle, enum starpu_data_access_mode mode);
 void _starpu_detect_implicit_data_deps(struct starpu_task *task);
-void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *task, starpu_data_handle_t handle);
+void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *task, struct _starpu_task_wrapper_dlist *task_dependency_slot, starpu_data_handle_t handle);
 void _starpu_release_task_enforce_sequential_consistency(struct _starpu_job *j);
 
 void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data_handle_t handle);

+ 15 - 6
src/core/jobs.c

@@ -53,7 +53,10 @@ struct _starpu_job* STARPU_ATTRIBUTE_MALLOC _starpu_job_create(struct starpu_tas
 	memset(job, 0, sizeof(*job));
 
 	if (task->dyn_handles)
-	     job->dyn_ordered_buffers = malloc(task->cl->nbuffers * sizeof(job->dyn_ordered_buffers[0]));
+	{
+	     job->dyn_ordered_buffers = malloc(STARPU_TASK_GET_NBUFFERS(task) * sizeof(job->dyn_ordered_buffers[0]));
+	     job->dyn_dep_slots = malloc(STARPU_TASK_GET_NBUFFERS(task) * sizeof(job->dyn_dep_slots[0]));
+	}
 
 	job->task = task;
 
@@ -109,8 +112,13 @@ void _starpu_job_destroy(struct _starpu_job *j)
 	_starpu_cg_list_deinit(&j->job_successors);
 	if (j->dyn_ordered_buffers)
 	{
-	     free(j->dyn_ordered_buffers);
-	     j->dyn_ordered_buffers = NULL;
+		free(j->dyn_ordered_buffers);
+		j->dyn_ordered_buffers = NULL;
+	}
+	if (j->dyn_dep_slots)
+	{
+		free(j->dyn_dep_slots);
+		j->dyn_dep_slots = NULL;
 	}
 
 	_starpu_job_delete(j);
@@ -223,8 +231,9 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 	if (task->cl && !continuation)
 	{
 		unsigned i;
+		unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 #ifdef STARPU_USE_SC_HYPERVISOR
-		for(i = 0; i < task->cl->nbuffers; i++)
+		for(i = 0; i < nbuffers; i++)
 		{
 			starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
 			if (handle != NULL)
@@ -232,7 +241,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 		}
 #endif //STARPU_USE_SC_HYPERVISOR
 
-		for (i = 0; i < task->cl->nbuffers; i++)
+		for (i = 0; i < nbuffers; i++)
 		{
 			starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
 			_starpu_spin_lock(&handle->header_lock);
@@ -259,7 +268,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 	if (j->implicit_dep_handle && !continuation)
 	{
 		starpu_data_handle_t handle = j->implicit_dep_handle;
-		_starpu_release_data_enforce_sequential_consistency(j->task, handle);
+		_starpu_release_data_enforce_sequential_consistency(j->task, &j->implicit_dep_slot, handle);
 		/* Release reference taken while setting implicit_dep_handle */
 		_starpu_spin_lock(&handle->header_lock);
 		handle->busy_count--;

+ 5 - 0
src/core/jobs.h

@@ -78,7 +78,9 @@ LIST_TYPE(_starpu_job,
 	 * the task so that we always grab the rw-lock associated to the
 	 * handles in the same order. */
 	struct _starpu_data_descr ordered_buffers[STARPU_NMAXBUFS];
+	struct _starpu_task_wrapper_dlist dep_slots[STARPU_NMAXBUFS];
 	struct _starpu_data_descr *dyn_ordered_buffers;
+	struct _starpu_task_wrapper_dlist *dyn_dep_slots;
 
 	/* If a tag is associated to the job, this points to the internal data
 	 * structure that describes the tag status. */
@@ -92,6 +94,7 @@ LIST_TYPE(_starpu_job,
 	 * the handle for this dependency, so as to remove the task from the
 	 * last_writer/readers */
 	starpu_data_handle_t implicit_dep_handle;
+	struct _starpu_task_wrapper_dlist implicit_dep_slot;
 
 	/* Indicates whether the task associated to that job has already been
 	 * submitted to StarPU (1) or not (0) (using starpu_task_submit).
@@ -245,4 +248,6 @@ int _starpu_push_local_task(struct _starpu_worker *worker, struct starpu_task *t
 #define _STARPU_JOB_SET_ORDERED_BUFFER(job, buffer, i) do { if (job->dyn_ordered_buffers) job->dyn_ordered_buffers[i] = buffer; else job->ordered_buffers[i] = buffer;} while(0)
 #define _STARPU_JOB_GET_ORDERED_BUFFERS(job) (job->dyn_ordered_buffers) ? job->dyn_ordered_buffers : job->ordered_buffers
 
+#define _STARPU_JOB_GET_DEP_SLOTS(job) (((job)->dyn_dep_slots) ? (job)->dyn_dep_slots : (job)->dep_slots)
+
 #endif // __JOBS_H__

+ 14 - 30
src/core/perfmodel/perfmodel.c

@@ -70,19 +70,13 @@ struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid)
 
 static double per_arch_task_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, struct starpu_task *task, unsigned nimpl)
 {
-	double exp = NAN;
 	double (*per_arch_cost_function)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
-	double (*per_arch_cost_model)(struct starpu_data_descr *);
 
 	per_arch_cost_function = model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].cost_function;
-	per_arch_cost_model = model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].cost_model;
 
-	if (per_arch_cost_function)
-		exp = per_arch_cost_function(task, arch, nimpl);
-	else if (per_arch_cost_model)
-		exp = per_arch_cost_model(task->buffers);
+	STARPU_ASSERT_MSG(per_arch_cost_function, "STARPU_PER_ARCH needs per-arch cost_function to be defined");
 
-	return exp;
+	return per_arch_cost_function(task, arch, nimpl);
 }
 
 /*
@@ -118,26 +112,14 @@ static double common_task_expected_perf(struct starpu_perfmodel *model, struct s
 	double exp;
 	double alpha;
 
-	if (model->cost_function)
-	{
-		exp = model->cost_function(task, nimpl);
-		alpha = starpu_worker_get_relative_speedup(arch);
-
-		STARPU_ASSERT(!_STARPU_IS_ZERO(alpha));
+	STARPU_ASSERT_MSG(model->cost_function, "STARPU_COMMON requires common cost_function to be defined");
 
-		return (exp/alpha);
-	}
-	else if (model->cost_model)
-	{
-		exp = model->cost_model(task->buffers);
-		alpha = starpu_worker_get_relative_speedup(arch);
+	exp = model->cost_function(task, nimpl);
+	alpha = starpu_worker_get_relative_speedup(arch);
 
-		STARPU_ASSERT(!_STARPU_IS_ZERO(alpha));
+	STARPU_ASSERT(!_STARPU_IS_ZERO(alpha));
 
-		return (exp/alpha);
-	}
-
-	return NAN;
+	return (exp/alpha);
 }
 
 void _starpu_load_perfmodel(struct starpu_perfmodel *model)
@@ -226,8 +208,9 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 	unsigned i;
 	double sum = 0.0;
 	enum starpu_node_kind node_kind;
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 
-	for (i = 0; i < task->cl->nbuffers; i++)
+	for (i = 0; i < nbuffers; i++)
 	{
 		starpu_data_handle_t handle;
 		struct starpu_task *conversion_task;
@@ -304,7 +287,7 @@ double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned
 /* Data transfer performance modeling */
 double starpu_task_expected_data_transfer_time(unsigned memory_node, struct starpu_task *task)
 {
-	unsigned nbuffers = task->cl->nbuffers;
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 	unsigned buffer;
 
 	double penalty = 0.0;
@@ -312,7 +295,7 @@ double starpu_task_expected_data_transfer_time(unsigned memory_node, struct star
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, buffer);
-		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, buffer);
+		enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, buffer);
 
 		penalty += starpu_data_expected_transfer_time(handle, memory_node, mode);
 	}
@@ -397,10 +380,11 @@ double starpu_task_bundle_expected_data_transfer_time(starpu_task_bundle_t bundl
 		if (task->cl)
 		{
 			unsigned b;
-			for (b = 0; b < task->cl->nbuffers; b++)
+			unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
+			for (b = 0; b < nbuffers; b++)
 			{
 				starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, b);
-				enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, b);
+				enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, b);
 
 				if (!(mode & STARPU_R))
 					continue;

+ 26 - 2
src/core/perfmodel/perfmodel_bus.c

@@ -783,6 +783,8 @@ static void load_bus_affinity_file_content(void)
 	f = fopen(path, "r");
 	STARPU_ASSERT(f);
 
+	_starpu_frdlock(f);
+
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	ncpus = _starpu_topology_get_nhwcpu(config);
         unsigned gpu;
@@ -835,6 +837,7 @@ static void load_bus_affinity_file_content(void)
 		STARPU_ASSERT(ret == 0);
 	}
 #endif /* !STARPU_USE_OPENCL */
+	_starpu_frdunlock(f);
 
 	fclose(f);
 #endif /* !(STARPU_USE_CUDA_ || STARPU_USE_OPENCL */
@@ -862,6 +865,7 @@ static void write_bus_affinity_file_content(void)
 		STARPU_ABORT();
 	}
 
+	_starpu_frdlock(f);
 	unsigned cpu;
         unsigned gpu;
 
@@ -897,6 +901,7 @@ static void write_bus_affinity_file_content(void)
 	}
 #endif
 
+	_starpu_frdunlock(f);
 	fclose(f);
 #endif
 }
@@ -1006,6 +1011,7 @@ static int load_bus_latency_file_content(void)
 		fflush(stderr);
 		STARPU_ABORT();
 	}
+	_starpu_frdlock(f);
 
 	for (src = 0; src < STARPU_MAXNODES; src++)
 	{
@@ -1073,13 +1079,14 @@ static int load_bus_latency_file_content(void)
 			break;
 		ungetc(n, f);
 	}
+	_starpu_frdunlock(f);
+	fclose(f);
 
 	/* No more values, take NAN */
 	for ( ; src < STARPU_MAXNODES; src++)
 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
 			latency_matrix[src][dst] = NAN;
 
-	fclose(f);
 	return 1;
 }
 
@@ -1104,6 +1111,8 @@ static void write_bus_latency_file_content(void)
 		fflush(stderr);
 		STARPU_ABORT();
 	}
+	_starpu_fwrlock(f);
+	_starpu_ftruncate(f);
 
 	fprintf(f, "# ");
 	for (dst = 0; dst < STARPU_MAXNODES; dst++)
@@ -1163,6 +1172,7 @@ static void write_bus_latency_file_content(void)
 
 		fprintf(f, "\n");
 	}
+	_starpu_fwrunlock(f);
 
 	fclose(f);
 }
@@ -1223,6 +1233,7 @@ static int load_bus_bandwidth_file_content(void)
 		fflush(stderr);
 		STARPU_ABORT();
 	}
+	_starpu_frdlock(f);
 
 	for (src = 0; src < STARPU_MAXNODES; src++)
 	{
@@ -1290,13 +1301,14 @@ static int load_bus_bandwidth_file_content(void)
 			break;
 		ungetc(n, f);
 	}
+	_starpu_frdunlock(f);
+	fclose(f);
 
 	/* No more values, take NAN */
 	for ( ; src < STARPU_MAXNODES; src++)
 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
 			latency_matrix[src][dst] = NAN;
 
-	fclose(f);
 	return 1;
 }
 
@@ -1316,6 +1328,9 @@ static void write_bus_bandwidth_file_content(void)
 	f = fopen(path, "w+");
 	STARPU_ASSERT(f);
 
+	_starpu_fwrlock(f);
+	_starpu_ftruncate(f);
+
 	fprintf(f, "# ");
 	for (dst = 0; dst < STARPU_MAXNODES; dst++)
 		fprintf(f, "to %d\t\t", dst);
@@ -1387,6 +1402,7 @@ static void write_bus_bandwidth_file_content(void)
 		fprintf(f, "\n");
 	}
 
+	_starpu_fwrunlock(f);
 	fclose(f);
 }
 #endif /* STARPU_SIMGRID */
@@ -1551,6 +1567,7 @@ static void check_bus_config_file(void)
                 // Loading configuration from file
                 f = fopen(path, "r");
                 STARPU_ASSERT(f);
+		_starpu_frdlock(f);
                 _starpu_drop_comments(f);
                 ret = fscanf(f, "%u\t", &read_cpus);
 		STARPU_ASSERT(ret == 1);
@@ -1565,6 +1582,7 @@ static void check_bus_config_file(void)
 		if (ret == 0)
 			read_mic = 0;
                 _starpu_drop_comments(f);
+		_starpu_frdunlock(f);
                 fclose(f);
 
                 // Loading current configuration
@@ -1619,6 +1637,8 @@ static void write_bus_config_file_content(void)
 
         f = fopen(path, "w+");
 	STARPU_ASSERT(f);
+	_starpu_fwrlock(f);
+	_starpu_ftruncate(f);
 
         fprintf(f, "# Current configuration\n");
         fprintf(f, "%u # Number of CPUs\n", ncpus);
@@ -1626,6 +1646,7 @@ static void write_bus_config_file_content(void)
         fprintf(f, "%d # Number of OpenCL devices\n", nopencl);
         fprintf(f, "%d # Number of MIC devices\n", nmic);
 
+	_starpu_fwrunlock(f);
         fclose(f);
 }
 
@@ -1664,6 +1685,8 @@ static void write_bus_platform_file_content(void)
 		fflush(stderr);
 		STARPU_ABORT();
 	}
+	_starpu_fwrlock(f);
+	_starpu_ftruncate(f);
 
 	fprintf(f,
 "<?xml version='1.0'?>\n"
@@ -1810,6 +1833,7 @@ static void write_bus_platform_file_content(void)
 " </platform>\n"
 		);
 
+	_starpu_fwrunlock(f);
 	fclose(f);
 }
 

+ 14 - 4
src/core/perfmodel/perfmodel_history.c

@@ -66,7 +66,7 @@ size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, struct starpu_p
 	}
 	else
 	{
-		unsigned nbuffers = task->cl->nbuffers;
+		unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 		size_t size = 0;
 
 		unsigned buffer;
@@ -653,10 +653,10 @@ static void initialize_model_with_file(FILE*f, struct starpu_perfmodel *model)
 
 void starpu_perfmodel_init(struct starpu_perfmodel *model)
 {
-	STARPU_ASSERT(model && model->symbol);
-
 	int already_init;
 
+	STARPU_ASSERT(model);
+
 	STARPU_PTHREAD_RWLOCK_RDLOCK(&registered_models_rwlock);
 	already_init = model->is_init;
 	STARPU_PTHREAD_RWLOCK_UNLOCK(&registered_models_rwlock);
@@ -834,7 +834,10 @@ static void save_history_based_model(struct starpu_perfmodel *model)
 	f = fopen(path, "w+");
 	STARPU_ASSERT_MSG(f, "Could not save performance model %s\n", path);
 
+	_starpu_fwrlock(f);
+	_starpu_ftruncate(f);
 	dump_model_file(f, model);
+	_starpu_fwrunlock(f);
 
 	fclose(f);
 }
@@ -1009,7 +1012,9 @@ void _starpu_load_history_based_model(struct starpu_perfmodel *model, unsigned s
 				f = fopen(path, "r");
 				STARPU_ASSERT(f);
 
+				_starpu_frdlock(f);
 				parse_model_file(f, model, scan_history);
+				_starpu_frdunlock(f);
 
 				fclose(f);
 			}
@@ -1099,10 +1104,12 @@ int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *mo
 	FILE *f = fopen(path, "r");
 	STARPU_ASSERT(f);
 
+	_starpu_frdlock(f);
 	starpu_perfmodel_init_with_file(f, model);
 	rewind(f);
 
 	parse_model_file(f, model, 1);
+	_starpu_frdunlock(f);
 
 	STARPU_ASSERT(fclose(f) == 0);
 
@@ -1412,6 +1419,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 			_STARPU_DISP("Error <%s> when opening file <%s>\n", strerror(errno), per_arch_model->debug_path);
 			STARPU_ABORT();
 		}
+		_starpu_fwrlock(f);
 
 		if (!j->footprint_is_computed)
 			(void) _starpu_compute_buffers_footprint(model, arch, nimpl, j);
@@ -1420,8 +1428,9 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
 		fprintf(f, "0x%x\t%lu\t%f\t%f\t%f\t%d\t\t", j->footprint, (unsigned long) _starpu_job_get_data_size(model, arch, nimpl, j), measured, task->predicted, task->predicted_transfer, cpuid);
 		unsigned i;
+		unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 
-		for (i = 0; i < task->cl->nbuffers; i++)
+		for (i = 0; i < nbuffers; i++)
 		{
 			starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
 
@@ -1430,6 +1439,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 			handle->ops->display(handle, f);
 		}
 		fprintf(f, "\n");
+		_starpu_fwrunlock(f);
 		fclose(f);
 #endif
 		STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);

+ 32 - 11
src/core/sched_ctx.c

@@ -23,13 +23,13 @@ starpu_pthread_rwlock_t changing_ctx_mutex[STARPU_NMAX_SCHED_CTXS];
 
 static starpu_pthread_mutex_t sched_ctx_manag = STARPU_PTHREAD_MUTEX_INITIALIZER;
 static starpu_pthread_mutex_t finished_submit_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
-struct starpu_task stop_submission_task = STARPU_TASK_INITIALIZER;
+static struct starpu_task stop_submission_task = STARPU_TASK_INITIALIZER;
 starpu_pthread_key_t sched_ctx_key;
-unsigned with_hypervisor = 0;
-double hyp_start_sample[STARPU_NMAX_SCHED_CTXS];
-double hyp_start_allow_sample[STARPU_NMAX_SCHED_CTXS];
-double flops[STARPU_NMAX_SCHED_CTXS][STARPU_NMAXWORKERS];
-size_t data_size[STARPU_NMAX_SCHED_CTXS][STARPU_NMAXWORKERS];
+static unsigned with_hypervisor = 0;
+static double hyp_start_sample[STARPU_NMAX_SCHED_CTXS];
+static double hyp_start_allow_sample[STARPU_NMAX_SCHED_CTXS];
+static double flops[STARPU_NMAX_SCHED_CTXS][STARPU_NMAXWORKERS];
+static size_t data_size[STARPU_NMAX_SCHED_CTXS][STARPU_NMAXWORKERS];
 
 static unsigned _starpu_get_first_free_sched_ctx(struct _starpu_machine_config *config);
 static void _starpu_sched_ctx_add_workers_to_master(unsigned sched_ctx_id, int *workerids, int nworkers, int new_master);
@@ -47,7 +47,7 @@ static void _starpu_worker_gets_into_ctx(unsigned sched_ctx_id, struct _starpu_w
 		worker->nsched_ctxs++;
 	}
 	worker->removed_from_ctx[sched_ctx_id] = 0;
-	if(worker->tmp_sched_ctx == sched_ctx_id)
+	if(worker->tmp_sched_ctx == (int) sched_ctx_id)
 		worker->tmp_sched_ctx = -1;
 	return;
 }
@@ -1641,10 +1641,10 @@ unsigned starpu_sched_ctx_worker_is_master_for_child_ctx(int workerid, unsigned
 
 }
 
-void starpu_sched_ctx_revert_task_counters(unsigned sched_ctx_id, double flops)
+void starpu_sched_ctx_revert_task_counters(unsigned sched_ctx_id, double ready_flops)
 {
         _starpu_decrement_nsubmitted_tasks_of_sched_ctx(sched_ctx_id);
-        _starpu_decrement_nready_tasks_of_sched_ctx(sched_ctx_id, flops);
+        _starpu_decrement_nready_tasks_of_sched_ctx(sched_ctx_id, ready_flops);
 }
 
 void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_ctx)
@@ -1788,7 +1788,6 @@ void starpu_sched_ctx_get_available_cpuids(unsigned sched_ctx_id, int **cpuids,
 	(*cpuids) = (int*)malloc(workers->nworkers*sizeof(int));
 	int w = 0;
 
-	struct _starpu_worker *worker = NULL;
 	struct starpu_sched_ctx_iterator it;
 	int workerid;
 	if(workers->init_iterator)
@@ -1814,7 +1813,6 @@ static void _starpu_sched_ctx_wake_these_workers_up(unsigned sched_ctx_id, int *
 
 	int masters[nworkers];
 	int w;
-	struct _starpu_worker *worker = NULL;
 	for(w = 0; w < nworkers; w++)
 	{
 		int workerid = workerids[w];
@@ -1951,3 +1949,26 @@ void starpu_sched_ctx_unbook_workers_for_task(unsigned sched_ctx_id, int master)
 	/* wake up starpu workers */
 	_starpu_sched_ctx_wake_up_workers(sched_ctx_id, master);
 }
+
+int starpu_sched_ctx_get_worker_rank(unsigned sched_ctx_id)
+{
+	int idx = 0;
+	int curr_workerid = starpu_worker_get_id();
+	int worker;
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	struct starpu_worker_collection *workers = sched_ctx->workers;
+
+	struct starpu_sched_ctx_iterator it;
+	if(workers->init_iterator)
+		workers->init_iterator(workers, &it);
+
+	while(workers->has_next(workers, &it))
+	{
+		worker = workers->get_next(workers, &it);
+		if(worker == curr_workerid)
+			return idx;
+		idx++;
+	}
+
+	return -1;
+}

+ 6 - 4
src/core/sched_policy.c

@@ -261,7 +261,8 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 		unsigned node = starpu_worker_get_memory_node(workerid);
 		if (_starpu_task_uses_multiformat_handles(task))
 		{
-			for (i = 0; i < task->cl->nbuffers; i++)
+			unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
+			for (i = 0; i < nbuffers; i++)
 			{
 				struct starpu_task *conversion_task;
 				starpu_data_handle_t handle;
@@ -278,7 +279,7 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 				//_STARPU_DEBUG("Pushing a conversion task\n");
 			}
 
-			for (i = 0; i < task->cl->nbuffers; i++)
+			for (i = 0; i < nbuffers; i++)
 			{
 				starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
 				handle->mf_node = node;
@@ -607,7 +608,7 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 		STARPU_ABORT();
 	}
 
-	STARPU_CODELET_SET_MODE(conversion_task->cl, STARPU_RW, 0);
+	STARPU_TASK_SET_MODE(conversion_task, STARPU_RW, 0);
 	return conversion_task;
 }
 
@@ -830,7 +831,8 @@ pick:
 	 * required conversion tasks.
 	 */
 	unsigned i;
-	for (i = 0; i < task->cl->nbuffers; i++)
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
+	for (i = 0; i < nbuffers; i++)
 	{
 		struct starpu_task *conversion_task;
 		starpu_data_handle_t handle;

+ 3 - 0
src/core/simgrid.c

@@ -217,6 +217,7 @@ void _starpu_simgrid_init()
 		/* Get XML platform */
 		_starpu_simgrid_get_platform_path(path, sizeof(path));
 		in = fopen(path, "r");
+		_starpu_frdlock(in);
 		STARPU_ASSERT_MSG(in, "Could not open platform file %s", path);
 #ifdef HAVE_MKSTEMPS
 		out = mkstemps(template, strlen(".xml"));
@@ -230,6 +231,8 @@ void _starpu_simgrid_init()
 		snprintf(cmdline, sizeof(cmdline), "xsltproc --novalid --stringparam ASname %s -o %s "STARPU_DATADIR"/starpu/starpu_smpi.xslt %s", asname, template, path);
 		ret = system(cmdline);
 		STARPU_ASSERT_MSG(ret == 0, "running xsltproc to generate SMPI platforms %s from %s failed", template, path);
+		_starpu_frdunlock(in);
+		fclose(in);
 
 		/* And create it */
 		MSG_create_environment(template);

+ 21 - 30
src/core/task.c

@@ -294,12 +294,15 @@ int _starpu_submit_job(struct _starpu_job *j)
 		int i;
 		size_t data_size = 0;
 		if (j->task->cl)
-			for(i = 0; i < j->task->cl->nbuffers; i++)
+		{
+			unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(j->task);
+			for(i = 0; i < nbuffers; i++)
 			{
 				starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
 				if (handle != NULL)
 					data_size += _starpu_data_get_size(handle);
 			}
+		}
 
 		_STARPU_TRACE_HYPERVISOR_BEGIN();
 		sched_ctx->perf_counters->notify_submitted_job(j->task, j->footprint, data_size);
@@ -311,7 +314,8 @@ int _starpu_submit_job(struct _starpu_job *j)
 	if (task->cl && !continuation)
 	{
 		unsigned i;
-		for (i=0; i<task->cl->nbuffers; i++)
+		unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
+		for (i=0; i<nbuffers; i++)
 		{
 			starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
 			_starpu_spin_lock(&handle->header_lock);
@@ -441,26 +445,9 @@ void _starpu_codelet_check_deprecated_fields(struct starpu_codelet *cl)
 	}
 }
 
-void _starpu_task_check_deprecated_fields(struct starpu_task *task)
+void _starpu_task_check_deprecated_fields(struct starpu_task *task STARPU_ATTRIBUTE_UNUSED)
 {
-	if (task->cl)
-	{
-		unsigned i;
-		for(i=0; i<STARPU_MIN(task->cl->nbuffers, STARPU_NMAXBUFS) ; i++)
-		{
-			if (task->buffers[i].handle && task->handles[i])
-			{
-				_STARPU_DISP("[warning][struct starpu_task] task->buffers[%u] and task->handles[%u] both set. Ignoring task->buffers[%u] ?\n", i, i, i);
-				STARPU_ASSERT(task->buffers[i].mode == task->cl->modes[i]);
-				STARPU_ABORT();
-			}
-			if (task->buffers[i].handle)
-			{
-				task->handles[i] = task->buffers[i].handle;
-				task->cl->modes[i] = task->buffers[i].mode;
-			}
-		}
-	}
+	/* None any more */
 }
 
 /* application should submit new tasks to StarPU through this function */
@@ -511,17 +498,18 @@ int starpu_task_submit(struct starpu_task *task)
 	if (task->cl)
 	{
 		unsigned i;
+		unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 
 		/* Check buffers */
 		if (task->dyn_handles == NULL)
-			STARPU_ASSERT_MSG(task->cl->nbuffers <= STARPU_NMAXBUFS, "Codelet %p has too many buffers (%d vs max %d). Either use --enable-maxbuffers configure option to increase the max, or use dyn_handles instead of handles.", task->cl, task->cl->nbuffers, STARPU_NMAXBUFS);
+			STARPU_ASSERT_MSG(STARPU_TASK_GET_NBUFFERS(task) <= STARPU_NMAXBUFS, "Codelet %p has too many buffers (%d vs max %d). Either use --enable-maxbuffers configure option to increase the max, or use dyn_handles instead of handles.", task->cl, STARPU_TASK_GET_NBUFFERS(task), STARPU_NMAXBUFS);
 
 		if (task->dyn_handles)
 		{
-			task->dyn_interfaces = malloc(task->cl->nbuffers * sizeof(void *));
+			task->dyn_interfaces = malloc(nbuffers * sizeof(void *));
 		}
 
-		for (i = 0; i < task->cl->nbuffers; i++)
+		for (i = 0; i < nbuffers; i++)
 		{
 			starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
 			/* Make sure handles are not partitioned */
@@ -665,11 +653,12 @@ int _starpu_task_submit_nodeps(struct starpu_task *task)
 	{
 		/* This would be done by data dependencies checking */
 		unsigned i;
-		for (i=0 ; i<task->cl->nbuffers ; i++)
+		unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
+		for (i=0 ; i<nbuffers ; i++)
 		{
 			starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(j->task, i);
 			_STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(j, handle, i);
-			enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(j->task->cl, i);
+			enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(j->task, i);
 			_STARPU_JOB_SET_ORDERED_BUFFER_MODE(j, mode, i);
 			int node = -1;
 			if (j->task->cl->specific_nodes)
@@ -704,7 +693,8 @@ int _starpu_task_submit_conversion_task(struct starpu_task *task,
 
 	/* We retain handle reference count */
 	unsigned i;
-	for (i=0; i<task->cl->nbuffers; i++)
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
+	for (i=0; i<nbuffers; i++)
 	{
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
 		_starpu_spin_lock(&handle->header_lock);
@@ -729,11 +719,11 @@ int _starpu_task_submit_conversion_task(struct starpu_task *task,
 	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 	j->submitted = 1;
 	_starpu_increment_nready_tasks_of_sched_ctx(j->task->sched_ctx, j->task->flops);
-	for (i=0 ; i<task->cl->nbuffers ; i++)
+	for (i=0 ; i<nbuffers ; i++)
 	{
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(j->task, i);
 		_STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(j, handle, i);
-		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(j->task->cl, i);
+		enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(j->task, i);
 		_STARPU_JOB_SET_ORDERED_BUFFER_MODE(j, mode, i);
 		int node = -1;
 		if (j->task->cl->specific_nodes)
@@ -989,7 +979,8 @@ int
 _starpu_task_uses_multiformat_handles(struct starpu_task *task)
 {
 	unsigned i;
-	for (i = 0; i < task->cl->nbuffers; i++)
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
+	for (i = 0; i < nbuffers; i++)
 	{
 		if (_starpu_data_is_multiformat_handle(STARPU_TASK_GET_HANDLE(task, i)))
 			return 1;

+ 5 - 5
src/datawizard/coherency.c

@@ -679,13 +679,13 @@ static void _starpu_set_data_requested_flag_if_needed(starpu_data_handle_t handl
 
 int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
 {
-	unsigned nbuffers = task->cl->nbuffers;
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 	unsigned index;
 
 	for (index = 0; index < nbuffers; index++)
 	{
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, index);
-		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, index);
+		enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, index);
 
 		if (mode & (STARPU_SCRATCH|STARPU_REDUX))
 			continue;
@@ -718,7 +718,7 @@ int _starpu_fetch_task_input(struct _starpu_job *j)
 		_starpu_clock_gettime(&task->profiling_info->acquire_data_start_time);
 
 	struct _starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
-	unsigned nbuffers = task->cl->nbuffers;
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 
 	unsigned local_memory_node = _starpu_memory_node_get_local_key();
 
@@ -762,7 +762,7 @@ int _starpu_fetch_task_input(struct _starpu_job *j)
 	for (index = 0; index < nbuffers; index++)
 	{
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, index);
-		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, index);
+		enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, index);
 		int node = descrs[index].node;
 		if (node == -1)
 			node = local_memory_node;
@@ -828,7 +828,7 @@ void _starpu_push_task_output(struct _starpu_job *j)
 		_starpu_clock_gettime(&task->profiling_info->release_data_start_time);
 
         struct _starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
-        unsigned nbuffers = task->cl->nbuffers;
+        unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 
 	int workerid = starpu_worker_get_id();
 	unsigned local_memory_node = _starpu_memory_node_get_local_key();

+ 8 - 1
src/datawizard/coherency.h

@@ -98,6 +98,13 @@ struct _starpu_task_wrapper_list
 	struct _starpu_task_wrapper_list *next;
 };
 
+/* This structure describes a doubly-linked list of task */
+struct _starpu_task_wrapper_dlist {
+	struct starpu_task *task;
+	struct _starpu_task_wrapper_dlist *next;
+	struct _starpu_task_wrapper_dlist *prev;
+};
+
 extern int _starpu_has_not_important_data;
 
 typedef void (*_starpu_data_handle_unregister_hook)(starpu_data_handle_t);
@@ -170,7 +177,7 @@ struct _starpu_data_state
 	 * sequential_consistency flag is enabled. */
 	enum starpu_data_access_mode last_submitted_mode;
 	struct starpu_task *last_sync_task;
-	struct _starpu_task_wrapper_list *last_submitted_accessors;
+	struct _starpu_task_wrapper_dlist last_submitted_accessors;
 
 	/* If FxT is enabled, we keep track of "ghost dependencies": that is to
 	 * say the dependencies that are not needed anymore, but that should

+ 3 - 1
src/datawizard/filters.c

@@ -190,7 +190,9 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 		STARPU_PTHREAD_MUTEX_INIT(&child->sequential_consistency_mutex, NULL);
 		child->last_submitted_mode = STARPU_R;
 		child->last_sync_task = NULL;
-		child->last_submitted_accessors = NULL;
+		child->last_submitted_accessors.task = NULL;
+		child->last_submitted_accessors.next = &child->last_submitted_accessors;
+		child->last_submitted_accessors.prev = &child->last_submitted_accessors;
 		child->post_sync_tasks = NULL;
 		/* Tell helgrind that the race in _starpu_unlock_post_sync_tasks is fine */
 		STARPU_HG_DISABLE_CHECKING(child->post_sync_tasks_cnt);

+ 2 - 1
src/datawizard/footprint.c

@@ -24,8 +24,9 @@ uint32_t starpu_task_data_footprint(struct starpu_task *task)
 {
 	uint32_t footprint = 0;
 	unsigned buffer;
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 
-	for (buffer = 0; buffer < task->cl->nbuffers; buffer++)
+	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, buffer);
 

+ 2 - 0
src/datawizard/interfaces/block_interface.c

@@ -439,6 +439,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 	else
 	{
 		/* Default case: we transfer all blocks one by one: nz transfers */
+		/* TODO: use cudaMemcpy3D now that it works */
 		unsigned layer;
 		for (layer = 0; layer < src_block->nz; layer++)
 		{
@@ -509,6 +510,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 	else
 	{
 		/* Default case: we transfer all blocks one by one: nz 2D transfers */
+		/* TODO: use cudaMemcpy3D now that it works */
 		unsigned layer;
 		for (layer = 0; layer < src_block->nz; layer++)
 		{

+ 3 - 1
src/datawizard/interfaces/data_interface.c

@@ -291,7 +291,9 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 	STARPU_PTHREAD_MUTEX_INIT(&handle->sequential_consistency_mutex, NULL);
 	handle->last_submitted_mode = STARPU_R;
 	handle->last_sync_task = NULL;
-	handle->last_submitted_accessors = NULL;
+	handle->last_submitted_accessors.task = NULL;
+	handle->last_submitted_accessors.next = &handle->last_submitted_accessors;
+	handle->last_submitted_accessors.prev = &handle->last_submitted_accessors;
 	handle->post_sync_tasks = NULL;
 
 	/* Tell helgrind that the race in _starpu_unlock_post_sync_tasks is fine */

+ 7 - 119
src/datawizard/interfaces/matrix_interface.c

@@ -27,19 +27,14 @@
 #include <drivers/scc/driver_scc_source.h>
 #include <drivers/mic/driver_mic_source.h>
 
-/* If you can promise that there is no stride in your matrices, you can define this */
-// #define NO_STRIDE
-
 #ifdef STARPU_USE_CUDA
 static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
-#ifdef NO_STRIDE
 static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
 #endif
-#endif
 #ifdef STARPU_USE_OPENCL
 static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
@@ -58,17 +53,13 @@ static const struct starpu_data_copy_methods matrix_copy_data_methods_s =
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
 	.cuda_to_cuda = copy_cuda_to_cuda,
-#ifdef NO_STRIDE
 	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
-#endif
 #else
 #ifdef STARPU_SIMGRID
-#ifdef NO_STRIDE
 	/* Enable GPU-GPU transfers in simgrid */
 	.cuda_to_cuda_async = 1,
 #endif
 #endif
-#endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
 	.opencl_to_ram = copy_opencl_to_ram,
@@ -379,29 +370,6 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 	size_t elemsize = src_matrix->elemsize;
 	cudaError_t cures;
 
-#if 0
-	struct cudaMemcpy3DParms p;
-	memset(&p, 0, sizeof(p));
-
-	p.srcPtr = make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->ld * elemsize, src_matrix->ny);
-	p.dstPtr = make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, dst_matrix->ld * elemsize, dst_matrix->ny);
-	p.extent = make_cudaExtent(src_matrix->nx * elemsize, src_matrix->ny, 1);
-	p.kind = kind;
-
-	if (is_async)
-	{
-		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
-		cures = cudaMemcpy3DAsync(&p, stream);
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-		if (!cures)
-			return -EAGAIN;
-	}
-
-	cures = cudaMemcpy3D(&p);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-#else
-
 	if (is_async)
 	{
 		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
@@ -422,17 +390,15 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 		if (ret == -EAGAIN) return ret;
 		if (ret) STARPU_CUDA_REPORT_ERROR(cures);
 	}
-#endif
 
 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
 
 	return 0;
 }
 
-/* XXX this is broken : We need to properly call cudaDeviceEnablePeerAccess(), and avoid crossing NUMA nodes... */
-#ifdef NO_STRIDE
 static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, int is_async, cudaStream_t stream)
 {
+#ifdef HAVE_CUDA_MEMCPY_PEER
 	struct starpu_matrix_interface *src_matrix = src_interface;
 	struct starpu_matrix_interface *dst_matrix = dst_interface;
 
@@ -442,70 +408,15 @@ static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUT
 	int src_dev = _starpu_memory_node_get_devid(src_node);
 	int dst_dev = _starpu_memory_node_get_devid(dst_node);
 
-
-#if 0
-	/* That code is not even working!! */
-	struct cudaExtent extent = make_cudaExtent(128, 128, 128);
-
-	starpu_cuda_set_device(src_dev);
-
-	struct cudaPitchedPtr mem_device1;
-	cures = cudaMalloc3D(&mem_device1, extent);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	starpu_cuda_set_device(dst_dev);
-
-	struct cudaPitchedPtr mem_device2;
-	cures = cudaMalloc3D(&mem_device2, extent);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	struct cudaMemcpy3DPeerParms p;
-	memset(&p, 0, sizeof(p));
-	p.srcDevice = src_dev;
-	p.dstDevice = dst_dev;
-	p.srcPtr = mem_device1;
-	p.dstPtr = mem_device2;
-	p.extent = extent;
-
-	fprintf(stderr,"%u %u\n", p.srcDevice, p.dstDevice);
-	fprintf(stderr,"%p %p\n", p.srcArray, p.dstArray);
-	fprintf(stderr,"%p %lu %lu %lu\n", p.srcPtr.ptr, p.srcPtr.pitch, p.srcPtr.xsize, p.srcPtr.ysize);
-	fprintf(stderr,"%p %lu %lu %lu\n", p.dstPtr.ptr, p.dstPtr.pitch, p.dstPtr.xsize, p.dstPtr.ysize);
-	fprintf(stderr,"%lu %lu %lu\n", p.srcPos.x, p.srcPos.y, p.srcPos.z);
-	fprintf(stderr,"%lu %lu %lu\n", p.dstPos.x, p.dstPos.y, p.dstPos.z);
-	fprintf(stderr,"%lu %lu %lu\n", p.extent.width, p.extent.height, p.extent.depth);
-	cures = cudaMemcpy3DPeer(&p);
-	if (STARPU_UNLIKELY(cures))
-	        STARPU_CUDA_REPORT_ERROR(cures);
-#endif
-
-#if 0
 	struct cudaMemcpy3DPeerParms p;
 	memset(&p, 0, sizeof(p));
 
 	p.srcDevice = src_dev;
 	p.dstDevice = dst_dev;
-	p.srcPtr = make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->nx * elemsize, src_matrix->ny);
-	p.dstPtr = make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, dst_matrix->nx * elemsize, dst_matrix->ny);
+	p.srcPtr = make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->nx, src_matrix->ny);
+	p.dstPtr = make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, dst_matrix->nx, dst_matrix->ny);
 	p.extent = make_cudaExtent(src_matrix->nx * elemsize, src_matrix->ny, 1);
 
-#if 1
-	fprintf(stderr,"%u %u\n", p.srcDevice, p.dstDevice);
-	fprintf(stderr,"%p %p\n", p.srcArray, p.dstArray);
-	fprintf(stderr,"%p %lu %lu %lu\n", p.srcPtr.ptr, p.srcPtr.pitch, p.srcPtr.xsize, p.srcPtr.ysize);
-	fprintf(stderr,"%p %lu %lu %lu\n", p.dstPtr.ptr, p.dstPtr.pitch, p.dstPtr.xsize, p.dstPtr.ysize);
-	fprintf(stderr,"%lu %lu %lu\n", p.srcPos.x, p.srcPos.y, p.srcPos.z);
-	fprintf(stderr,"%lu %lu %lu\n", p.dstPos.x, p.dstPos.y, p.dstPos.z);
-	fprintf(stderr,"%lu %lu %lu\n", p.extent.width, p.extent.height, p.extent.depth);
-#endif
-
-	cures = cudaMemcpy3DPeerAsync(&p, stream);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-	cudaStreamSynchronize(stream);
-
 	if (is_async)
 	{
 		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
@@ -519,30 +430,13 @@ static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUT
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-#else
-	/* XXX FIXME !!*/
-	STARPU_ASSERT(src_matrix->nx == src_matrix->ld);
-	STARPU_ASSERT(dst_matrix->nx == dst_matrix->ld);
-
-	if (is_async)
-	{
-		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
-		cures = cudaMemcpyPeerAsync((char *)dst_matrix->ptr, dst_dev, (char *)src_matrix->ptr, src_dev, dst_matrix->nx*dst_matrix->ny*elemsize, stream);
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-		if (!cures)
-			return -EAGAIN;
-	}
-
-	cures = cudaMemcpyPeer((char *)dst_matrix->ptr, dst_dev, (char *)src_matrix->ptr, src_dev, dst_matrix->nx*dst_matrix->ny*elemsize);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-#endif
-
 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
 
 	return 0;
-}
+#else
+	STARPU_ABORT_MSG("CUDA memcpy peer not available, but core triggered one ?!");
 #endif
+}
 
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
@@ -559,11 +453,7 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRI
 	if (src_node == dst_node)
 		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, 0, 0);
 	else
-	{
-		/* XXX not implemented */
-		STARPU_ABORT();
-		return 0;
-	}
+		return copy_cuda_peer(src_interface, src_node, dst_interface, dst_node, 0, 0);
 }
 
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
@@ -576,7 +466,6 @@ static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, 1, stream);
 }
 
-#ifdef NO_STRIDE
 static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
 {
 	if (src_node == dst_node)
@@ -584,7 +473,6 @@ static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node STARPU
 	else
 		return copy_cuda_peer(src_interface, src_node, dst_interface, dst_node, 1, stream);
 }
-#endif
 #endif // STARPU_USE_CUDA
 
 #ifdef STARPU_USE_OPENCL

+ 1 - 1
src/datawizard/interfaces/void_interface.c

@@ -140,7 +140,7 @@ static int dummy_copy(void *src_interface STARPU_ATTRIBUTE_UNUSED,
 	return 0;
 }
 
-static ssize_t describe(void *data_interface, char *buf, size_t size)
+static ssize_t describe(void *data_interface STARPU_ATTRIBUTE_UNUSED, char *buf, size_t size)
 {
 	return snprintf(buf, size, "0");
 }

+ 3 - 1
src/datawizard/memory_nodes.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -132,6 +132,8 @@ void _starpu_memory_node_get_name(unsigned node, char *name, int size)
 		prefix = "SCC_shared";
 		break;
 	case STARPU_UNUSED:
+	default:
+		prefix = "unknown";
 		STARPU_ASSERT(0);
 	}
 	snprintf(name, size, "%s %u\n", prefix, descr.devid[node]);

+ 2 - 2
src/datawizard/user_interactions.c

@@ -157,7 +157,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency(starpu_data_handle_t h
 		wrapper->post_sync_task->name = "acquire_cb_post";
 		wrapper->post_sync_task->detach = 1;
 
-		new_task = _starpu_detect_implicit_data_deps_with_handle(wrapper->pre_sync_task, wrapper->post_sync_task, handle, mode);
+		new_task = _starpu_detect_implicit_data_deps_with_handle(wrapper->pre_sync_task, wrapper->post_sync_task, &_starpu_get_job_associated_to_task(wrapper->post_sync_task)->implicit_dep_slot, handle, mode);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 
 		if (new_task)
@@ -277,7 +277,7 @@ int starpu_data_acquire_on_node(starpu_data_handle_t handle, int node, enum star
 		wrapper.post_sync_task->name = "acquire_post";
 		wrapper.post_sync_task->detach = 1;
 
-		new_task = _starpu_detect_implicit_data_deps_with_handle(wrapper.pre_sync_task, wrapper.post_sync_task, handle, mode);
+		new_task = _starpu_detect_implicit_data_deps_with_handle(wrapper.pre_sync_task, wrapper.post_sync_task, &_starpu_get_job_associated_to_task(wrapper.post_sync_task)->implicit_dep_slot, handle, mode);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 		if (new_task)
 		{

+ 11 - 7
src/debug/traces/starpu_fxt.c

@@ -115,7 +115,9 @@ static double last_codelet_start[STARPU_NMAXWORKERS];
 static char last_codelet_symbol[STARPU_NMAXWORKERS][4*sizeof(unsigned long)];
 static int last_codelet_parameter[STARPU_NMAXWORKERS];
 #define MAX_PARAMETERS 8
+#ifdef STARPU_ENABLE_PAJE_CODELET_DETAILS
 static char last_codelet_parameter_description[STARPU_NMAXWORKERS][MAX_PARAMETERS][FXT_MAX_PARAMS*sizeof(unsigned long)];
+#endif
 
 /* If more than a period of time has elapsed, we flush the profiling info,
  * otherwise they are accumulated everytime there is a new relevant event. */
@@ -321,6 +323,7 @@ static void thread_pop_state(double time, const char *prefix, long unsigned int
 #endif
 }
 
+#ifdef STARPU_ENABLE_PAJE_CODELET_DETAILS
 static void worker_set_detailed_state(double time, const char *prefix, long unsigned int workerid, const char *name, unsigned long size, const char *parameters, unsigned long footprint, unsigned long long tag)
 {
 #ifdef STARPU_HAVE_POTI
@@ -332,6 +335,7 @@ static void worker_set_detailed_state(double time, const char *prefix, long unsi
 	fprintf(out_paje_file, "20	%.9f	%sw%lu	WS	%s	%lu	%s	%08lx	%016llx\n", time, prefix, workerid, name, size, parameters, footprint, tag);
 #endif
 }
+#endif
 
 static void mpicommthread_set_state(double time, const char *prefix, const char *name)
 {
@@ -2217,7 +2221,7 @@ void starpu_fxt_generate_trace(struct starpu_fxt_options *options)
 	{
 		unsigned inputfile;
 
-		uint64_t offsets[64];
+		uint64_t offsets[options->ninputfiles];
 
 		/*
 		 * Find the trace offsets:
@@ -2230,11 +2234,11 @@ void starpu_fxt_generate_trace(struct starpu_fxt_options *options)
 		 *	- psi_k(x) = x - offset_k
 		 */
 
-		int unique_keys[64];
-		int rank_k[64];
-		uint64_t start_k[64];
-		uint64_t sync_k[64];
-		unsigned sync_k_exists[64];
+		int unique_keys[options->ninputfiles];
+		int rank_k[options->ninputfiles];
+		uint64_t start_k[options->ninputfiles];
+		uint64_t sync_k[options->ninputfiles];
+		unsigned sync_k_exists[options->ninputfiles];
 		uint64_t M = 0;
 
 		unsigned found_one_sync_point = 0;
@@ -2305,7 +2309,7 @@ void starpu_fxt_generate_trace(struct starpu_fxt_options *options)
 #endif
 
 			char file_prefix[32];
-			snprintf(file_prefix, 32, "%d_", filerank);
+			snprintf(file_prefix, sizeof(file_prefix), "%d_", filerank);
 
 			options->file_prefix = file_prefix;
 			options->file_offset = offsets[inputfile];

+ 46 - 47
src/drivers/cuda/driver_cuda.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -372,7 +372,7 @@ void _starpu_init_cuda(void)
 	STARPU_ASSERT(ncudagpus <= STARPU_MAXCUDADEVS);
 }
 
-static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *args)
+static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worker)
 {
 	int ret;
 
@@ -396,11 +396,11 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *args)
 		return -EAGAIN;
 	}
 
-	_starpu_driver_start_job(args, j, &args->perf_arch, &j->cl_start, 0, profiling);
+	_starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, profiling);
 
 #if defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 	/* We make sure we do manipulate the proper device */
-	starpu_cuda_set_device(args->devid);
+	starpu_cuda_set_device(worker->devid);
 #endif
 
 	starpu_cuda_func_t func = _starpu_task_get_cuda_nth_implementation(cl, j->nimpl);
@@ -410,7 +410,7 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *args)
 	{
 		_STARPU_TRACE_START_EXECUTING();
 #ifdef STARPU_SIMGRID
-		_starpu_simgrid_execute_job(j, &args->perf_arch, NAN);
+		_starpu_simgrid_execute_job(j, &worker->perf_arch, NAN);
 #else
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
 #endif
@@ -420,18 +420,18 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *args)
 	return 0;
 }
 
-static void finish_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *args)
+static void finish_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worker)
 {
 	struct timespec codelet_end;
 
 	int profiling = starpu_profiling_status_get();
 
 	_starpu_set_current_task(NULL);
-	args->current_task = NULL;
+	worker->current_task = NULL;
 
-	_starpu_driver_end_job(args, j, &args->perf_arch, &codelet_end, 0, profiling);
+	_starpu_driver_end_job(worker, j, &worker->perf_arch, &codelet_end, 0, profiling);
 
-	_starpu_driver_update_job_feedback(j, args, &args->perf_arch, &j->cl_start, &codelet_end, profiling);
+	_starpu_driver_update_job_feedback(j, worker, &worker->perf_arch, &j->cl_start, &codelet_end, profiling);
 
 	_starpu_push_task_output(j);
 
@@ -441,18 +441,18 @@ static void finish_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 /* XXX Should this be merged with _starpu_init_cuda ? */
 int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 {
-	struct _starpu_worker *args = &worker_set->workers[0];
-	unsigned devid = args->devid;
+	struct _starpu_worker *worker = &worker_set->workers[0];
+	unsigned devid = worker->devid;
 	unsigned i;
 
-	_starpu_worker_start(args, _STARPU_FUT_CUDA_KEY);
+	_starpu_worker_start(worker, _STARPU_FUT_CUDA_KEY);
 
 #ifdef STARPU_USE_FXT
-	unsigned memnode = args->memory_node;
+	unsigned memnode = worker->memory_node;
 	for (i = 1; i < worker_set->nworkers; i++)
 	{
-		struct _starpu_worker *worker = &worker_set->workers[i];
-		_STARPU_TRACE_WORKER_INIT_START(_STARPU_FUT_CUDA_KEY, worker->workerid, devid, memnode);
+		struct _starpu_worker *_worker = &worker_set->workers[i];
+		_STARPU_TRACE_WORKER_INIT_START(_STARPU_FUT_CUDA_KEY, _worker->workerid, devid, memnode);
 	}
 #endif
 
@@ -461,14 +461,14 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 #endif
 
 	_starpu_cuda_limit_gpu_mem_if_needed(devid);
-	_starpu_memory_manager_set_global_memory_size(args->memory_node, _starpu_cuda_get_global_mem_size(devid));
+	_starpu_memory_manager_set_global_memory_size(worker->memory_node, _starpu_cuda_get_global_mem_size(devid));
 
-	_starpu_malloc_init(args->memory_node);
+	_starpu_malloc_init(worker->memory_node);
 
 	/* one more time to avoid hacks from third party lib :) */
-	_starpu_bind_thread_on_cpu(args->config, args->bindid);
+	_starpu_bind_thread_on_cpu(worker->config, worker->bindid);
 
-	args->status = STATUS_UNKNOWN;
+	worker->status = STATUS_UNKNOWN;
 
 	float size = (float) global_mem[devid] / (1<<30);
 #ifdef STARPU_SIMGRID
@@ -482,27 +482,26 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 #if defined(STARPU_HAVE_BUSID) && !defined(STARPU_SIMGRID)
 #if defined(STARPU_HAVE_DOMAINID) && !defined(STARPU_SIMGRID)
 	if (props[devid].pciDomainID)
-		snprintf(args->name, sizeof(args->name), "CUDA %u (%s %.1f GiB %04x:%02x:%02x.0)", devid, devname, size, props[devid].pciDomainID, props[devid].pciBusID, props[devid].pciDeviceID);
+		snprintf(worker->name, sizeof(worker->name), "CUDA %u (%s %.1f GiB %04x:%02x:%02x.0)", devid, devname, size, props[devid].pciDomainID, props[devid].pciBusID, props[devid].pciDeviceID);
 	else
 #endif
-		snprintf(args->name, sizeof(args->name), "CUDA %u (%s %.1f GiB %02x:%02x.0)", devid, devname, size, props[devid].pciBusID, props[devid].pciDeviceID);
+		snprintf(worker->name, sizeof(worker->name), "CUDA %u (%s %.1f GiB %02x:%02x.0)", devid, devname, size, props[devid].pciBusID, props[devid].pciDeviceID);
 #else
-	snprintf(args->name, sizeof(args->name), "CUDA %u (%s %.1f GiB)", devid, devname, size);
+	snprintf(worker->name, sizeof(worker->name), "CUDA %u (%s %.1f GiB)", devid, devname, size);
 #endif
-	snprintf(args->short_name, sizeof(args->short_name), "CUDA %u", devid);
-	_STARPU_DEBUG("cuda (%s) dev id %u thread is ready to run on CPU %d !\n", devname, devid, args->bindid);
+	snprintf(worker->short_name, sizeof(worker->short_name), "CUDA %u", devid);
+	_STARPU_DEBUG("cuda (%s) dev id %u thread is ready to run on CPU %d !\n", devname, devid, worker->bindid);
 
 	for (i = 0; i < worker_set->nworkers; i++)
 	{
-		struct _starpu_worker *worker = &worker_set->workers[i];
-		_STARPU_TRACE_WORKER_INIT_END(worker->workerid);
+		_STARPU_TRACE_WORKER_INIT_END(worker_set->workers[i].workerid);
 	}
 
 	/* tell the main thread that this one is ready */
-	STARPU_PTHREAD_MUTEX_LOCK(&args->mutex);
-	args->worker_is_initialized = 1;
-	STARPU_PTHREAD_COND_SIGNAL(&args->ready_cond);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&args->mutex);
+	STARPU_PTHREAD_MUTEX_LOCK(&worker->mutex);
+	worker->worker_is_initialized = 1;
+	STARPU_PTHREAD_COND_SIGNAL(&worker->ready_cond);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&worker->mutex);
 
 	/* tell the main thread that this one is ready */
 	STARPU_PTHREAD_MUTEX_LOCK(&worker_set->mutex);
@@ -528,10 +527,10 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 	idle = 0;
 	for (i = 0; i < (int) worker_set->nworkers; i++)
 	{
-		struct _starpu_worker *args = &worker_set->workers[i];
-		int workerid = args->workerid;
+		struct _starpu_worker *worker = &worker_set->workers[i];
+		int workerid = worker->workerid;
 
-		task = args->current_task;
+		task = worker->current_task;
 
 		if (!task)
 		{
@@ -549,8 +548,8 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		else
 		{
 			/* Asynchronous task completed! */
-			_starpu_set_local_worker_key(args);
-			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), args);
+			_starpu_set_local_worker_key(worker);
+			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), worker);
 			idle++;
 #ifdef STARPU_USE_FXT
 			int k;
@@ -583,14 +582,14 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 
 	for (i = 0; i < (int) worker_set->nworkers; i++)
 	{
-		struct _starpu_worker *args = &worker_set->workers[i];
-		int workerid = args->workerid;
+		struct _starpu_worker *worker = &worker_set->workers[i];
+		int workerid = worker->workerid;
 
 		task = tasks[i];
 		if (!task)
 			continue;
 
-		_starpu_set_local_worker_key(args);
+		_starpu_set_local_worker_key(worker);
 
 		j = _starpu_get_job_associated_to_task(task);
 
@@ -603,7 +602,7 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		}
 
 		_STARPU_TRACE_END_PROGRESS(memnode);
-		res = start_job_on_cuda(j, args);
+		res = start_job_on_cuda(j, worker);
 
 		if (res)
 		{
@@ -644,7 +643,7 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 #if defined(STARPU_DEBUG) && !defined(STARPU_SIMGRID)
 			STARPU_ASSERT_MSG(cudaStreamQuery(starpu_cuda_get_local_stream()) == cudaSuccess, "CUDA codelets have to wait for termination of their kernels on the starpu_cuda_get_local_stream() stream");
 #endif
-			finish_job_on_cuda(j, args);
+			finish_job_on_cuda(j, worker);
 		}
 		_STARPU_TRACE_START_PROGRESS(memnode);
 	}
@@ -654,8 +653,8 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 
 int _starpu_cuda_driver_deinit(struct _starpu_worker_set *arg)
 {
-	struct _starpu_worker *args = &arg->workers[0];
-	unsigned memnode = args->memory_node;
+	struct _starpu_worker *worker = &arg->workers[0];
+	unsigned memnode = worker->memory_node;
 	_STARPU_TRACE_WORKER_DEINIT_START;
 
 	_starpu_handle_all_pending_node_data_requests(memnode);
@@ -676,16 +675,16 @@ int _starpu_cuda_driver_deinit(struct _starpu_worker_set *arg)
 	return 0;
 }
 
-void *_starpu_cuda_worker(void *arg)
+void *_starpu_cuda_worker(void *_arg)
 {
-	struct _starpu_worker_set* args = arg;
+	struct _starpu_worker_set* worker = _arg;
 
-	_starpu_cuda_driver_init(args);
+	_starpu_cuda_driver_init(worker);
 	_STARPU_TRACE_START_PROGRESS(memnode);
 	while (_starpu_machine_is_running())
-		_starpu_cuda_driver_run_once(args);
+		_starpu_cuda_driver_run_once(worker);
 	_STARPU_TRACE_END_PROGRESS(memnode);
-	_starpu_cuda_driver_deinit(args);
+	_starpu_cuda_driver_deinit(worker);
 
 	return NULL;
 }

+ 30 - 31
src/drivers/driver_common/driver_common.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2014  Inria
  *
@@ -34,13 +34,13 @@
 #define BACKOFF_MAX 32  /* TODO : use parameter to define them */
 #define BACKOFF_MIN 1
 
-void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, struct timespec *codelet_start, int rank, int profiling)
+void _starpu_driver_start_job(struct _starpu_worker *worker, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_start, int rank, int profiling)
 {
 	struct starpu_task *task = j->task;
 	struct starpu_codelet *cl = task->cl;
 	struct starpu_profiling_task_info *profiling_info;
 	int starpu_top=_starpu_top_status_get();
-	int workerid = args->workerid;
+	int workerid = worker->workerid;
 	unsigned calibrate_model = 0;
 
 	if (cl->model && cl->model->benchmarking)
@@ -52,7 +52,7 @@ void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j
 	if (j->task_size == 1)
 		_starpu_sched_pre_exec_hook(task);
 
-	args->status = STATUS_EXECUTING;
+	worker->status = STATUS_EXECUTING;
 	task->status = STARPU_TASK_RUNNING;
 
 	if (rank == 0)
@@ -77,13 +77,13 @@ void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j
 	_STARPU_TRACE_START_CODELET_BODY(j, j->nimpl, perf_arch, workerid);
 }
 
-void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
+void _starpu_driver_end_job(struct _starpu_worker *worker, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
 {
 	struct starpu_task *task = j->task;
 	struct starpu_codelet *cl = task->cl;
 	struct starpu_profiling_task_info *profiling_info = task->profiling_info;
 	int starpu_top=_starpu_top_status_get();
-	int workerid = args->workerid;
+	int workerid = worker->workerid;
 	unsigned calibrate_model = 0;
 
 	_STARPU_TRACE_END_CODELET_BODY(j, j->nimpl, perf_arch, workerid);
@@ -103,16 +103,16 @@ void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j,
 	if (starpu_top)
 		_starpu_top_task_ended(task,workerid,codelet_end);
 
-	args->status = STATUS_UNKNOWN;
+	worker->status = STATUS_UNKNOWN;
 }
-void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_worker *worker_args,
+void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_worker *worker,
 					struct starpu_perfmodel_arch* perf_arch,
 					struct timespec *codelet_start, struct timespec *codelet_end, int profiling)
 {
 	struct starpu_profiling_task_info *profiling_info = j->task->profiling_info;
 	struct timespec measured_ts;
 	double measured;
-	int workerid = worker_args->workerid;
+	int workerid = worker->workerid;
 	struct starpu_codelet *cl = j->task->cl;
 	int calibrate_model = 0;
 	int updated = 0;
@@ -171,7 +171,6 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 			const unsigned do_update_time_model = 1;
 			const double time_consumed = measured;
 #endif
-
 			if (do_update_time_model)
 			{
 				_starpu_update_perfmodel_history(j, j->task->cl->model, perf_arch, worker_args->devid, time_consumed, j->nimpl);
@@ -255,12 +254,12 @@ static void _starpu_worker_set_status_wakeup(int workerid)
 }
 
 
-static void _starpu_exponential_backoff(struct _starpu_worker *args)
+static void _starpu_exponential_backoff(struct _starpu_worker *worker)
 {
-	int delay = args->spinning_backoff;
+	int delay = worker->spinning_backoff;
 	
-	if (args->spinning_backoff < BACKOFF_MAX)
-		args->spinning_backoff<<=1; 
+	if (worker->spinning_backoff < BACKOFF_MAX)
+		worker->spinning_backoff<<=1; 
 	
 	while(delay--)
 		STARPU_UYIELD();
@@ -269,9 +268,9 @@ static void _starpu_exponential_backoff(struct _starpu_worker *args)
 
 
 /* Workers may block when there is no work to do at all. */
-struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int workerid, unsigned memnode)
+struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int workerid, unsigned memnode)
 {
-	STARPU_PTHREAD_MUTEX_LOCK(&args->sched_mutex);
+	STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
 	struct starpu_task *task;
 	unsigned needed = 1;
 	_starpu_worker_set_status_scheduling(workerid);
@@ -279,7 +278,7 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 	{
 		struct _starpu_sched_ctx *sched_ctx = NULL;
 		struct _starpu_sched_ctx_list *l = NULL;
-		for (l = args->sched_ctx_list; l; l = l->next)
+		for (l = worker->sched_ctx_list; l; l = l->next)
 		{
 			sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
 			if(sched_ctx && sched_ctx->id > 0 && sched_ctx->id < STARPU_NMAX_SCHED_CTXS)
@@ -290,13 +289,13 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 					/* don't let the worker sleep with the sched_mutex taken */
 					/* we need it until here bc of the list of ctxs of the workers
 					   that can change in another thread */
-					STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);
+					STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
 					needed = 0;
 					_starpu_sched_ctx_signal_worker_blocked(sched_ctx->id, workerid);
 					STARPU_PTHREAD_COND_WAIT(&sched_ctx->parallel_sect_cond[workerid], &sched_ctx->parallel_sect_mutex[workerid]);
 					_starpu_sched_ctx_signal_worker_woke_up(sched_ctx->id, workerid);
 					sched_ctx->parallel_sect[workerid] = 0;
-					STARPU_PTHREAD_MUTEX_LOCK(&args->sched_mutex);
+					STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
 				}
 				STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->parallel_sect_mutex[workerid]);
 			}
@@ -304,19 +303,19 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 				break;
 		}
 		/* don't worry if the value is not correct (no lock) it will do it next time */
-		if(args->tmp_sched_ctx != -1)
+		if(worker->tmp_sched_ctx != -1)
 		{
-			sched_ctx = _starpu_get_sched_ctx_struct(args->tmp_sched_ctx);
+			sched_ctx = _starpu_get_sched_ctx_struct(worker->tmp_sched_ctx);
 			STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->parallel_sect_mutex[workerid]);
 			if(sched_ctx->parallel_sect[workerid])
 			{
 //				needed = 0;
-				STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
 				_starpu_sched_ctx_signal_worker_blocked(sched_ctx->id, workerid);
 				STARPU_PTHREAD_COND_WAIT(&sched_ctx->parallel_sect_cond[workerid], &sched_ctx->parallel_sect_mutex[workerid]);
 				_starpu_sched_ctx_signal_worker_woke_up(sched_ctx->id, workerid);
 				sched_ctx->parallel_sect[workerid] = 0;
-				STARPU_PTHREAD_MUTEX_LOCK(&args->sched_mutex);
+				STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
 			}
 			STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->parallel_sect_mutex[workerid]);
 		}
@@ -324,7 +323,7 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 		needed = !needed;
 	}
 
-	task = _starpu_pop_task(args);
+	task = _starpu_pop_task(worker);
 
 	if (task == NULL)
 	{
@@ -335,17 +334,17 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 
 		_starpu_worker_set_status_sleeping(workerid);
 
-		if (_starpu_worker_can_block(memnode) && !_starpu_sched_ctx_last_worker_awake(args))
+		if (_starpu_worker_can_block(memnode) && !_starpu_sched_ctx_last_worker_awake(worker))
 		{
-			STARPU_PTHREAD_COND_WAIT(&args->sched_cond, &args->sched_mutex);
-			STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);
+			STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
+			STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
 		}
 		else
 		{
-			STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);			
+			STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
 			if (_starpu_machine_is_running())
 			{
-				_starpu_exponential_backoff(args);
+				_starpu_exponential_backoff(worker);
 #ifdef STARPU_SIMGRID
 				static int warned;
 				if (!warned)
@@ -364,9 +363,9 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 	_starpu_worker_set_status_scheduling_done(workerid);
 
 	_starpu_worker_set_status_wakeup(workerid);
-	args->spinning_backoff = BACKOFF_MIN;
+	worker->spinning_backoff = BACKOFF_MIN;
 
-	STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
 
 
 #ifdef HAVE_AYUDAME_H

+ 3 - 3
src/drivers/gordon/driver_gordon.c

@@ -99,10 +99,10 @@ static void starpu_to_gordon_buffers(struct _starpu_job *j, struct gordon_ppu_jo
 	}
 
 	/* count the number of in/inout/out buffers */
-	unsigned nbuffers = cl->nbuffers;
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
-		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(cl, buffer);
+		enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, buffer);
 
 		switch (mode)
 		{
@@ -122,7 +122,7 @@ static void starpu_to_gordon_buffers(struct _starpu_job *j, struct gordon_ppu_jo
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
 		unsigned gordon_buffer;
-		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(cl, buffer);
+		enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, buffer);
 
 		switch (mode)
 		{

+ 1 - 1
src/drivers/mp_common/source_common.c

@@ -429,7 +429,7 @@ static int _starpu_src_common_execute(struct _starpu_job *j,
 	_starpu_src_common_execute_kernel(node, kernel, worker->devid, task->cl->type,
 			(j->task_size > 1),
 			j->combined_workerid, task->handles,
-			task->interfaces, task->cl->nbuffers,
+			task->interfaces, STARPU_TASK_GET_NBUFFERS(task),
 			task->cl_arg, task->cl_arg_size);
 
 

+ 44 - 46
src/drivers/opencl/driver_opencl.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -562,29 +562,28 @@ void _starpu_opencl_init(void)
 #ifndef STARPU_SIMGRID
 static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname);
 #endif
-static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker *args);
-static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker *args);
+static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker *worker);
+static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker *worker);
 
-int _starpu_opencl_driver_init(struct _starpu_worker *args)
+int _starpu_opencl_driver_init(struct _starpu_worker *worker)
 {
-	int devid = args->devid;
-	int workerid = args->workerid;
+	int devid = worker->devid;
 
-	_starpu_worker_start(args, _STARPU_FUT_OPENCL_KEY);
+	_starpu_worker_start(worker, _STARPU_FUT_OPENCL_KEY);
 
 #ifndef STARPU_SIMGRID
 	_starpu_opencl_init_context(devid);
 #endif
 
 	/* one more time to avoid hacks from third party lib :) */
-	_starpu_bind_thread_on_cpu(args->config, args->bindid);
+	_starpu_bind_thread_on_cpu(worker->config, worker->bindid);
 
 	_starpu_opencl_limit_gpu_mem_if_needed(devid);
-	_starpu_memory_manager_set_global_memory_size(args->memory_node, _starpu_opencl_get_global_mem_size(devid));
+	_starpu_memory_manager_set_global_memory_size(worker->memory_node, _starpu_opencl_get_global_mem_size(devid));
 
-	_starpu_malloc_init(args->memory_node);
+	_starpu_malloc_init(worker->memory_node);
 
-	args->status = STATUS_UNKNOWN;
+	worker->status = STATUS_UNKNOWN;
 	float size = (float) global_mem[devid] / (1<<30);
 
 #ifdef STARPU_SIMGRID
@@ -594,26 +593,26 @@ int _starpu_opencl_driver_init(struct _starpu_worker *args)
 	char devname[128];
 	_starpu_opencl_get_device_name(devid, devname, 128);
 #endif
-	snprintf(args->name, sizeof(args->name), "OpenCL %u (%s %.1f GiB)", devid, devname, size);
-	snprintf(args->short_name, sizeof(args->short_name), "OpenCL %u", devid);
+	snprintf(worker->name, sizeof(worker->name), "OpenCL %u (%s %.1f GiB)", devid, devname, size);
+	snprintf(worker->short_name, sizeof(worker->short_name), "OpenCL %u", devid);
 
-	_STARPU_DEBUG("OpenCL (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, args->bindid);
+	_STARPU_DEBUG("OpenCL (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, worker->bindid);
 
-	_STARPU_TRACE_WORKER_INIT_END(workerid);
+	_STARPU_TRACE_WORKER_INIT_END(worker->workerid);
 
 	/* tell the main thread that this one is ready */
-	STARPU_PTHREAD_MUTEX_LOCK(&args->mutex);
-	args->worker_is_initialized = 1;
-	STARPU_PTHREAD_COND_SIGNAL(&args->ready_cond);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&args->mutex);
+	STARPU_PTHREAD_MUTEX_LOCK(&worker->mutex);
+	worker->worker_is_initialized = 1;
+	STARPU_PTHREAD_COND_SIGNAL(&worker->ready_cond);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&worker->mutex);
 
 	return 0;
 }
 
-int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
+int _starpu_opencl_driver_run_once(struct _starpu_worker *worker)
 {
-	int workerid = args->workerid;
-	unsigned memnode = args->memory_node;
+	int workerid = worker->workerid;
+	unsigned memnode = worker->memory_node;
 
 	struct _starpu_job *j;
 	struct starpu_task *task;
@@ -629,7 +628,7 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 		int err;
 		/* On-going asynchronous task, check for its termination first */
 
-		err = clGetEventInfo(task_events[args->devid], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, &size);
+		err = clGetEventInfo(task_events[worker->devid], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, &size);
 		STARPU_ASSERT(size == sizeof(cl_int));
 		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
 
@@ -643,7 +642,7 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 
 		/* Asynchronous task completed! */
 		_STARPU_TRACE_END_EXECUTING();
-		_starpu_opencl_stop_job(_starpu_get_job_associated_to_task(task), args);
+		_starpu_opencl_stop_job(_starpu_get_job_associated_to_task(task), worker);
 	}
 #endif /* STARPU_SIMGRID */
 
@@ -651,7 +650,7 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 
 	_STARPU_TRACE_END_PROGRESS(memnode);
 
-	task = _starpu_get_worker_task(args, workerid, memnode);
+	task = _starpu_get_worker_task(worker, workerid, memnode);
 
 	if (task == NULL)
 		return 0;
@@ -666,7 +665,7 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 		return 0;
 	}
 
-	res = _starpu_opencl_start_job(j, args);
+	res = _starpu_opencl_start_job(j, worker);
 
 	if (res)
 	{
@@ -688,7 +687,7 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 		/* Record event to synchronize with task termination later */
 		int err;
 		cl_command_queue queue;
-		starpu_opencl_get_queue(args->devid, &queue);
+		starpu_opencl_get_queue(worker->devid, &queue);
 		/* the function clEnqueueMarker is deprecated from
 		 * OpenCL version 1.2. We would like to use the new
 		 * function clEnqueueMarkerWithWaitList. We could do
@@ -698,7 +697,7 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 		 * 2 macros detect the function availability in the
 		 * ICD and not in the device implementation.
 		 */
-		err = clEnqueueMarker(queue, &task_events[args->devid]);
+		err = clEnqueueMarker(queue, &task_events[worker->devid]);
 		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
 		_STARPU_TRACE_START_EXECUTING();
 	}
@@ -710,18 +709,18 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 #endif
 	/* Synchronous execution */
 	{
-		_starpu_opencl_stop_job(j, args);
+		_starpu_opencl_stop_job(j, worker);
 	}
 	_STARPU_TRACE_START_PROGRESS(memnode);
 
 	return 0;
 }
 
-int _starpu_opencl_driver_deinit(struct _starpu_worker *args)
+int _starpu_opencl_driver_deinit(struct _starpu_worker *worker)
 {
 	_STARPU_TRACE_WORKER_DEINIT_START;
 
-	unsigned memnode = args->memory_node;
+	unsigned memnode = worker->memory_node;
 
 	_starpu_handle_all_pending_node_data_requests(memnode);
 
@@ -733,7 +732,7 @@ int _starpu_opencl_driver_deinit(struct _starpu_worker *args)
 	_starpu_malloc_shutdown(memnode);
 
 #ifndef STARPU_SIMGRID
-	unsigned devid   = args->devid;
+	unsigned devid   = worker->devid;
         _starpu_opencl_deinit_context(devid);
 #endif
 
@@ -742,15 +741,15 @@ int _starpu_opencl_driver_deinit(struct _starpu_worker *args)
 	return 0;
 }
 
-void *_starpu_opencl_worker(void *arg)
+void *_starpu_opencl_worker(void *_arg)
 {
-	struct _starpu_worker* args = arg;
+	struct _starpu_worker* worker = _arg;
 
-	_starpu_opencl_driver_init(args);
+	_starpu_opencl_driver_init(worker);
 	_STARPU_TRACE_START_PROGRESS(memnode);
 	while (_starpu_machine_is_running())
-		_starpu_opencl_driver_run_once(args);
-	_starpu_opencl_driver_deinit(args);
+		_starpu_opencl_driver_run_once(worker);
+	_starpu_opencl_driver_deinit(worker);
 	_STARPU_TRACE_END_PROGRESS(memnode);
 
 	return NULL;
@@ -803,7 +802,7 @@ cl_device_type _starpu_opencl_get_device_type(int devid)
 }
 #endif /* STARPU_USE_OPENCL */
 
-static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker *args)
+static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker *worker)
 {
 	int ret;
 
@@ -817,7 +816,7 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 	STARPU_ASSERT(cl);
 
 	_starpu_set_current_task(j->task);
-	args->current_task = j->task;
+	worker->current_task = j->task;
 
 	ret = _starpu_fetch_task_input(j);
 	if (ret != 0)
@@ -828,7 +827,7 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 		return -EAGAIN;
 	}
 
-	_starpu_driver_start_job(args, j, &args->perf_arch, &j->cl_start, 0, profiling);
+	_starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, profiling);
 
 	starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, j->nimpl);
 	STARPU_ASSERT_MSG(func, "when STARPU_OPENCL is defined in 'where', opencl_func or opencl_funcs has to be defined");
@@ -851,7 +850,7 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 		STARPU_ASSERT_MSG(profiling_info->used_cycles, "Application kernel must call starpu_opencl_collect_stats to collect simulated time");
 		length = ((double) profiling_info->used_cycles)/MSG_get_host_speed(MSG_host_self());
 	  #endif
-		_starpu_simgrid_execute_job(j, &args->perf_arch, length);
+		_starpu_simgrid_execute_job(j, &worker->perf_arch, length);
 #else
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
 #endif
@@ -860,18 +859,17 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 	return 0;
 }
 
-static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker *args)
+static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker *worker)
 {
 	struct timespec codelet_end;
 	int profiling = starpu_profiling_status_get();
 
 	_starpu_set_current_task(NULL);
-	args->current_task = NULL;
+	worker->current_task = NULL;
 
-	_starpu_driver_end_job(args, j, &args->perf_arch, &codelet_end, 0, profiling);
+	_starpu_driver_end_job(worker, j, &worker->perf_arch, &codelet_end, 0, profiling);
 
-	_starpu_driver_update_job_feedback(j, args, &args->perf_arch,
-					   &j->cl_start, &codelet_end, profiling);
+	_starpu_driver_update_job_feedback(j, worker, &worker->perf_arch, &j->cl_start, &codelet_end, profiling);
 
 	_starpu_push_task_output(j);
 

+ 1 - 1
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -82,7 +82,7 @@ static const float idle_power_maximum=10000.0;
 static int count_non_ready_buffers(struct starpu_task *task, unsigned node)
 {
 	int cnt = 0;
-	unsigned nbuffers = task->cl->nbuffers;
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 	unsigned index;
 
 	for (index = 0; index < nbuffers; index++)

+ 6 - 6
src/sched_policies/locality_work_stealing_policy.c

@@ -288,15 +288,15 @@ static void lws_add_workers(unsigned sched_ctx_id, int *workerids,unsigned nwork
 		for(;;)
 		{
 			neighbour = (struct starpu_tree*)it.value;
-			int workerids[STARPU_NMAXWORKERS];
-			int nworkers = _starpu_worker_get_workerids(neighbour->id, workerids);
+			int neigh_workerids[STARPU_NMAXWORKERS];
+			int neigh_nworkers = _starpu_worker_get_workerids(neighbour->id, neigh_workerids);
 			int w;
-			for(w = 0; w < nworkers; w++)
+			for(w = 0; w < neigh_nworkers; w++)
 			{
-				if(!it.visited[workerids[w]] && workers->present[workerids[w]])
+				if(!it.visited[neigh_workerids[w]] && workers->present[neigh_workerids[w]])
 				{
-					ws->proxlist[workerid][cnt++] = workerids[w];
-					it.visited[workerids[w]] = 1;
+					ws->proxlist[workerid][cnt++] = neigh_workerids[w];
+					it.visited[neigh_workerids[w]] = 1;
 				}
 			}
 			if(!workers->has_next(workers, &it))

+ 2 - 4
src/sched_policies/work_stealing_policy.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012  INRIA
  *
@@ -258,9 +258,7 @@ static inline unsigned select_worker(unsigned sched_ctx_id)
 }
 
 
-#ifdef STARPU_DEVEL
-#warning TODO rewrite ... this will not scale at all now
-#endif
+/* Note: this is not scalable work stealing,  use lws instead */
 static struct starpu_task *ws_pop_task(unsigned sched_ctx_id)
 {
 	struct _starpu_work_stealing_data *ws = (struct _starpu_work_stealing_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);

+ 10 - 5
src/util/starpu_task_insert.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -29,7 +29,7 @@ void starpu_codelet_pack_args(void **arg_buffer, size_t *arg_buffer_size, ...)
 
 	/* Compute the size */
 	va_start(varg_list, arg_buffer_size);
-	*arg_buffer_size = _starpu_task_insert_get_arg_size(varg_list);
+	_starpu_task_insert_get_args_size(varg_list, NULL, arg_buffer_size);
 	va_end(varg_list);
 
 	va_start(varg_list, arg_buffer_size);
@@ -71,11 +71,12 @@ struct starpu_task *_starpu_task_build_v(struct starpu_codelet *cl, const char*
 	void *arg_buffer = NULL;
 	va_list varg_list_copy;
 	size_t arg_buffer_size = 0;
+	unsigned nbuffers;
 
 	/* Compute the size */
 
 	va_copy(varg_list_copy, varg_list);
-	arg_buffer_size = _starpu_task_insert_get_arg_size(varg_list_copy);
+	_starpu_task_insert_get_args_size(varg_list_copy, &nbuffers, &arg_buffer_size);
 	va_end(varg_list_copy);
 
 	if (arg_buffer_size)
@@ -89,9 +90,13 @@ struct starpu_task *_starpu_task_build_v(struct starpu_codelet *cl, const char*
 	task->name = task_name;
 	task->cl_arg_free = cl_arg_free;
 
-	if (cl && cl->nbuffers > STARPU_NMAXBUFS)
+	if (cl && cl->nbuffers != STARPU_VARIABLE_NBUFFERS)
 	{
-		task->dyn_handles = malloc(cl->nbuffers * sizeof(starpu_data_handle_t));
+		STARPU_ASSERT_MSG(nbuffers == (unsigned) cl->nbuffers, "Incoherent number of buffers between cl (%d) and number of parameters (%u)", cl->nbuffers, nbuffers);
+	}
+	if (nbuffers > STARPU_NMAXBUFS)
+	{
+		task->dyn_handles = malloc(nbuffers * sizeof(starpu_data_handle_t));
 	}
 
 	va_copy(varg_list_copy, varg_list);

+ 17 - 6
src/util/starpu_task_insert_utils.c

@@ -41,12 +41,14 @@ void starpu_task_insert_callback_wrapper(void *_cl_arg_wrapper)
 		cl_arg_wrapper->callback_func(cl_arg_wrapper->callback_arg);
 }
 
-size_t _starpu_task_insert_get_arg_size(va_list varg_list)
+void _starpu_task_insert_get_args_size(va_list varg_list, unsigned *nbuffers, size_t *cl_arg_size)
 {
 	int arg_type;
 	size_t arg_buffer_size;
+	unsigned n;
 
 	arg_buffer_size = 0;
+	n = 0;
 
 	arg_buffer_size += sizeof(char);
 
@@ -55,11 +57,13 @@ size_t _starpu_task_insert_get_arg_size(va_list varg_list)
 		if (arg_type & STARPU_R || arg_type & STARPU_W || arg_type & STARPU_SCRATCH || arg_type & STARPU_REDUX)
 		{
 			(void)va_arg(varg_list, starpu_data_handle_t);
+			n++;
 		}
 		else if (arg_type==STARPU_DATA_ARRAY)
 		{
 			(void)va_arg(varg_list, starpu_data_handle_t*);
-			(void)va_arg(varg_list, int);
+			int nb_handles = va_arg(varg_list, int);
+			n += nb_handles;
 		}
 		else if (arg_type==STARPU_VALUE)
 		{
@@ -136,7 +140,10 @@ size_t _starpu_task_insert_get_arg_size(va_list varg_list)
 		}
 	}
 
-	return arg_buffer_size;
+	if (cl_arg_size)
+		*cl_arg_size = arg_buffer_size;
+	if (nbuffers)
+		*nbuffers = n;
 }
 
 int _starpu_codelet_pack_args(void **arg_buffer, size_t arg_buffer_size, va_list varg_list)
@@ -280,6 +287,8 @@ void _starpu_task_insert_create(void *arg_buffer, size_t arg_buffer_size, struct
 
 	prologue_pop_cl_arg_wrapper->callback_func = NULL;
 
+	(*task)->cl = cl;
+
 	while((arg_type = va_arg(varg_list, int)) != 0)
 	{
 		if (arg_type & STARPU_R || arg_type & STARPU_W || arg_type & STARPU_SCRATCH || arg_type & STARPU_REDUX)
@@ -292,7 +301,9 @@ void _starpu_task_insert_create(void *arg_buffer, size_t arg_buffer_size, struct
 			STARPU_ASSERT(cl != NULL);
 
 			STARPU_TASK_SET_HANDLE((*task), handle, current_buffer);
-			if (STARPU_CODELET_GET_MODE(cl, current_buffer))
+			if (cl->nbuffers == STARPU_VARIABLE_NBUFFERS)
+				STARPU_TASK_SET_MODE(*task, mode, current_buffer);
+			else if (STARPU_CODELET_GET_MODE(cl, current_buffer))
 			{
 				STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(cl, current_buffer) == mode,
 						   "The codelet <%s> defines the access mode %d for the buffer %d which is different from the mode %d given to starpu_task_insert\n",
@@ -426,9 +437,9 @@ void _starpu_task_insert_create(void *arg_buffer, size_t arg_buffer_size, struct
 		}
 	}
 
-	STARPU_ASSERT(cl == NULL || current_buffer == cl->nbuffers);
+	if (cl && cl->nbuffers == STARPU_VARIABLE_NBUFFERS)
+		(*task)->nbuffers = current_buffer;
 
-	(*task)->cl = cl;
 	(*task)->cl_arg = arg_buffer;
 	(*task)->cl_arg_size = arg_buffer_size;
 

+ 1 - 1
src/util/starpu_task_insert_utils.h

@@ -21,7 +21,7 @@
 #include <stdarg.h>
 #include <starpu.h>
 
-size_t _starpu_task_insert_get_arg_size(va_list varg_list);
+void _starpu_task_insert_get_args_size(va_list varg_list, unsigned *nbuffers, size_t *cl_arg_size);
 int _starpu_codelet_pack_args(void **arg_buffer, size_t arg_buffer_size, va_list varg_list);
 void _starpu_task_insert_create(void *arg_buffer, size_t arg_buffer_size, struct starpu_codelet *cl, struct starpu_task **task, va_list varg_list);
 

+ 2 - 2
src/worker_collection/worker_list.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2013  Université de Bordeaux 1
+ * Copyright (C) 2013-2014  Université de Bordeaux 1
  * Copyright (C) 2012-2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011-2013  INRIA
  *
@@ -173,7 +173,7 @@ static int list_remove(struct starpu_worker_collection *workers, int worker)
 static void _init_workers(int *workerids)
 {
 	unsigned i;
-	int nworkers = starpu_worker_get_count();
+	unsigned nworkers = starpu_worker_get_count();
 	for(i = 0; i < nworkers; i++)
 		workerids[i] = -1;
 	return;

+ 14 - 18
src/worker_collection/worker_tree.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2013  Université de Bordeaux 1
- * Copyright (C) 2012-2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012-2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011-2013  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -29,7 +29,7 @@ static unsigned tree_has_next(struct starpu_worker_collection *workers, struct s
 
 	struct starpu_tree *tree = (struct starpu_tree*)workers->workerids;
 	struct starpu_tree *neighbour = starpu_tree_get_neighbour(tree, (struct starpu_tree*)it->value, it->visited, workers->present);
-	
+
 	if(!neighbour)
 	{
 		starpu_tree_reset_visited(tree, it->visited);
@@ -58,7 +58,7 @@ static unsigned tree_has_next(struct starpu_worker_collection *workers, struct s
 static int tree_get_next(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
 {
 	int ret = -1;
-	
+
 	struct starpu_tree *tree = (struct starpu_tree *)workers->workerids;
 	struct starpu_tree *neighbour = NULL;
 	if(it->possible_value)
@@ -68,10 +68,10 @@ static int tree_get_next(struct starpu_worker_collection *workers, struct starpu
 	}
 	else
 		neighbour = starpu_tree_get_neighbour(tree, (struct starpu_tree*)it->value, it->visited, workers->present);
-	
+
 	STARPU_ASSERT_MSG(neighbour, "no element anymore");
-	
-	
+
+
 	int workerids[STARPU_NMAXWORKERS];
 	int nworkers = _starpu_worker_get_workerids(neighbour->id, workerids);
 	int w;
@@ -97,7 +97,7 @@ static unsigned tree_has_next_master(struct starpu_worker_collection *workers, s
 
 	struct starpu_tree *tree = (struct starpu_tree*)workers->workerids;
 	struct starpu_tree *neighbour = starpu_tree_get_neighbour(tree, (struct starpu_tree*)it->value, it->visited, workers->is_master);
-	
+
 	if(!neighbour)
 	{
 		starpu_tree_reset_visited(tree, it->visited);
@@ -126,7 +126,7 @@ static unsigned tree_has_next_master(struct starpu_worker_collection *workers, s
 static int tree_get_next_master(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
 {
 	int ret = -1;
-	
+
 	struct starpu_tree *tree = (struct starpu_tree *)workers->workerids;
 	struct starpu_tree *neighbour = NULL;
 	if(it->possible_value)
@@ -136,10 +136,10 @@ static int tree_get_next_master(struct starpu_worker_collection *workers, struct
 	}
 	else
 		neighbour = starpu_tree_get_neighbour(tree, (struct starpu_tree*)it->value, it->visited, workers->is_master);
-	
+
 	STARPU_ASSERT_MSG(neighbour, "no element anymore");
-	
-	
+
+
 	int workerids[STARPU_NMAXWORKERS];
 	int nworkers = _starpu_worker_get_workerids(neighbour->id, workerids);
 	int w;
@@ -160,23 +160,19 @@ static int tree_get_next_master(struct starpu_worker_collection *workers, struct
 
 static int tree_add(struct starpu_worker_collection *workers, int worker)
 {
-	struct starpu_tree *tree = (struct starpu_tree *)workers->workerids;
-
 	if(!workers->present[worker])
 	{
 		workers->present[worker] = 1;
 		workers->nworkers++;
 		return worker;
 	}
-	else 
+	else
 		return -1;
 }
 
 
 static int tree_remove(struct starpu_worker_collection *workers, int worker)
 {
-	struct starpu_tree *tree = (struct starpu_tree *)workers->workerids;
-
 	if(workers->present[worker])
 	{
 		workers->present[worker] = 0;
@@ -184,7 +180,7 @@ static int tree_remove(struct starpu_worker_collection *workers, int worker)
 		workers->nworkers--;
 		return worker;
 	}
-	else 
+	else
 		return -1;
 }
 
@@ -200,7 +196,7 @@ static void tree_init(struct starpu_worker_collection *workers)
 		workers->present[i] = 0;
 		workers->is_master[i] = 0;
 	}
-	
+
 	return;
 }
 

+ 1 - 2
tests/Makefile.am

@@ -108,7 +108,6 @@ XFAIL_TESTS=	errorcheck/invalid_blocking_calls
 
 noinst_PROGRAMS =				\
 	main/deprecated_func			\
-	main/deprecated_buffer			\
 	main/driver_api/init_run_deinit         \
 	main/driver_api/run_driver              \
 	main/deploop                            \
@@ -199,6 +198,7 @@ noinst_PROGRAMS =				\
 	datawizard/partition_lazy		\
 	datawizard/gpu_register   		\
 	datawizard/gpu_ptr_register   		\
+	datawizard/variable_parameters		\
 	datawizard/wt_host			\
 	datawizard/wt_broadcast			\
 	datawizard/readonly			\
@@ -443,7 +443,6 @@ datawizard_specific_node_SOURCES +=			\
 endif
 
 main_deprecated_func_CFLAGS = $(AM_CFLAGS) -Wno-deprecated-declarations
-main_deprecated_buffer_CFLAGS = $(AM_CFLAGS) -Wno-deprecated-declarations
 
 main_subgraph_repeat_SOURCES =		\
 	main/subgraph_repeat.c

+ 230 - 0
tests/datawizard/variable_parameters.c

@@ -0,0 +1,230 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2014  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <config.h>
+#include <starpu.h>
+#include "../helper.h"
+
+static starpu_data_handle_t handle1, handle2, handle3, handle4;
+
+/*
+ *	Increment codelet
+ */
+
+#ifdef STARPU_USE_OPENCL
+/* dummy OpenCL implementation */
+static void increment_opencl_kernel(void *descr[], void *cl_arg)
+{
+	int num = starpu_task_get_current()->nbuffers;
+	int i;
+
+	for (i = 0; i < num; i++)
+	{
+		cl_mem d_token = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[i]);
+		unsigned h_token;
+
+		cl_command_queue queue;
+		starpu_opencl_get_current_queue(&queue);
+
+		clEnqueueReadBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
+		h_token++;
+		clEnqueueWriteBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
+		clFinish(queue);
+	}
+}
+#endif
+
+
+#ifdef STARPU_USE_CUDA
+static void increment_cuda_kernel(void *descr[], void *arg)
+{
+	int num = starpu_task_get_current()->nbuffers;
+	int i;
+
+	for (i = 0; i < num; i++)
+	{
+		unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[i]);
+		unsigned host_token;
+
+		/* This is a dummy technique of course */
+		cudaMemcpyAsync(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+		cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+		host_token++;
+
+		cudaMemcpyAsync(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+	}
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+#endif
+
+static void increment_cpu_kernel(void *descr[], void *cl_arg)
+{
+	int num = starpu_task_get_current()->nbuffers;
+	int i;
+
+	for (i = 0; i < num; i++)
+	{
+		unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[i]);
+		*tokenptr = *tokenptr + 1;
+	}
+}
+
+static struct starpu_codelet increment_cl =
+{
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {increment_cuda_kernel, NULL},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {increment_opencl_kernel, NULL},
+#endif
+	.cpu_funcs = {increment_cpu_kernel, NULL},
+	.nbuffers = STARPU_VARIABLE_NBUFFERS,
+};
+
+int main(int argc, char **argv)
+{
+	unsigned *pvar = NULL;
+	int ret;
+	unsigned var1 = 0, var2 = 0, var3 = 0, var4 = 0;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_variable_data_register(&handle1, STARPU_MAIN_RAM, (uintptr_t)&var1, sizeof(unsigned));
+	starpu_variable_data_register(&handle2, STARPU_MAIN_RAM, (uintptr_t)&var2, sizeof(unsigned));
+	starpu_variable_data_register(&handle3, STARPU_MAIN_RAM, (uintptr_t)&var3, sizeof(unsigned));
+	starpu_variable_data_register(&handle4, STARPU_MAIN_RAM, (uintptr_t)&var4, sizeof(unsigned));
+
+#ifdef STARPU_QUICK_CHECK
+	unsigned nloops = 4;
+#else
+	unsigned nloops = 16;
+#endif
+
+	unsigned loop;
+	unsigned t;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		for (t = 0; t <= 4; t++)
+		{
+			struct starpu_task *task = starpu_task_create();
+			unsigned i;
+
+			task->cl = &increment_cl;
+			task->handles[0] = handle1;
+			task->handles[1] = handle2;
+			task->handles[2] = handle3;
+			task->handles[3] = handle4;
+			for (i = 0; i < t; i++)
+				task->modes[i] = STARPU_RW;
+			task->nbuffers = t;
+
+			ret = starpu_task_submit(task);
+			if (ret == -ENODEV) goto enodev;
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		}
+
+		starpu_task_insert(&increment_cl,
+				STARPU_RW, handle1,
+				0);
+		starpu_task_insert(&increment_cl,
+				STARPU_RW, handle1,
+				STARPU_RW, handle2,
+				0);
+		starpu_task_insert(&increment_cl,
+				STARPU_RW, handle1,
+				STARPU_RW, handle2,
+				STARPU_RW, handle3,
+				0);
+		starpu_task_insert(&increment_cl,
+				STARPU_RW, handle1,
+				STARPU_RW, handle2,
+				STARPU_RW, handle3,
+				STARPU_RW, handle4,
+				0);
+	}
+
+	ret = starpu_data_acquire(handle1, STARPU_R);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+	if (var1 != 8*nloops)
+	{
+		FPRINTF(stderr, "[end of loop] Value %u != Expected value %u\n", var1, 8*nloops);
+		starpu_data_release(handle1);
+		goto err;
+	}
+	starpu_data_release(handle1);
+
+	ret = starpu_data_acquire(handle2, STARPU_R);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+	if (var2 != 6*nloops)
+	{
+		FPRINTF(stderr, "[end of loop] Value %u != Expected value %u\n", var2, 6*nloops);
+		starpu_data_release(handle2);
+		goto err;
+	}
+	starpu_data_release(handle2);
+
+	ret = starpu_data_acquire(handle3, STARPU_R);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+	if (var3 != 4*nloops)
+	{
+		FPRINTF(stderr, "[end of loop] Value %u != Expected value %u\n", var3, 4*nloops);
+		starpu_data_release(handle3);
+		goto err;
+	}
+	starpu_data_release(handle3);
+
+	ret = starpu_data_acquire(handle4, STARPU_R);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+	if (var4 != 2*nloops)
+	{
+		FPRINTF(stderr, "[end of loop] Value %u != Expected value %u\n", var4, 2*nloops);
+		starpu_data_release(handle4);
+		goto err;
+	}
+	starpu_data_release(handle4);
+
+	starpu_data_unregister(handle1);
+	starpu_data_unregister(handle2);
+	starpu_data_unregister(handle3);
+	starpu_data_unregister(handle4);
+	starpu_shutdown();
+
+	return EXIT_SUCCESS;
+
+enodev:
+	starpu_data_unregister(handle1);
+	starpu_data_unregister(handle2);
+	starpu_data_unregister(handle3);
+	starpu_data_unregister(handle4);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+
+err:
+	starpu_data_unregister(handle1);
+	starpu_data_unregister(handle2);
+	starpu_data_unregister(handle3);
+	starpu_data_unregister(handle4);
+	starpu_shutdown();
+	return EXIT_FAILURE;
+}

+ 0 - 178
tests/main/deprecated_buffer.c

@@ -1,178 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include <config.h>
-#include <starpu.h>
-#include "../helper.h"
-
-void cpu_codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
-{
-	int *valin = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
-	int *valout = (int *)STARPU_VARIABLE_GET_PTR(descr[1]);
-
-	*valout = *valin;
-}
-
-struct starpu_codelet cl_with_mode =
-{
-	.name = "with_mode",
-	.cpu_funcs = {cpu_codelet, NULL},
-	.cpu_funcs_name = {"cpu_codelet", NULL},
-	.nbuffers = 2,
-	.modes = {STARPU_R, STARPU_W},
-};
-
-struct starpu_codelet cl_without_mode =
-{
-	.name = "without_mode",
-	.cpu_funcs = {cpu_codelet, NULL},
-	.cpu_funcs_name = {"cpu_codelet", NULL},
-	.nbuffers = 2
-};
-
-static
-int submit_codelet_task_insert(struct starpu_codelet cl, starpu_data_handle_t handles0, starpu_data_handle_t handles1)
-{
-	int ret;
-
-	ret = starpu_task_insert(&cl,
-				 STARPU_R, handles0,
-				 STARPU_W, handles1,
-				 0);
-	if (ret == -ENODEV) return ret;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
-
-	starpu_task_wait_for_all();
-	return 0;
-}
-
-static
-int submit_codelet_with_buffers(struct starpu_codelet cl, starpu_data_handle_t handles0, starpu_data_handle_t handles1)
-{
-	int ret;
-	struct starpu_task *task;
-
-	task = starpu_task_create();
-	task->cl = &cl;
-	task->buffers[0].handle = handles0;
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = handles1;
-	task->buffers[1].mode = STARPU_W;
-
-	ret = starpu_task_submit(task);
-	if (ret == -ENODEV) return ret;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-
-	starpu_task_wait_for_all();
-	return 0;
-}
-
-static
-int submit_codelet_with_handles(struct starpu_codelet cl, starpu_data_handle_t handles0, starpu_data_handle_t handles1)
-{
-	int ret;
-	struct starpu_task *task;
-
-	task = starpu_task_create();
-	task->cl = &cl;
-	task->handles[0] = handles0;
-	task->handles[1] = handles1;
-
-	ret = starpu_task_submit(task);
-	if (ret == -ENODEV) return ret;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-
-	starpu_task_wait_for_all();
-	return 0;
-}
-
-struct submit_task_func
-{
-	int (*func)(struct starpu_codelet cl, starpu_data_handle_t handles0, starpu_data_handle_t handles1);
-	char *name;
-};
-
-static
-int submit_codelet(struct starpu_codelet cl, struct submit_task_func func)
-{
-	int *x, *y;
-	starpu_malloc((void**)&x, sizeof(*x));
-	starpu_malloc((void**)&y, sizeof(*y));
-	*x = 42;
-	*y = 14;
-	starpu_data_handle_t handles[2];
-	int ret;
-
-	starpu_variable_data_register(&handles[0], STARPU_MAIN_RAM, (uintptr_t)x, sizeof(*x));
-	starpu_variable_data_register(&handles[1], STARPU_MAIN_RAM, (uintptr_t)y, sizeof(*y));
-
-	ret = func.func(cl, handles[0], handles[1]);
-	starpu_data_unregister(handles[0]);
-	starpu_data_unregister(handles[1]);
-
-	if (!ret)
-	{
-		FPRINTF(stderr, "%s when executing codelet <%s> with func <%s>\n", *x==*y?"success":"error", cl.name, func.name);
-		ret = (*x != *y);
-	}
-
-	starpu_free(x);
-	starpu_free(y);
-
-	return ret;
-}
-
-int main(int argc, char **argv)
-{
-	int ret;
-	struct submit_task_func task_insert = { .func = submit_codelet_task_insert, .name = "task_insert" };
-	struct submit_task_func with_buffers = { .func = submit_codelet_with_buffers, .name = "with_buffers" };
-	struct submit_task_func with_handles = { .func = submit_codelet_with_handles, .name = "with_handles" };
-
-	ret = starpu_initialize(NULL, &argc, &argv);
-	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-
-	ret = submit_codelet(cl_with_mode, task_insert);
-	if (ret == -ENODEV)
-	{
-		starpu_shutdown();
-		fprintf(stderr, "WARNING: No one can execute this task\n");
-		return STARPU_TEST_SKIPPED;
-	}
-
-	if (!ret)
-	{
-		ret = submit_codelet(cl_with_mode, with_buffers);
-	}
-	if (!ret)
-	{
-		ret = submit_codelet(cl_with_mode, with_handles);
-	}
-	if (!ret)
-	{
-		ret = submit_codelet(cl_without_mode, task_insert);
-	}
-	if (!ret)
-	{
-		ret = submit_codelet(cl_without_mode, with_buffers);
-	}
-	// We do not test the combination cl_without_mode with_handles as it is not expected to work
-
-	starpu_shutdown();
-
-	STARPU_RETURN(ret);
-}

+ 3 - 3
tests/main/restart.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2013  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2013-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -30,8 +30,8 @@
   #define N	10
 #endif
 
-struct timeval start;
-struct timeval end;
+static struct timeval start;
+static struct timeval end;
 
 int main(int argc, char **argv)
 {

+ 41 - 0
tests/main/subgraph_repeat_regenerate_tag.c

@@ -51,6 +51,9 @@ static unsigned niter = 16384;
 static struct starpu_task taskA, taskB, taskC, taskD;
 
 static unsigned loop_cnt = 0;
+static unsigned loop_cnt_A = 0;
+static unsigned loop_cnt_B = 0;
+static unsigned loop_cnt_C = 0;
 static unsigned *check_cnt;
 static starpu_pthread_cond_t cond = STARPU_PTHREAD_COND_INITIALIZER;
 static starpu_pthread_mutex_t mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
@@ -78,6 +81,39 @@ static struct starpu_codelet dummy_codelet =
 	.nbuffers = 1
 };
 
+static void callback_task_A(void *arg STARPU_ATTRIBUTE_UNUSED)
+{
+	loop_cnt_A++;
+
+	if (loop_cnt_A == niter)
+	{
+		/* We are done */
+		taskA.regenerate = 0;
+	}
+}
+
+static void callback_task_B(void *arg STARPU_ATTRIBUTE_UNUSED)
+{
+	loop_cnt_B++;
+
+	if (loop_cnt_B == niter)
+	{
+		/* We are done */
+		taskB.regenerate = 0;
+	}
+}
+
+static void callback_task_C(void *arg STARPU_ATTRIBUTE_UNUSED)
+{
+	loop_cnt_C++;
+
+	if (loop_cnt_C == niter)
+	{
+		/* We are done */
+		taskC.regenerate = 0;
+	}
+}
+
 static void callback_task_D(void *arg STARPU_ATTRIBUTE_UNUSED)
 {
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
@@ -127,6 +163,7 @@ int main(int argc, char **argv)
 	taskA.regenerate = 1; /* this task will be explicitely resubmitted if needed */
 	taskA.use_tag = 1;
 	taskA.tag_id = TAG_A;
+	taskA.callback_func = callback_task_A;
 	taskA.handles[0] = check_data;
 
 	starpu_task_init(&taskB);
@@ -136,6 +173,7 @@ int main(int argc, char **argv)
 	taskB.regenerate = 1;
 	taskB.use_tag = 1;
 	taskB.tag_id = TAG_B;
+	taskB.callback_func = callback_task_B;
 	taskB.handles[0] = check_data;
 
 	starpu_task_init(&taskC);
@@ -145,6 +183,7 @@ int main(int argc, char **argv)
 	taskC.regenerate = 1;
 	taskC.use_tag = 1;
 	taskC.tag_id = TAG_C;
+	taskC.callback_func = callback_task_C;
 	taskC.handles[0] = check_data;
 
 	starpu_task_init(&taskD);
@@ -184,6 +223,8 @@ int main(int argc, char **argv)
 
 	starpu_free(check_cnt);
 
+	starpu_data_unregister(check_data);
+
 	starpu_shutdown();
 
 	/* Cleanup the statically allocated tasks after shutdown, as StarPU is still working on it after the callback */

+ 28 - 0
tests/main/subgraph_repeat_tag.c

@@ -44,6 +44,8 @@ static unsigned niter = 16384;
 static struct starpu_task taskA, taskB, taskC, taskD;
 
 static unsigned loop_cnt = 0;
+static unsigned loop_cnt_B = 0;
+static unsigned loop_cnt_C = 0;
 static unsigned *check_cnt;
 static starpu_pthread_cond_t cond = STARPU_PTHREAD_COND_INITIALIZER;
 static starpu_pthread_mutex_t mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
@@ -71,6 +73,28 @@ static struct starpu_codelet dummy_codelet =
 	.nbuffers = 1
 };
 
+static void callback_task_B(void *arg STARPU_ATTRIBUTE_UNUSED)
+{
+	loop_cnt_B++;
+
+	if (loop_cnt_B == niter)
+	{
+		/* We are done */
+		taskB.regenerate = 0;
+	}
+}
+
+static void callback_task_C(void *arg STARPU_ATTRIBUTE_UNUSED)
+{
+	loop_cnt_C++;
+
+	if (loop_cnt_C == niter)
+	{
+		/* We are done */
+		taskC.regenerate = 0;
+	}
+}
+
 static void callback_task_D(void *arg STARPU_ATTRIBUTE_UNUSED)
 {
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
@@ -126,6 +150,7 @@ int main(int argc, char **argv)
 	taskB.cl_arg = &taskB;
 	taskB.cl_arg_size = sizeof(&taskB);
 	taskB.regenerate = 1;
+	taskB.callback_func = callback_task_B;
 	taskB.handles[0] = check_data;
 
 	starpu_task_init(&taskC);
@@ -133,6 +158,7 @@ int main(int argc, char **argv)
 	taskC.cl_arg = &taskC;
 	taskC.cl_arg_size = sizeof(&taskC);
 	taskC.regenerate = 1;
+	taskC.callback_func = callback_task_C;
 	taskC.handles[0] = check_data;
 
 	starpu_task_init(&taskD);
@@ -168,6 +194,8 @@ int main(int argc, char **argv)
 
 	starpu_free(check_cnt);
 
+	starpu_data_unregister(check_data);
+
 	starpu_shutdown();
 
 	/* Cleanup the statically allocated tasks after shutdown, as StarPU is still working on it after the callback */

+ 2 - 2
tests/microbenchs/matrix_as_vector.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -186,7 +186,7 @@ int check_size_on_device(uint32_t where, char *device_name)
 	matrix_codelet.nbuffers = 1;
 	if (where == STARPU_CPU) matrix_codelet.cpu_funcs[0] = matrix_cpu_func;
 	if (where == STARPU_CUDA) matrix_codelet.cuda_funcs[0] = matrix_cuda_func;
-	if (where == STARPU_CUDA) vector_codelet.cuda_flags[0] = STARPU_CUDA_ASYNC;
+	if (where == STARPU_CUDA) matrix_codelet.cuda_flags[0] = STARPU_CUDA_ASYNC;
 //	if (where == STARPU_OPENCL) matrix_codelet.opencl_funcs[0] = matrix_opencl_func;
 
 	for(nx=NX_MIN ; nx<=NX_MAX ; nx*=2)

+ 4 - 4
tools/starpu_fxt_stats.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -37,7 +37,7 @@ static uint64_t transfers[16][16];
 
 #define PROGNAME "starpu_fxt_stat"
 
-static void usage(char **argv)
+static void usage()
 {
 	fprintf(stderr, "Parse the log generated by FxT\n\n");
 	fprintf(stderr, "Usage: %s [ options ]\n", PROGNAME);
@@ -73,7 +73,7 @@ static void parse_args(int argc, char **argv, char **fin, char **fout)
 
 		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
 		{
-			usage(argv);
+			usage();
 			exit(EXIT_SUCCESS);
 		}
 
@@ -87,7 +87,7 @@ static void parse_args(int argc, char **argv, char **fin, char **fout)
 	if (!*fin)
 	{
 		fprintf(stderr, "Incorrect usage, aborting\n");
-                usage(argv);
+                usage();
 		exit(77);
 	}
 }

+ 18 - 12
tools/starpu_fxt_tool.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2014  Universite de Bordeaux 1
- * Copyright (C) 2012-2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012-2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,7 +24,7 @@
 
 #define PROGNAME "starpu_fxt_tool"
 
-static void usage(char **argv)
+static void usage()
 {
 	fprintf(stderr, "Generate a trace in the Paje format\n\n");
 	fprintf(stderr, "Usage: %s [ options ]\n", PROGNAME);
@@ -53,40 +53,46 @@ static void parse_args(int argc, char **argv)
 	unsigned reading_input_filenames = 0;
 
 	int i;
-	for (i = 1; i < argc; i++) {
-		if (strcmp(argv[i], "-c") == 0) {
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-c") == 0)
+		{
 			options.per_task_colour = 1;
 			reading_input_filenames = 0;
 			continue;
 		}
 
-		if (strcmp(argv[i], "-o") == 0) {
+		if (strcmp(argv[i], "-o") == 0)
+		{
 			options.out_paje_path = argv[++i];
 			reading_input_filenames = 0;
 			continue;
 		}
 
-		if (strcmp(argv[i], "-i") == 0) {
+		if (strcmp(argv[i], "-i") == 0)
+		{
 			options.filenames[options.ninputfiles++] = argv[++i];
 			reading_input_filenames = 1;
 			continue;
 		}
 
-		if (strcmp(argv[i], "-no-counter") == 0) {
+		if (strcmp(argv[i], "-no-counter") == 0)
+		{
 			options.no_counter = 1;
 			reading_input_filenames = 0;
 			continue;
 		}
 
-		if (strcmp(argv[i], "-no-bus") == 0) {
+		if (strcmp(argv[i], "-no-bus") == 0)
+		{
 			options.no_bus = 1;
 			reading_input_filenames = 0;
 			continue;
 		}
 
-		if (strcmp(argv[i], "-h") == 0
-		 || strcmp(argv[i], "--help") == 0) {
-			usage(argv);
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
+		{
+			usage();
 			exit(EXIT_SUCCESS);
 		}
 
@@ -109,7 +115,7 @@ static void parse_args(int argc, char **argv)
 	if (!options.ninputfiles)
 	{
 		fprintf(stderr, "Incorrect usage, aborting\n");
-                usage(argv);
+                usage();
 		exit(77);
 	}
 }

+ 15 - 3
tools/starpu_perfmodel_plot.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2014  Université de Bordeaux 1
- * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -170,6 +170,18 @@ static void parse_args(int argc, char **argv)
 
 }
 
+static char *replace_char(char *str, char old, char new)
+{
+	char *p = strdup(str);
+	char *ptr = p;
+	while (*ptr)
+	{
+		if (*ptr == old) *ptr = new;
+		ptr ++;
+	}
+	return p;
+}
+
 static void print_comma(FILE *gnuplot_file, int *first)
 {
 	if (*first)
@@ -308,7 +320,7 @@ static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_
 						if (arch_model->list)
 						{
 							print_comma(gnuplot_file, first);
-							fprintf(gnuplot_file, "\"%s\" using 1:%d:%d with errorlines title \"Average %s\"", avg_file_name, col, col+1, arch_name);
+							fprintf(gnuplot_file, "\"%s\" using 1:%d:%d with errorlines title \"Average %s\"", avg_file_name, col, col+1, replace_char(arch_name, '_', '-'));
 							col += 2;
 						}
 					}
@@ -503,7 +515,7 @@ static void display_selected_models(FILE *gnuplot_file, struct starpu_perfmodel
 	fprintf(gnuplot_file, "\n");
 	fprintf(gnuplot_file, "set term postscript eps enhanced color\n");
 	fprintf(gnuplot_file, "set output \"starpu_%s.eps\"\n", symbol);
-	fprintf(gnuplot_file, "set title \"Model for codelet %s\"\n", symbol);
+	fprintf(gnuplot_file, "set title \"Model for codelet %s\"\n", replace_char(symbol, '_', '-'));
 	fprintf(gnuplot_file, "set xlabel \"Total data size\"\n");
 	if (gflops)
 		fprintf(gnuplot_file, "set ylabel \"GFlops\"\n");