瀏覽代碼

Merge @9469:9571

Marc Sergent 12 年之前
父節點
當前提交
f722a8b4c2
共有 74 個文件被更改,包括 916 次插入731 次删除
  1. 5 0
      ChangeLog
  2. 4 1
      configure.ac
  3. 63 8
      doc/chapters/advanced-examples.texi
  4. 79 8
      doc/chapters/api.texi
  5. 2 1
      doc/chapters/basic-examples.texi
  6. 1 0
      doc/chapters/configuration.texi
  7. 4 3
      doc/chapters/perf-optimization.texi
  8. 13 1
      examples/Makefile.am
  9. 150 0
      examples/basic_examples/dynamic_handles.c
  10. 21 14
      examples/cholesky/cholesky.h
  11. 7 2
      examples/cholesky/cholesky_implicit.c
  12. 7 1
      examples/openmp/vector_scal.c
  13. 1 1
      examples/pi/pi.c
  14. 2 0
      include/starpu.h
  15. 3 0
      include/starpu_config.h.in
  16. 2 0
      include/starpu_deprecated_api.h
  17. 0 3
      include/starpu_sched_ctx.h
  18. 26 4
      include/starpu_task.h
  19. 2 2
      include/starpu_task_util.h
  20. 8 0
      include/starpu_top.h
  21. 0 1
      include/starpu_worker.h
  22. 11 6
      mpi/src/starpu_mpi_insert_task.c
  23. 0 1
      src/Makefile.am
  24. 0 10
      src/common/thread.h
  25. 1 0
      src/core/combined_workers.c
  26. 18 11
      src/core/dependencies/data_concurrency.c
  27. 3 3
      src/core/dependencies/implicit_data_deps.c
  28. 14 3
      src/core/jobs.c
  29. 10 0
      src/core/jobs.h
  30. 25 2
      src/core/parallel_task.c
  31. 0 24
      src/core/parallel_task.h
  32. 5 5
      src/core/perfmodel/perfmodel.c
  33. 2 2
      src/core/perfmodel/perfmodel_history.c
  34. 1 1
      src/core/sched_ctx.c
  35. 3 0
      src/core/sched_ctx.h
  36. 16 8
      src/core/sched_policy.c
  37. 35 11
      src/core/task.c
  38. 3 0
      src/core/task.h
  39. 7 0
      src/core/workers.c
  40. 8 7
      src/datawizard/coherency.c
  41. 1 1
      src/datawizard/filters.c
  42. 1 1
      src/datawizard/footprint.c
  43. 16 14
      src/datawizard/reduction.c
  44. 23 11
      src/debug/traces/starpu_fxt.c
  45. 8 6
      src/debug/traces/starpu_paje.c
  46. 1 1
      src/drivers/cpu/driver_cpu.c
  47. 1 1
      src/drivers/cuda/driver_cuda.c
  48. 3 3
      src/drivers/gordon/driver_gordon.c
  49. 2 2
      src/drivers/opencl/driver_opencl.c
  50. 21 15
      src/profiling/bound.c
  51. 118 108
      src/sched_policies/deque_modeling_policy_data_aware.c
  52. 3 4
      src/sched_policies/deque_queues.c
  53. 0 1
      src/sched_policies/deque_queues.h
  54. 1 2
      src/sched_policies/detect_combined_workers.c
  55. 2 15
      src/sched_policies/eager_central_policy.c
  56. 0 13
      src/sched_policies/eager_central_priority_policy.c
  57. 0 2
      src/sched_policies/fifo_queues.h
  58. 40 61
      src/sched_policies/parallel_eager.c
  59. 17 40
      src/sched_policies/parallel_heft.c
  60. 1 16
      src/sched_policies/random_policy.c
  61. 0 1
      src/sched_policies/stack_queues.h
  62. 0 15
      src/sched_policies/work_stealing_policy.c
  63. 0 11
      src/starpu_parameters.h
  64. 2 2
      src/top/starpu_top_core.h
  65. 6 3
      src/top/starpu_top_task.c
  66. 2 2
      src/util/starpu_data_cpy.c
  67. 12 6
      src/util/starpu_insert_task.c
  68. 19 14
      src/util/starpu_insert_task_utils.c
  69. 4 4
      src/util/starpu_insert_task_utils.h
  70. 1 0
      tests/Makefile.am
  71. 2 6
      tests/main/insert_task.c
  72. 5 3
      tools/Makefile.am
  73. 0 156
      tools/cbc2paje.c
  74. 42 47
      tools/lp2paje.c

+ 5 - 0
ChangeLog

@@ -119,6 +119,8 @@ New features:
     pthread API. It is provided with 2 implementations: a pthread one
     and a Simgrid one. Applications using StarPU and wishing to use
     the Simgrid StarPU features should use it.
+  * Allow to have a dynamically allocated number of buffers per task,
+    and so override the value defined by --enable-maxbuffers=XXX
 
 Small features:
   * Add starpu_worker_get_by_type and starpu_worker_get_by_devid
@@ -134,6 +136,9 @@ Small features:
   * New configure option --enable-mpi-progression-hook to enable the
     activity polling method for StarPU-MPI.
   * Permit to disable sequential consistency for a given task.
+  * New macro STARPU_RELEASE_VERSION
+  * New function starpu_get_version() to return as 3 integers the
+    release version of StarPU.
 
 Changes:
   * Fix the block filter functions.

+ 4 - 1
configure.ac

@@ -25,11 +25,14 @@ dnl Versioning.
 
 STARPU_MAJOR_VERSION="`echo $PACKAGE_VERSION | cut -d . -f 1`"
 STARPU_MINOR_VERSION="`echo $PACKAGE_VERSION | cut -d . -f 2`"
+STARPU_RELEASE_VERSION="`echo $PACKAGE_VERSION | cut -d . -f 3`"
 AC_SUBST([STARPU_MAJOR_VERSION])
 AC_SUBST([STARPU_MINOR_VERSION])
+AC_SUBST([STARPU_RELEASE_VERSION])
 AC_SUBST([STARPU_EFFECTIVE_VERSION])
 AC_DEFINE_UNQUOTED([STARPU_MAJOR_VERSION], [$STARPU_MAJOR_VERSION], [Major version number of StarPU.])
 AC_DEFINE_UNQUOTED([STARPU_MINOR_VERSION], [$STARPU_MINOR_VERSION], [Minor version number of StarPU.])
+AC_DEFINE_UNQUOTED([STARPU_RELEASE_VERSION], [$STARPU_RELEASE_VERSION], [Release version number of StarPU.])
 
 . "$srcdir/STARPU-VERSION"
 AC_SUBST([LIBSTARPU_INTERFACE_CURRENT])
@@ -264,7 +267,7 @@ AC_MSG_RESULT($max_sched_ctxs)
 AC_DEFINE_UNQUOTED(STARPU_NMAX_SCHED_CTXS, [$max_sched_ctxs], [Maximum number of sched_ctxs supported])
 
 AC_ARG_ENABLE([sc_hypervisor],
-  [AS_HELP_STRING([--enable-sct-hypervisor],
+  [AS_HELP_STRING([--enable-sc-hypervisor],
     [enable resizing contexts (experimental)])],
   [enable_sc_hypervisor="yes"],
   [enable_sc_hypervisor="no"])

+ 63 - 8
doc/chapters/advanced-examples.texi

@@ -23,6 +23,7 @@
 * Defining a New Scheduling Policy::
 * On-GPU rendering::
 * Defining a New Data Interface::
+* Setting the Data Handles for a Task::
 * More examples::               More examples shipped with StarPU
 @end menu
 
@@ -473,14 +474,15 @@ probably use @code{lp_solve -timeout 1 test.pl -wmps test.mps} to convert the
 problem to MPS format and then use a better solver, @code{glpsol} might be
 better than @code{lp_solve} for instance (the @code{--pcost} option may be
 useful), but sometimes doesn't manage to converge. @code{cbc} might look
-slower, but it is parallel. Be sure to try at least all the @code{-B} options
-of @code{lp_solve}. For instance, we often just use
-@code{lp_solve -cc -B1 -Bb -Bg -Bp -Bf -Br -BG -Bd -Bs -BB -Bo -Bc -Bi} , and
-the @code{-gr} option can also be quite useful.
+slower, but it is parallel. For @code{lp_solve}, be sure to try at least all the
+@code{-B} options. For instance, we often just use @code{lp_solve -cc -B1 -Bb
+-Bg -Bp -Bf -Br -BG -Bd -Bs -BB -Bo -Bc -Bi} , and the @code{-gr} option can
+also be quite useful. The resulting schedule can be observed by using the
+@code{starpu_lp2paje} tool, which converts it into the Paje format.
 
 Data transfer time can only be taken into account when @code{deps} is set. Only
 data transfers inferred from implicit data dependencies between tasks are taken
-into account.
+into account. Other data transfers are assumed to be completely overlapped.
 
 Setting @code{deps} to 0 will only take into account the actual computations
 on processing units. It however still properly takes into account the varying
@@ -492,9 +494,6 @@ the priorities as the StarPU scheduler would, i.e. schedule prioritized
 tasks before less prioritized tasks, to check to which extend this results
 to a less optimal solution. This increases even more computation time.
 
-Note that for simplicity, all this however doesn't take into account data
-transfers, which are assumed to be completely overlapped.
-
 @node Insert Task Utility
 @section Insert Task Utility
 
@@ -1264,6 +1263,62 @@ void display_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args
 
 The whole code for this complex data interface is available in the
 directory @code{examples/interface/}.
+
+@node Setting the Data Handles for a Task
+@section Setting the Data Handles for a Task
+
+The number of data handles a task can manage is bounded by the macro
+@code{STARPU_NMAXBUFS}, whose default value can be changed
+through the configure option @code{--enable-maxbuffers} (see
+@ref{--enable-maxbuffers}).
+
+However, it is possible to define tasks managing more data by using
+the field @code{dyn_handles} when defining a task and the field
+@code{dyn_modes} when defining the corresponding codelet.
+
+@cartouche
+@smallexample
+enum starpu_access_mode modes[STARPU_NMAXBUFS+1] = @{
+	STARPU_R, STARPU_R, ...
+@};
+
+struct starpu_codelet dummy_big_cl =
+@{
+	.cuda_funcs = @{dummy_big_kernel, NULL@},
+	.opencl_funcs = @{dummy_big_kernel, NULL@},
+	.cpu_funcs = @{dummy_big_kernel, NULL@},
+	.nbuffers = STARPU_NMAXBUFS+1,
+	.dyn_modes = modes
+@};
+
+task = starpu_task_create();
+task->cl = &dummy_big_cl;
+task->dyn_handles = malloc(task->cl->nbuffers * sizeof(starpu_data_handle_t));
+for(i=0 ; i<task->cl->nbuffers ; i++)
+@{
+	task->dyn_handles[i] = handle;
+@}
+starpu_task_submit(task);
+@end smallexample
+@end cartouche
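+
+The same kind of task can also be submitted with
+@code{starpu_insert_task}, by passing the array of handles through
+the @code{STARPU_DATA_ARRAY} argument:
+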
+@cartouche
+@smallexample
+starpu_data_handle_t *handles = malloc(dummy_big_cl.nbuffers * sizeof(starpu_data_handle_t));
+for(i=0 ; i<dummy_big_cl.nbuffers ; i++)
+@{
+	handles[i] = handle;
+@}
+starpu_insert_task(&dummy_big_cl,
+        	 STARPU_VALUE, &dummy_big_cl.nbuffers, sizeof(dummy_big_cl.nbuffers),
+		 STARPU_DATA_ARRAY, handles, dummy_big_cl.nbuffers,
+		 0);
+@end smallexample
+@end cartouche
+
+The whole code for this example is available in the
+file @code{examples/basic_examples/dynamic_handles.c}.
+
 @node More examples
 @section More examples
 

+ 79 - 8
doc/chapters/api.texi

@@ -47,6 +47,14 @@ Define the major version of StarPU
 Define the minor version of StarPU
 @end defmac
 
+@defmac STARPU_RELEASE_VERSION
+Define the release version of StarPU
+@end defmac
+
+@deftypefun void starpu_get_version (int *@var{major}, int *@var{minor}, int *@var{release})
+Return as 3 integers the release version of StarPU.
+@end deftypefun
+
 @node Initialization and Termination
 @section Initialization and Termination
 
@@ -1898,6 +1906,17 @@ exceed @code{STARPU_NMAXBUFS}.
 If unsufficient, this value can be set with the @code{--enable-maxbuffers}
 option when configuring StarPU.
 
+@item @code{enum starpu_access_mode *dyn_modes}
+Is an array of @code{enum starpu_access_mode}. It describes the
+required access modes to the data needed by the codelet (e.g.
+@code{STARPU_RW}). The number of entries in this array must be
+specified in the @code{nbuffers} field (defined above).
+This field should be used for codelets having a number of data
+greater than @code{STARPU_NMAXBUFS} (@pxref{Setting the Data Handles
+for a Task}).
+When defining a codelet, one should either define this field or the
+field @code{modes} defined above.
+
 @item @code{struct starpu_perfmodel *model} (optional)
 This is a pointer to the task duration performance model associated to this
 codelet. This optional field is ignored when set to @code{NULL} or
@@ -1913,8 +1932,8 @@ involved in the parallel execution.
 @item @code{unsigned long per_worker_stats[STARPU_NMAXWORKERS]} (optional)
 Statistics collected at runtime: this is filled by StarPU and should not be
 accessed directly, but for example by calling the
-@code{starpu_display_codelet_stats} function (See
-@ref{starpu_display_codelet_stats} for details).
+@code{starpu_codelet_display_stats} function (See
+@ref{starpu_codelet_display_stats} for details).
 
 @item @code{const char *name} (optional)
 Define the name of the codelet. This can be useful for debugging purposes.
@@ -1923,6 +1942,7 @@ Define the name of the codelet. This can be useful for debugging purposes.
 @end deftp
 
 @deftypefun void starpu_codelet_init ({struct starpu_codelet} *@var{cl})
+@anchor{starpu_codelet_init}
 Initialize @var{cl} with default values. Codelets should preferably be
 initialized statically as shown in @ref{Defining a Codelet}. However
 such a initialisation is not always possible, e.g. when using C++.
@@ -1983,10 +2003,25 @@ of entries in this array must be specified in the @code{nbuffers} field of the
 If unsufficient, this value can be set with the @code{--enable-maxbuffers}
 option when configuring StarPU.
 
+@item @code{starpu_data_handle_t *dyn_handles}
+Is an array of @code{starpu_data_handle_t}. It specifies the handles
+to the different pieces of data accessed by the task. The number
+of entries in this array must be specified in the @code{nbuffers} field of the
+@code{struct starpu_codelet} structure.
+This field should be used for tasks having a number of data
+greater than @code{STARPU_NMAXBUFS} (@pxref{Setting the Data Handles
+for a Task}).
+When defining a task, one should either define this field or the
+field @code{handles} defined above.
+
 @item @code{void *interfaces[STARPU_NMAXBUFS]}
 The actual data pointers to the memory node where execution will happen, managed
 by the DSM.
 
+@item @code{void **dyn_interfaces}
+The actual data pointers to the memory node where execution will happen, managed
+by the DSM. Is used when the field @code{dyn_handles} is defined.
+
 @item @code{void *cl_arg} (optional; default: @code{NULL})
 This pointer is passed to the codelet through the second argument
 of the codelet implementation (e.g. @code{cpu_func} or @code{cuda_func}).
@@ -2134,6 +2169,37 @@ value. This is equivalent to initializing a starpu_task structure with
 the @code{starpu_task_init} function defined above.
 @end defmac
 
+@defmac STARPU_TASK_GET_HANDLE ({struct starpu_task} *@var{task}, int @var{i})
+Return the i-th data handle of the given task. Depending on whether
+the task is defined with a static or dynamic number of handles, this
+returns either the i-th element of the field @code{handles} or the
+i-th element of the field @code{dyn_handles}
+(@pxref{Setting the Data Handles for a Task}).
+@end defmac
+
+@defmac STARPU_TASK_SET_HANDLE ({struct starpu_task} *@var{task}, starpu_data_handle_t @var{handle}, int @var{i})
+Set the i-th data handle of the given task to the given data handle.
+Depending on whether the task is defined with a static or dynamic
+number of handles, this sets either the i-th element of the field
+@code{handles} or the i-th element of the field @code{dyn_handles}
+(@pxref{Setting the Data Handles for a Task}).
+@end defmac
+
+@defmac STARPU_CODELET_GET_MODE ({struct starpu_codelet *}@var{codelet}, int @var{i})
+Return the access mode of the i-th data handle of the given codelet.
+Depending on whether the codelet is defined with a static or dynamic
+number of handles, this returns either the i-th element of the field
+@code{modes} or the i-th element of the field @code{dyn_modes}
+(@pxref{Setting the Data Handles for a Task}).
+@end defmac
+
+@defmac STARPU_CODELET_SET_MODE ({struct starpu_codelet *}@var{codelet}, {enum starpu_access_mode} @var{mode}, int @var{i})
+Set the access mode of the i-th data handle of the given codelet.
+Depending on whether the codelet is defined with a static or dynamic
+number of handles, this sets either the i-th element of the field
+@code{modes} or the i-th element of the field @code{dyn_modes}
+(@pxref{Setting the Data Handles for a Task}).
+@end defmac
+
 @deftypefun {struct starpu_task *} starpu_task_create (void)
 Allocate a task structure and initialize it with default values. Tasks
 allocated dynamically with @code{starpu_task_create} are automatically freed when the
@@ -2145,6 +2211,10 @@ by the task have to be freed by calling
 @code{starpu_task_destroy}.
 @end deftypefun
 
+@deftypefun {struct starpu_task *} starpu_task_dup ({struct starpu_task *}@var{task})
+Allocate a task structure which is the exact duplicate of the given task.
+@end deftypefun
+
 @deftypefun void starpu_task_clean ({struct starpu_task} *@var{task})
 Release all the structures automatically allocated to execute @var{task}, but
 not the task structure itself and values set by the user remain unchanged.
@@ -2218,8 +2288,8 @@ NULL if it is called either from a thread that is not a task or simply
 because there is no task being executed at the moment.
 @end deftypefun
 
-@deftypefun void starpu_display_codelet_stats ({struct starpu_codelet} *@var{cl})
-@anchor{starpu_display_codelet_stats}
+@deftypefun void starpu_codelet_display_stats ({struct starpu_codelet} *@var{cl})
+@anchor{starpu_codelet_display_stats}
 Output on @code{stderr} some statistics on the codelet @var{cl}.
 @end deftypefun
 
@@ -3650,6 +3720,11 @@ Get the description of a combined worker
 Variant of starpu_worker_can_execute_task compatible with combined workers
 @end deftypefun
 
+@deftypefun void starpu_parallel_task_barrier_init ({struct starpu_task* }@var{task}, int @var{best_workerid})
+Initialise the barrier for the parallel task, and dispatch the task
+between the different combined workers
+@end deftypefun
+
 @deftp {Data Type} {struct starpu_machine_topology}
 @table @asis
 @item @code{unsigned nworkers}
@@ -3776,10 +3851,6 @@ Delete the worker collection of the specified scheduling context
 Return the worker collection managed by the indicated context
 @end deftypefun
 
-@deftypefun pthread_mutex_t* starpu_sched_ctx_get_changing_ctx_mutex (unsigned @var{sched_ctx_id})
-TODO
-@end deftypefun
-
 @deftypefun void starpu_sched_ctx_set_context (unsigned *@var{sched_ctx_id})
 Set the scheduling context the subsequent tasks will be submitted to
 @end deftypefun

+ 2 - 1
doc/chapters/basic-examples.texi

@@ -140,7 +140,8 @@ struct starpu_codelet cl =
 A codelet is a structure that represents a computational kernel. Such a codelet
 may contain an implementation of the same kernel on different architectures
 (e.g. CUDA, x86, ...). For compatibility, make sure that the whole
-structure is initialized to zero, either by using memset, or by letting the
+structure is properly initialized to zero, either by using the
+function @code{starpu_codelet_init} (@pxref{starpu_codelet_init}), or by letting the
 compiler implicitly do it as examplified above.
 
 The @code{nbuffers} field specifies the number of data buffers that are

+ 1 - 0
doc/chapters/configuration.texi

@@ -234,6 +234,7 @@ Enable gathering of various data statistics (@pxref{Data statistics}).
 @end defvr
 
 @defvr {Configure option} --enable-maxbuffers
+@anchor{--enable-maxbuffers}
 Define the maximum number of buffers that tasks will be able to take
 as parameters, then available as the @code{STARPU_NMAXBUFS} macro.
 @end defvr

+ 4 - 3
doc/chapters/perf-optimization.texi

@@ -409,9 +409,10 @@ STARPU_BUS_STATS=1} and @code{export STARPU_WORKER_STATS=1} .
 
 Due to CUDA limitations, StarPU will have a hard time overlapping its own
 communications and the codelet computations if the application does not use a
-dedicated CUDA stream for its computations. StarPU provides one by the use of
-@code{starpu_cuda_get_local_stream()} which should be used by all CUDA codelet
-operations. For instance:
+dedicated CUDA stream for its computations instead of the default stream,
+which synchronizes all operations of the GPU. StarPU provides one by the use
+of @code{starpu_cuda_get_local_stream()} which can be used by all CUDA codelet
+operations to avoid this issue. For instance:
 
 @cartouche
 @smallexample

+ 13 - 1
examples/Makefile.am

@@ -52,7 +52,6 @@ EXTRA_DIST = 					\
 	basic_examples/variable_kernels_opencl_kernel.cl	\
 	matvecmult/matvecmult_kernel.cl				\
 	basic_examples/block_opencl_kernel.cl			\
-	openmp/vector_scal.c			\
 	filters/fblock_opencl_kernel.cl		\
 	filters/custom_mf/conversion_opencl.cl  \
 	filters/custom_mf/custom_opencl.cl \
@@ -159,6 +158,7 @@ examplebin_PROGRAMS +=				\
 	basic_examples/block			\
 	basic_examples/variable			\
 	basic_examples/multiformat              \
+	basic_examples/dynamic_handles		\
 	cpp/incrementer_cpp			\
 	filters/custom_mf/custom_mf_filter      \
 	filters/fvector				\
@@ -876,6 +876,18 @@ pipeline_pipeline_LDADD =		\
 	$(STARPU_BLAS_LDFLAGS)
 endif
 
+##################
+# openmp example #
+##################
+
+if !STARPU_HAVE_WINDOWS
+examplebin_PROGRAMS +=		\
+	openmp/vector_scal_omp
+
+openmp_vector_scal_omp_CFLAGS = \
+	$(AM_CFLAGS) -fopenmp
+endif
+
 showcheck:
 	-cat $(TEST_LOGS) /dev/null
 	for i in $(SUBDIRS) ; do \

+ 150 - 0
examples/basic_examples/dynamic_handles.c

@@ -0,0 +1,150 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+static void dummy_small_kernel(void *descr[], void *cl_arg)
+{
+	int nb_data;
+	int i;
+
+	starpu_codelet_unpack_args(cl_arg, &nb_data);
+	assert(nb_data == 1);
+	FPRINTF(stderr, "Number of data: %d\n", nb_data);
+
+	for(i=0 ; i<nb_data; i++)
+	{
+		int *val = (int *)STARPU_VARIABLE_GET_PTR(descr[i]);
+		assert(*val == 42);
+	}
+}
+
+static void dummy_big_kernel(void *descr[], void *cl_arg)
+{
+	int nb_data;
+	int i;
+
+	starpu_codelet_unpack_args(cl_arg, &nb_data);
+	assert(nb_data == 9);
+	FPRINTF(stderr, "Number of data: %d\n", nb_data);
+
+	for(i=0 ; i<nb_data; i++)
+	{
+		int *val = (int *)STARPU_VARIABLE_GET_PTR(descr[i]);
+		assert(*val == 42);
+	}
+}
+
+static struct starpu_codelet dummy_small_cl =
+{
+	.cuda_funcs = {dummy_small_kernel, NULL},
+	.opencl_funcs = {dummy_small_kernel, NULL},
+	.cpu_funcs = {dummy_small_kernel, NULL},
+	.modes = {STARPU_RW},
+	.nbuffers = 1
+};
+
+struct starpu_codelet dummy_big_cl =
+{
+	.cuda_funcs = {dummy_big_kernel, NULL},
+	.opencl_funcs = {dummy_big_kernel, NULL},
+	.cpu_funcs = {dummy_big_kernel, NULL},
+	.nbuffers = STARPU_NMAXBUFS+1
+};
+
+int main(int argc, char **argv)
+{
+	starpu_data_handle_t handle, *handles;
+	int ret;
+	int val=42;
+	unsigned i;
+	struct starpu_task *task, *task2;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	dummy_big_cl.dyn_modes = malloc(dummy_big_cl.nbuffers * sizeof(enum starpu_access_mode));
+	for(i=0 ; i<dummy_big_cl.nbuffers ; i++)
+	     dummy_big_cl.dyn_modes[i] = STARPU_RW;
+
+	starpu_variable_data_register(&handle, 0, (uintptr_t)&val, sizeof(int));
+
+	task = starpu_task_create();
+	task->synchronous = 1;
+	task->cl = &dummy_small_cl;
+	starpu_codelet_pack_args(&task->cl_arg, &task->cl_arg_size,
+				 STARPU_VALUE, &(task->cl->nbuffers), sizeof(task->cl->nbuffers),
+				 0);
+	task->dyn_handles = malloc(sizeof(starpu_data_handle_t));
+	task->dyn_handles[0] = handle;
+	ret = starpu_task_submit(task);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	task2 = starpu_task_create();
+	task2->synchronous = 1;
+	task2->cl = &dummy_big_cl;
+	starpu_codelet_pack_args(&task2->cl_arg, &task2->cl_arg_size,
+				 STARPU_VALUE, &task2->cl->nbuffers, sizeof(task2->cl->nbuffers),
+				 0);
+	task2->dyn_handles = malloc(task2->cl->nbuffers * sizeof(starpu_data_handle_t));
+	for(i=0 ; i<task2->cl->nbuffers ; i++)
+	{
+		task2->dyn_handles[i] = handle;
+	}
+	ret = starpu_task_submit(task2);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	ret = starpu_insert_task(&dummy_small_cl,
+				 STARPU_VALUE, &dummy_small_cl.nbuffers, sizeof(dummy_small_cl.nbuffers),
+				 STARPU_RW, handle,
+				 0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+        ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+
+	handles = malloc(dummy_big_cl.nbuffers * sizeof(starpu_data_handle_t));
+	for(i=0 ; i<dummy_big_cl.nbuffers ; i++)
+	{
+		handles[i] = handle;
+	}
+	ret = starpu_insert_task(&dummy_big_cl,
+				 STARPU_VALUE, &dummy_big_cl.nbuffers, sizeof(dummy_big_cl.nbuffers),
+				 STARPU_DATA_ARRAY, handles, dummy_big_cl.nbuffers,
+				 0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+        ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+	free(handles);
+
+	starpu_data_unregister(handle);
+	free(dummy_big_cl.dyn_modes);
+	starpu_shutdown();
+
+	return EXIT_SUCCESS;
+
+enodev:
+	starpu_data_unregister(handle);
+	free(dummy_big_cl.dyn_modes);
+	starpu_shutdown();
+	return 77;
+}

+ 21 - 14
examples/cholesky/cholesky.h

@@ -122,6 +122,7 @@ static unsigned check = 0;
 static unsigned bound = 0;
 static unsigned bound_deps = 0;
 static unsigned bound_lp = 0;
+static unsigned bound_mps = 0;
 static unsigned with_ctxs = 0;
 static unsigned with_noctxs = 0;
 static unsigned chole1 = 0;
@@ -150,77 +151,83 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 		{
 			with_ctxs = 1;
 			break;
-		}
+		} else
 		if (strcmp(argv[i], "-with_noctxs") == 0) 
 		{
 			with_noctxs = 1;
 			break;
-		}
+		} else
 		
 		if (strcmp(argv[i], "-chole1") == 0) 
 		{
 			chole1 = 1;
 			break;
-		}
+		} else
 
 		if (strcmp(argv[i], "-chole2") == 0) 
 		{
 			chole2 = 1;
 			break;
-		}
+		} else
 
 		if (strcmp(argv[i], "-size") == 0)
 		{
 		        char *argptr;
 			size = strtol(argv[++i], &argptr, 10);
-		}
+		} else
 
 		if (strcmp(argv[i], "-nblocks") == 0)
 		{
 		        char *argptr;
 			nblocks = strtol(argv[++i], &argptr, 10);
-		}
+		} else
 
 		if (strcmp(argv[i], "-nbigblocks") == 0)
 		{
 		        char *argptr;
 			nbigblocks = strtol(argv[++i], &argptr, 10);
-		}
+		} else
 
 		if (strcmp(argv[i], "-no-pin") == 0)
 		{
 			pinned = 0;
-		}
+		} else
 
 		if (strcmp(argv[i], "-no-prio") == 0)
 		{
 			noprio = 1;
-		}
+		} else
 
 		if (strcmp(argv[i], "-bound") == 0)
 		{
 			bound = 1;
-		}
+		} else
 
 		if (strcmp(argv[i], "-bound-lp") == 0)
 		{
 			bound_lp = 1;
-		}
+		} else
+
+		if (strcmp(argv[i], "-bound-mps") == 0)
+		{
+			bound_mps = 1;
+		} else
 
 		if (strcmp(argv[i], "-bound-deps") == 0)
 		{
 			bound_deps = 1;
-		}
+		} else
 
 		if (strcmp(argv[i], "-check") == 0)
 		{
 			check = 1;
-		}
+		} else
 
-		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i],"--help") == 0)
+		/* if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i],"--help") == 0) */
 		{
 			fprintf(stderr,"usage : %s [-size size] [-nblocks nblocks] [-no-pin] [-no-prio] [-bound] [-bound-deps] [-bound-lp] [-check]\n", argv[0]);
 			fprintf(stderr,"Currently selected: %ux%u and %ux%u blocks\n", size, size, nblocks, nblocks);
+			exit(0);
 		}
 	}
 }

+ 7 - 2
examples/cholesky/cholesky_implicit.c

@@ -89,7 +89,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
 	start = starpu_timing_now();
 
-	if (bound)
+	if (bound || bound_lp || bound_mps)
 		starpu_bound_start(bound_deps, 0);
 	/* create all the DAG nodes */
 	for (k = 0; k < nblocks; k++)
@@ -140,7 +140,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 	}
 
 	starpu_task_wait_for_all();
-	if (bound)
+	if (bound || bound_lp || bound_mps)
 		starpu_bound_stop();
 
 	end = starpu_timing_now();
@@ -162,6 +162,11 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 			FILE *f = fopen("cholesky.lp", "w");
 			starpu_bound_print_lp(f);
 		}
+		if (bound_mps)
+		{
+			FILE *f = fopen("cholesky.mps", "w");
+			starpu_bound_print_mps(f);
+		}
 		if (bound)
 		{
 			double res;

+ 7 - 1
examples/openmp/vector_scal.c

@@ -25,7 +25,12 @@
 #include <stdio.h>
 #include <limits.h>
 
+#ifdef STARPU_QUICK_CHECK
+#define	NX	2048
+#else
 #define	NX	2048000
+#endif
+
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
 
 void scal_cpu_func(void *buffers[], void *_args)
@@ -94,7 +99,8 @@ int main(int argc, char **argv)
 
 	float factor = 1.001;
 
-	for (i = 0; i < 100; i++) {
+	for (i = 0; i < 100; i++)
+	{
 		struct starpu_task *task = starpu_task_create();
 
 		task->cl = &cl;

+ 1 - 1
examples/pi/pi.c

@@ -198,7 +198,7 @@ int main(int argc, char **argv)
 	FPRINTF(stderr, "Total time : %f ms\n", timing/1000.0);
 	FPRINTF(stderr, "Speed : %f GShot/s\n", total_shot_cnt/(1e3*timing));
 
-	if (!getenv("STARPU_SSILENT")) starpu_display_codelet_stats(&pi_cl);
+	if (!getenv("STARPU_SSILENT")) starpu_codelet_display_stats(&pi_cl);
 
 	starpu_shutdown();
 

+ 2 - 0
include/starpu.h

@@ -153,6 +153,8 @@ int starpu_asynchronous_opencl_copy_disabled(void);
 void starpu_profiling_init();
 void starpu_display_stats();
 
+void starpu_get_version(int *major, int *minor, int *release);
+
 #ifdef __cplusplus
 }
 #endif

+ 3 - 0
include/starpu_config.h.in

@@ -20,6 +20,7 @@
 
 #undef STARPU_MAJOR_VERSION
 #undef STARPU_MINOR_VERSION
+#undef STARPU_RELEASE_VERSION
 
 #undef STARPU_USE_CPU
 #undef STARPU_USE_CUDA
@@ -113,4 +114,6 @@ struct timespec
 #undef STARPU_HAVE_RINTF
 #undef STARPU_USE_TOP
 
+#undef STARPU_HAVE_HWLOC
+
 #endif

+ 2 - 0
include/starpu_deprecated_api.h

@@ -88,6 +88,8 @@ typedef enum starpu_access_mode starpu_access_mode;
 #define starpu_depth_block_filter_func_block		starpu_block_filter_depth_block
 #define starpu_depth_block_shadow_filter_func_block	starpu_block_filter_depth_block_shadow
 
+#define starpu_display_codelet_stats		starpu_codelet_display_stats
+
 #endif /* STARPU_USE_DEPRECATED_ONE_ZERO_API */
 
 #ifdef __cplusplus

+ 0 - 3
include/starpu_sched_ctx.h

@@ -48,9 +48,6 @@ void starpu_sched_ctx_delete(unsigned sched_ctx_id);
 /* indicate which context whill inherit the resources of this context when he will be deleted */
 void starpu_sched_ctx_set_inheritor(unsigned sched_ctx_id, unsigned inheritor);
 
-/* mutex synchronising several simultaneous modifications of a context */
-starpu_pthread_mutex_t* starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id);
-
 /* indicate that the current thread is submitting only to the current context */
 void starpu_sched_ctx_set_context(unsigned *sched_ctx_id);
 

+ 26 - 4
include/starpu_task.h

@@ -96,6 +96,7 @@ struct starpu_codelet
 	unsigned nbuffers;
 	/* which are the access modes for these buffers */
 	enum starpu_access_mode modes[STARPU_NMAXBUFS];
+	enum starpu_access_mode *dyn_modes;
 
 	/* performance model of the codelet */
 	struct starpu_perfmodel *model;
@@ -104,7 +105,7 @@ struct starpu_codelet
 	struct starpu_perfmodel *power_model;
 
 	/* statistics collected at runtime: this is filled by StarPU and should
-	 * not be accessed directly (use the starpu_display_codelet_stats
+	 * not be accessed directly (use the starpu_codelet_display_stats
 	 * function instead for instance). */
 	unsigned long per_worker_stats[STARPU_NMAXWORKERS];
 
@@ -120,6 +121,9 @@ struct starpu_task
 	starpu_data_handle_t handles[STARPU_NMAXBUFS];
 	void *interfaces[STARPU_NMAXBUFS];
 
+	starpu_data_handle_t *dyn_handles;
+	void **dyn_interfaces;
+
 	/* arguments not managed by the DSM are given as a buffer */
 	void *cl_arg;
 	/* in case the argument buffer has to be uploaded explicitely */
@@ -240,9 +244,17 @@ struct starpu_task
 	.sched_ctx = 0,					\
 	.hypervisor_tag = 0,				\
 	.flops = 0.0,					\
-		.scheduled = 0				\
+	.scheduled = 0,					\
+	.dyn_handles = NULL,				\
+	.dyn_interfaces = NULL				\
 }
 
+#define STARPU_TASK_GET_HANDLE(task, i) ((task->dyn_handles) ? task->dyn_handles[i] : task->handles[i])
+#define STARPU_TASK_SET_HANDLE(task, handle, i) do { if (task->dyn_handles) task->dyn_handles[i] = handle; else task->handles[i] = handle; } while(0)
+
+#define STARPU_CODELET_GET_MODE(codelet, i) ((codelet->dyn_modes) ? codelet->dyn_modes[i] : codelet->modes[i])
+#define STARPU_CODELET_SET_MODE(codelet, mode, i) do { if (codelet->dyn_modes) codelet->dyn_modes[i] = mode; else codelet->modes[i] = mode; } while(0)
+
 /*
  * handle task dependencies: it is possible to associate a task with a unique
  * "tag" and to express dependencies between tasks by the means of those tags
@@ -317,10 +329,13 @@ int starpu_task_submit_to_ctx(struct starpu_task *task, unsigned sched_ctx_id);
  * indicates that the waited task was either synchronous or detached. */
 int starpu_task_wait(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
 
-/* This function waits until all the tasks that were already submitted have
+/* This function waits until all the tasks that were already submitted 
+ * (to the current context or the global one if there aren't any) have
  * been executed. */
 int starpu_task_wait_for_all(void);
 
+/* This function waits until all the tasks that were already submitted to the 
+ * context have been executed */
 int starpu_task_wait_for_all_in_ctx(unsigned sched_ctx_id);
 
 /* This function waits until there is no more ready task. */
@@ -331,13 +346,20 @@ int starpu_task_nsubmitted(void);
 
 void starpu_codelet_init(struct starpu_codelet *cl);
 
-void starpu_display_codelet_stats(struct starpu_codelet *cl);
+void starpu_codelet_display_stats(struct starpu_codelet *cl);
 
 /* Return the task currently executed by the worker, or NULL if this is called
  * either from a thread that is not a task or simply because there is no task
  * being executed at the moment. */
 struct starpu_task *starpu_task_get_current(void);
 
+/* initialise the barrier for the parallel task, so that all workers start it
+ * at the same time */
+void starpu_parallel_task_barrier_init(struct starpu_task* task, int workerid);
+
+/* duplicate the given task */
+struct starpu_task *starpu_task_dup(struct starpu_task *task);
+
 #ifdef __cplusplus
 }
 #endif

+ 2 - 2
include/starpu_task_util.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -57,7 +57,7 @@ void starpu_codelet_unpack_args(void *cl_arg, ...);
 
 /* Pack arguments of type STARPU_VALUE into a buffer which can be
  * given to a codelet and later unpacked with starpu_codelet_unpack_args */
-void starpu_codelet_pack_args(char **arg_buffer, size_t *arg_buffer_size, ...);
+void starpu_codelet_pack_args(void **arg_buffer, size_t *arg_buffer_size, ...);
 
 #ifdef __cplusplus
 }

+ 8 - 0
include/starpu_top.h

@@ -195,6 +195,14 @@ void starpu_top_update_data_float(const struct starpu_top_data* data,
 				  double value);
 
 /*
+ * This function notifies the UI that the task has been planned to
+ * run from start to end on a computation core
+ */
+void starpu_top_task_prevision(struct starpu_task *task,
+			       int devid, unsigned long long start,
+			       unsigned long long end);
+
+/*
  * This functions are usefull in debug mode. The starpu developper doesn't need
  * to check if the debug mode is active.
  * This is checked by starpu_top itsefl.

+ 0 - 1
include/starpu_worker.h

@@ -123,7 +123,6 @@ int starpu_combined_worker_get_id(void);
 int starpu_combined_worker_get_size(void);
 int starpu_combined_worker_get_rank(void);
 
-
 /* This function returns the type of worker associated to an identifier (as
  * returned by the starpu_worker_get_id function). The returned value indicates
  * the architecture of the worker: STARPU_CPU_WORKER for a CPU core,

+ 11 - 6
mpi/src/starpu_mpi_insert_task.c

@@ -24,6 +24,7 @@
 #include <common/uthash.h>
 #include <util/starpu_insert_task_utils.h>
 #include <datawizard/coherency.h>
+#include <core/task.h>
 
 #include <starpu_mpi_private.h>
 
@@ -369,7 +370,7 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 	int me, do_execute, xrank, nb_nodes;
 	size_t *size_on_nodes;
 	size_t arg_buffer_size = 0;
-	char *arg_buffer = NULL;
+	void *arg_buffer = NULL;
 	int dest=0, inconsistent_execute;
 	int current_data = 0;
 
@@ -420,7 +421,7 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 			int i;
 			for(i=0 ; i<nb_handles ; i++)
 			{
-				enum starpu_access_mode mode = codelet->modes[current_data];
+				enum starpu_access_mode mode = STARPU_CODELET_GET_MODE(codelet, current_data);
 				int ret = _starpu_mpi_find_executee_node(datas[i], mode, me, &do_execute, &inconsistent_execute, &dest, size_on_nodes);
 				if (ret == -EINVAL)
 				{
@@ -531,7 +532,7 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 
 			for(i=0 ; i<nb_handles ; i++)
 			{
-				_starpu_mpi_exchange_data_before_execution(datas[i], codelet->modes[current_data], me, dest, do_execute, comm);
+				_starpu_mpi_exchange_data_before_execution(datas[i], STARPU_CODELET_GET_MODE(codelet, current_data), me, dest, do_execute, comm);
 				current_data++;
 			}
 		}
@@ -590,12 +591,16 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 		if (arg_buffer_size)
 		{
 			va_start(varg_list, codelet);
-			_starpu_codelet_pack_args(arg_buffer_size, &arg_buffer, varg_list);
+			_starpu_codelet_pack_args(&arg_buffer, arg_buffer_size, varg_list);
 		}
 
 		_STARPU_MPI_DEBUG(1, "Execution of the codelet %p (%s)\n", codelet, codelet->name);
 		va_start(varg_list, codelet);
 		struct starpu_task *task = starpu_task_create();
+		if (codelet->nbuffers > STARPU_NMAXBUFS)
+		{
+			task->dyn_handles = malloc(codelet->nbuffers * sizeof(starpu_data_handle_t));
+		}
 		int ret = _starpu_insert_task_create_and_submit(arg_buffer, arg_buffer_size, codelet, &task, varg_list);
 		STARPU_ASSERT_MSG(ret==0, "_starpu_insert_task_create_and_submit failure %d", ret);
 	}
@@ -622,7 +627,7 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 
 				for(i=0 ; i<nb_handles ; i++)
 				{
-					_starpu_mpi_exchange_data_after_execution(datas[i], codelet->modes[current_data], me, xrank, dest, do_execute, comm);
+					_starpu_mpi_exchange_data_after_execution(datas[i], STARPU_CODELET_GET_MODE(codelet, current_data), me, xrank, dest, do_execute, comm);
 					current_data++;
 				}
 			}
@@ -692,7 +697,7 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 
 			for(i=0 ; i<nb_handles ; i++)
 			{
-				_starpu_mpi_clear_data_after_execution(datas[i], codelet->modes[current_data], me, do_execute, comm);
+				_starpu_mpi_clear_data_after_execution(datas[i], STARPU_CODELET_GET_MODE(codelet, current_data), me, do_execute, comm);
 				current_data++;
 			}
 		}

+ 0 - 1
src/Makefile.am

@@ -73,7 +73,6 @@ noinst_HEADERS = 						\
 	core/debug.h						\
 	core/errorcheck.h					\
 	core/combined_workers.h					\
-	core/parallel_task.h					\
 	core/simgrid.h						\
 	core/task_bundle.h					\
 	sched_policies/detect_combined_workers.h		\

+ 0 - 10
src/common/thread.h

@@ -70,16 +70,6 @@
 	}                                                                      \
 } while (0)
 
-#define _STARPU_PTHREAD_MUTEX_TRYLOCK(mutex) do {                              \
-	int p_ret = starpu_pthread_mutex_trylock(mutex);                       \
-	if (STARPU_UNLIKELY(p_ret)) {                                          \
-		fprintf(stderr,                                                \
-			"%s:%d starpu_pthread_mutex_trylock: %s\n",            \
-			__FILE__, __LINE__, strerror(p_ret));                  \
-		STARPU_ABORT();                                                \
-	}                                                                      \
-} while (0)
-
 #define _STARPU_PTHREAD_MUTEX_UNLOCK(mutex) do {                               \
 	int p_ret = starpu_pthread_mutex_unlock(mutex);                        \
 	if (STARPU_UNLIKELY(p_ret)) {                                          \

+ 1 - 0
src/core/combined_workers.c

@@ -162,3 +162,4 @@ int starpu_combined_worker_get_description(int workerid, int *worker_size, int *
 
 	return 0;
 }
+

+ 18 - 11
src/core/dependencies/data_concurrency.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -192,8 +192,8 @@ static unsigned attempt_to_submit_data_request_from_job(struct _starpu_job *j, u
 {
 	/* Note that we do not access j->task->handles, but j->ordered_buffers
 	 * which is a sorted copy of it. */
-	starpu_data_handle_t handle = j->ordered_buffers[buffer_index].handle;
-	enum starpu_access_mode mode = j->ordered_buffers[buffer_index].mode;
+	starpu_data_handle_t handle = _STARPU_JOB_GET_ORDERED_BUFFER_HANDLE(j, buffer_index);
+	enum starpu_access_mode mode = _STARPU_JOB_GET_ORDERED_BUFFER_MODE(j, buffer_index);
 
 	return _starpu_attempt_to_submit_data_request(1, handle, mode, NULL, NULL, j, buffer_index);
 }
@@ -205,11 +205,16 @@ static unsigned _submit_job_enforce_data_deps(struct _starpu_job *j, unsigned st
 	unsigned nbuffers = j->task->cl->nbuffers;
 	for (buf = start_buffer_index; buf < nbuffers; buf++)
 	{
-		if (buf && j->ordered_buffers[buf-1].handle == j->ordered_buffers[buf].handle)
-			/* We have already requested this data, skip it. This
-			 * depends on ordering putting writes before reads, see
-			 * _starpu_compar_handles.  */
-			continue;
+		if (buf)
+		{
+			starpu_data_handle_t handle_m1 = _STARPU_JOB_GET_ORDERED_BUFFER_HANDLE(j, buf-1);
+			starpu_data_handle_t handle = _STARPU_JOB_GET_ORDERED_BUFFER_HANDLE(j, buf);
+			if (handle_m1 == handle)
+				/* We have already requested this data, skip it. This
+				 * depends on ordering putting writes before reads, see
+				 * _starpu_compar_handles.  */
+				continue;
+		}
 
                 j->task->status = STARPU_TASK_BLOCKED_ON_DATA;
                 if (attempt_to_submit_data_request_from_job(j, buf))
@@ -238,11 +243,13 @@ unsigned _starpu_submit_job_enforce_data_deps(struct _starpu_job *j)
 	unsigned i;
 	for (i=0 ; i<cl->nbuffers ; i++)
 	{
-		j->ordered_buffers[i].handle = j->task->handles[i];
-		j->ordered_buffers[i].mode = j->task->cl->modes[i];
+		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(j->task, i);
+		_STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(j, handle, i);
+		enum starpu_access_mode mode = STARPU_CODELET_GET_MODE(j->task->cl, i);
+		_STARPU_JOB_SET_ORDERED_BUFFER_MODE(j, mode, i);
 	}
 
-	_starpu_sort_task_handles(j->ordered_buffers, cl->nbuffers);
+	_starpu_sort_task_handles(_STARPU_JOB_GET_ORDERED_BUFFERS(j), cl->nbuffers);
 
 	return _submit_job_enforce_data_deps(j, 0);
 }

+ 3 - 3
src/core/dependencies/implicit_data_deps.c

@@ -336,8 +336,8 @@ void _starpu_detect_implicit_data_deps(struct starpu_task *task)
 	unsigned buffer;
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
-		starpu_data_handle_t handle = task->handles[buffer];
-		enum starpu_access_mode mode = task->cl->modes[buffer];
+		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, buffer);
+		enum starpu_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, buffer);
 		struct starpu_task *new_task;
 
 		/* Scratch memory does not introduce any deps */
@@ -457,7 +457,7 @@ void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *tas
 void _starpu_release_task_enforce_sequential_consistency(struct _starpu_job *j)
 {
 	struct starpu_task *task = j->task;
-        struct starpu_buffer_descr *descrs = j->ordered_buffers;
+        struct starpu_buffer_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
 
 	if (!task->cl)
 		return;

+ 14 - 3
src/core/jobs.c

@@ -52,6 +52,9 @@ struct _starpu_job* __attribute__((malloc)) _starpu_job_create(struct starpu_tas
 	 * everywhere */
 	memset(job, 0, sizeof(*job));
 
+	if (task->dyn_handles)
+	     job->dyn_ordered_buffers = malloc(task->cl->nbuffers * sizeof(struct starpu_buffer_descr));
+
 	job->task = task;
 
 #ifndef STARPU_USE_FXT
@@ -104,6 +107,11 @@ void _starpu_job_destroy(struct _starpu_job *j)
 	}
 
 	_starpu_cg_list_deinit(&j->job_successors);
+	if (j->dyn_ordered_buffers)
+	{
+	     free(j->dyn_ordered_buffers);
+	     j->dyn_ordered_buffers = NULL;
+	}
 
 	_starpu_job_delete(j);
 }
@@ -149,8 +157,11 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 	int i;
 	size_t data_size = 0;
 	for(i = 0; i < STARPU_NMAXBUFS; i++)
-		if(task->handles[i] != NULL)
-			data_size += _starpu_data_get_size(task->handles[i]);
+	{
+		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
+		if (handle != NULL)
+			data_size += _starpu_data_get_size(handle);
+	}
 #endif //STARPU_USE_SC_HYPERVISOR
 
 	/* We release handle reference count */
@@ -159,7 +170,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 		unsigned i;
 		for (i=0; i<task->cl->nbuffers; i++)
 		{
-			starpu_data_handle_t handle = task->handles[i];
+			starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
 			_starpu_spin_lock(&handle->header_lock);
 			handle->busy_count--;
 			if (!_starpu_data_check_not_busy(handle))

+ 10 - 0
src/core/jobs.h

@@ -70,6 +70,7 @@ LIST_TYPE(_starpu_job,
 	 * the task so that we always grab the rw-lock associated to the
 	 * handles in the same order. */
 	struct starpu_buffer_descr ordered_buffers[STARPU_NMAXBUFS];
+	struct starpu_buffer_descr *dyn_ordered_buffers;
 
 	/* If a tag is associated to the job, this points to the internal data
 	 * structure that describes the tag status. */
@@ -172,4 +173,13 @@ struct starpu_task *_starpu_pop_local_task(struct _starpu_worker *worker);
  * enforce a FIFO ordering. */
 int _starpu_push_local_task(struct _starpu_worker *worker, struct starpu_task *task, int back);
 
+/* Ordered-buffer accessors: use dyn_ordered_buffers when it is set, the fixed-size ordered_buffers array otherwise. Arguments may be evaluated more than once. */
+#define _STARPU_JOB_GET_ORDERED_BUFFER_HANDLE(job, i) (((job)->dyn_ordered_buffers) ? (job)->dyn_ordered_buffers[i].handle : (job)->ordered_buffers[i].handle)
+#define _STARPU_JOB_GET_ORDERED_BUFFER_MODE(job, i) (((job)->dyn_ordered_buffers) ? (job)->dyn_ordered_buffers[i].mode : (job)->ordered_buffers[i].mode)
+/* The value parameters must not be spelled "handle" or "mode": the preprocessor would also substitute the .handle/.mode member names. */
+#define _STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(job, new_handle, i) do { if ((job)->dyn_ordered_buffers) (job)->dyn_ordered_buffers[i].handle = (new_handle); else (job)->ordered_buffers[i].handle = (new_handle);} while(0)
+#define _STARPU_JOB_SET_ORDERED_BUFFER_MODE(job, new_mode, i) do { if ((job)->dyn_ordered_buffers) (job)->dyn_ordered_buffers[i].mode = (new_mode); else (job)->ordered_buffers[i].mode = (new_mode);} while(0)
+#define _STARPU_JOB_SET_ORDERED_BUFFER(job, buffer, i) do { if ((job)->dyn_ordered_buffers) (job)->dyn_ordered_buffers[i] = (buffer); else (job)->ordered_buffers[i] = (buffer);} while(0)
+#define _STARPU_JOB_GET_ORDERED_BUFFERS(job) (((job)->dyn_ordered_buffers) ? (job)->dyn_ordered_buffers : (job)->ordered_buffers)
+
 #endif // __JOBS_H__

+ 25 - 2
src/core/parallel_task.c

@@ -19,15 +19,38 @@
 #include <core/jobs.h>
 #include <core/task.h>
 #include <common/utils.h>
+#include <core/workers.h>
+#include <common/barrier.h>
 
-struct starpu_task *_starpu_create_task_alias(struct starpu_task *task)
+struct starpu_task *starpu_task_dup(struct starpu_task *task)
 {
 	struct starpu_task *task_dup = (struct starpu_task *) malloc(sizeof(struct starpu_task));
 	STARPU_ASSERT(task_dup);
 
-	/* XXX perhaps this is a bit too much overhead and we should only copy
+	/* TODO perhaps this is a bit too much overhead and we should only copy
 	 * part of the structure ? */
 	memcpy(task_dup, task, sizeof(struct starpu_task));
 
 	return task_dup;
 }
+
+/* Prepare the job attached to task for execution by the combined worker
+ * workerid: record the combined worker on the job and initialise the two
+ * barriers with one participant per worker of that combined worker. */
+void starpu_parallel_task_barrier_init(struct starpu_task* task, int workerid)
+{
+	/* Number of workers making up the combined worker */
+	struct _starpu_combined_worker *cw = _starpu_get_combined_worker_struct(workerid);
+	int nparticipants = cw->worker_size;
+
+	/* Record on the job which combined worker runs it and with how many
+	 * workers; no task alias is active yet */
+	struct _starpu_job *job = _starpu_get_job_associated_to_task(task);
+	job->combined_workerid = workerid;
+	job->task_size = nparticipants;
+	job->active_task_alias_count = 0;
+
+	/* One barrier entered before the work, one after it */
+	_STARPU_PTHREAD_BARRIER_INIT(&job->before_work_barrier, NULL, nparticipants);
+	_STARPU_PTHREAD_BARRIER_INIT(&job->after_work_barrier, NULL, nparticipants);
+}
+

+ 0 - 24
src/core/parallel_task.h

@@ -1,24 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010  Université de Bordeaux 1
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#ifndef __PARALLEL_TASK_H__
-#define __PARALLEL_TASK_H__
-
-#include <starpu.h>
-
-struct starpu_task *_starpu_create_task_alias(struct starpu_task *task);
-
-#endif /* __PARALLEL_TASK_H__ */

+ 5 - 5
src/core/perfmodel/perfmodel.c

@@ -227,7 +227,7 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 		starpu_data_handle_t handle;
 		struct starpu_task *conversion_task;
 
-		handle = task->handles[i];
+		handle = STARPU_TASK_GET_HANDLE(task, i);
 		if (!_starpu_data_is_multiformat_handle(handle))
 			continue;
 
@@ -287,8 +287,8 @@ double starpu_task_expected_data_transfer_time(unsigned memory_node, struct star
 
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
-		starpu_data_handle_t handle = task->handles[buffer];
-		enum starpu_access_mode mode = task->cl->modes[buffer];
+		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, buffer);
+		enum starpu_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, buffer);
 
 		penalty += starpu_data_expected_transfer_time(handle, memory_node, mode);
 	}
@@ -375,8 +375,8 @@ double starpu_task_bundle_expected_data_transfer_time(starpu_task_bundle_t bundl
 			unsigned b;
 			for (b = 0; b < task->cl->nbuffers; b++)
 			{
-				starpu_data_handle_t handle = task->handles[b];
-				enum starpu_access_mode mode = task->cl->modes[b];
+				starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, b);
+				enum starpu_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, b);
 
 				if (!(mode & STARPU_R))
 					continue;

+ 2 - 2
src/core/perfmodel/perfmodel_history.c

@@ -72,7 +72,7 @@ size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, enum starpu_per
 		unsigned buffer;
 		for (buffer = 0; buffer < nbuffers; buffer++)
 		{
-			starpu_data_handle_t handle = task->handles[buffer];
+			starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, buffer);
 			size += _starpu_data_get_size(handle);
 		}
 		return size;
@@ -1267,7 +1267,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
 		for (i = 0; i < task->cl->nbuffers; i++)
 		{
-			starpu_data_handle_t handle = task->handles[i];
+			starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
 
 			STARPU_ASSERT(handle->ops);
 			STARPU_ASSERT(handle->ops->display);

+ 1 - 1
src/core/sched_ctx.c

@@ -885,7 +885,7 @@ int starpu_get_workers_of_sched_ctx(unsigned sched_ctx_id, int *pus, enum starpu
 	return npus;
 }
 
-starpu_pthread_mutex_t* starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id)
+starpu_pthread_mutex_t* _starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id)
 {
 	return &changing_ctx_mutex[sched_ctx_id];
 }

+ 3 - 0
src/core/sched_ctx.h

@@ -144,6 +144,9 @@ void _starpu_worker_gets_out_of_ctx(unsigned sched_ctx_id, struct _starpu_worker
 /* Check if the worker belongs to another sched_ctx */
 unsigned _starpu_worker_belongs_to_a_sched_ctx(int workerid, unsigned sched_ctx_id);
 
+/* mutex synchronising several simultaneous modifications of a context */
+starpu_pthread_mutex_t* _starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id);
+
 #ifdef STARPU_USE_SC_HYPERVISOR
 /* Notifies the hypervisor that a tasks was poped from the workers' list */
 void _starpu_sched_ctx_call_poped_task_cb(int workerid, struct starpu_task *task, size_t data_size, uint32_t footprint);

+ 16 - 8
src/core/sched_policy.c

@@ -23,7 +23,6 @@
 #include <profiling/profiling.h>
 #include <common/barrier.h>
 #include <core/debug.h>
-#include <core/parallel_task.h>
 
 static int use_prefetch = 0;
 
@@ -236,7 +235,7 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 				struct starpu_task *conversion_task;
 				starpu_data_handle_t handle;
 
-				handle = task->handles[i];
+				handle = STARPU_TASK_GET_HANDLE(task, i);
 				if (!_starpu_handle_needs_conversion_task(handle, node))
 					continue;
 
@@ -249,7 +248,10 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 			}
 
 			for (i = 0; i < task->cl->nbuffers; i++)
-				task->handles[i]->mf_node = node;
+			{
+				starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
+				handle->mf_node = node;
+			}
 		}
 //		if(task->sched_ctx != _starpu_get_initial_sched_ctx()->id)
 
@@ -281,7 +283,7 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 		int j;
 		for (j = 0; j < worker_size; j++)
 		{
-			struct starpu_task *alias = _starpu_create_task_alias(task);
+			struct starpu_task *alias = starpu_task_dup(task);
 
 			worker = _starpu_get_worker_struct(combined_workerid[j]);
 			ret |= _starpu_push_local_task(worker, alias, 0);
@@ -396,7 +398,13 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 	else
 	{
 		STARPU_ASSERT(sched_ctx->sched_policy->push_task);
-		ret = sched_ctx->sched_policy->push_task(task);
+		/* check out if there are any workers in the context */
+		starpu_pthread_mutex_t *changing_ctx_mutex = _starpu_sched_ctx_get_changing_ctx_mutex(sched_ctx->id);
+		_STARPU_PTHREAD_MUTEX_LOCK(changing_ctx_mutex);
+		nworkers = starpu_sched_ctx_get_nworkers(sched_ctx->id);
+		ret = nworkers == 0 ? -1 : sched_ctx->sched_policy->push_task(task);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
+
 		if(ret == -1)
 		{
 			fprintf(stderr, "repush task \n");
@@ -441,7 +449,7 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 
 	conversion_task = starpu_task_create();
 	conversion_task->synchronous = 0;
-	conversion_task->handles[0] = handle;
+	STARPU_TASK_SET_HANDLE(conversion_task, handle, 0);
 
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 	/* The node does not really matter here */
@@ -504,7 +512,7 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 		STARPU_ABORT();
 	}
 
-	conversion_task->cl->modes[0] = STARPU_RW;
+	STARPU_CODELET_SET_MODE(conversion_task->cl, STARPU_RW, 0);
 	return conversion_task;
 }
 
@@ -657,7 +665,7 @@ pick:
 		struct starpu_task *conversion_task;
 		starpu_data_handle_t handle;
 
-		handle = task->handles[i];
+		handle = STARPU_TASK_GET_HANDLE(task, i);
 		if (!_starpu_handle_needs_conversion_task(handle, node))
 			continue;
 		conversion_task = _starpu_create_conversion_task(handle, node);

+ 35 - 11
src/core/task.c

@@ -77,6 +77,11 @@ void starpu_task_init(struct starpu_task *task)
 	task->sched_ctx = _starpu_get_initial_sched_ctx()->id;
 
 	task->flops = 0.0;
+
+	task->scheduled = 0;
+
+	task->dyn_handles = NULL;
+	task->dyn_interfaces = NULL;
 }
 
 /* Free all the ressources allocated for a task, without deallocating the task
@@ -99,6 +104,14 @@ void starpu_task_clean(struct starpu_task *task)
 	if (bundle)
 		starpu_task_bundle_remove(bundle, task);
 
+	if (task->dyn_handles)
+	{
+		free(task->dyn_handles);
+		task->dyn_handles = NULL;
+		free(task->dyn_interfaces);
+		task->dyn_interfaces = NULL;
+	}
+
 	struct _starpu_job *j = (struct _starpu_job *)task->starpu_private;
 
 	if (j)
@@ -229,7 +242,7 @@ int _starpu_submit_job(struct _starpu_job *j)
 		unsigned i;
 		for (i=0; i<task->cl->nbuffers; i++)
 		{
-			starpu_data_handle_t handle = task->handles[i];
+			starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
 			_starpu_spin_lock(&handle->header_lock);
 			handle->busy_count++;
 			_starpu_spin_unlock(&handle->header_lock);
@@ -393,16 +406,23 @@ int starpu_task_submit(struct starpu_task *task)
 		unsigned i;
 
 		/* Check buffers */
-		STARPU_ASSERT_MSG(task->cl->nbuffers <= STARPU_NMAXBUFS, "Codelet %p has too many buffers (%d vs max %d)", task->cl, task->cl->nbuffers, STARPU_NMAXBUFS);
+		if (task->dyn_handles == NULL)
+			STARPU_ASSERT_MSG(task->cl->nbuffers <= STARPU_NMAXBUFS, "Codelet %p has too many buffers (%d vs max %d)", task->cl, task->cl->nbuffers, STARPU_NMAXBUFS);
+
+		if (task->dyn_handles)
+		{
+			task->dyn_interfaces = malloc(task->cl->nbuffers * sizeof(void *));
+		}
+
 		for (i = 0; i < task->cl->nbuffers; i++)
 		{
-			starpu_data_handle_t handle = task->handles[i];
+			starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
 			/* Make sure handles are not partitioned */
 			STARPU_ASSERT_MSG(handle->nchildren == 0, "only unpartitioned data can be used in a task");
 			/* Provide the home interface for now if any,
 			 * for can_execute hooks */
 			if (handle->home_node != -1)
-				task->interfaces[i] = starpu_data_get_interface_on_node(task->handles[i], handle->home_node);
+				_STARPU_TASK_SET_INTERFACE(task, starpu_data_get_interface_on_node(handle, handle->home_node), i);
 		}
 
 		/* Check the type of worker(s) required by the task exist */
@@ -526,8 +546,10 @@ int _starpu_task_submit_nodeps(struct starpu_task *task)
 		unsigned i;
 		for (i=0 ; i<task->cl->nbuffers ; i++)
 		{
-			j->ordered_buffers[i].handle = j->task->handles[i];
-			j->ordered_buffers[i].mode = j->task->cl->modes[i];
+			starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(j->task, i);
+			_STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(j, handle, i);
+			enum starpu_access_mode mode = STARPU_CODELET_GET_MODE(j->task->cl, i);
+			_STARPU_JOB_SET_ORDERED_BUFFER_MODE(j, mode, i);
 		}
 	}
 
@@ -559,7 +581,7 @@ int _starpu_task_submit_conversion_task(struct starpu_task *task,
 	unsigned i;
 	for (i=0; i<task->cl->nbuffers; i++)
 	{
-		starpu_data_handle_t handle = task->handles[i];
+		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
 		_starpu_spin_lock(&handle->header_lock);
 		handle->busy_count++;
 		_starpu_spin_unlock(&handle->header_lock);
@@ -574,8 +596,10 @@ int _starpu_task_submit_conversion_task(struct starpu_task *task,
 
 	for (i=0 ; i<task->cl->nbuffers ; i++)
 	{
-		j->ordered_buffers[i].handle = j->task->handles[i];
-		j->ordered_buffers[i].mode = j->task->cl->modes[i];
+		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(j->task, i);
+		_STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(j, handle, i);
+		enum starpu_access_mode mode = STARPU_CODELET_GET_MODE(j->task->cl, i);
+		_STARPU_JOB_SET_ORDERED_BUFFER_MODE(j, mode, i);
 	}
 
         _STARPU_LOG_IN();
@@ -604,7 +628,7 @@ void starpu_codelet_init(struct starpu_codelet *cl)
 	memset(cl, 0, sizeof(struct starpu_codelet));
 }
 
-void starpu_display_codelet_stats(struct starpu_codelet *cl)
+void starpu_codelet_display_stats(struct starpu_codelet *cl)
 {
 	unsigned worker;
 	unsigned nworkers = starpu_worker_get_count();
@@ -811,7 +835,7 @@ _starpu_task_uses_multiformat_handles(struct starpu_task *task)
 	unsigned i;
 	for (i = 0; i < task->cl->nbuffers; i++)
 	{
-		if (_starpu_data_is_multiformat_handle(task->handles[i]))
+		if (_starpu_data_is_multiformat_handle(STARPU_TASK_GET_HANDLE(task, i)))
 			return 1;
 	}
 

+ 3 - 0
src/core/task.h

@@ -73,4 +73,7 @@ starpu_cpu_func_t _starpu_task_get_cpu_nth_implementation(struct starpu_codelet
 starpu_cuda_func_t _starpu_task_get_cuda_nth_implementation(struct starpu_codelet *cl, unsigned nimpl);
 starpu_opencl_func_t _starpu_task_get_opencl_nth_implementation(struct starpu_codelet *cl, unsigned nimpl);
 
+/* Interface accessors. The choice is keyed on dyn_handles (not dyn_interfaces), so dyn_interfaces must already be allocated whenever dyn_handles is set. Arguments may be evaluated more than once. */
+#define _STARPU_TASK_SET_INTERFACE(task, interface, i) do { if ((task)->dyn_handles) (task)->dyn_interfaces[i] = (interface); else (task)->interfaces[i] = (interface);} while(0)
+#define _STARPU_TASK_GET_INTERFACES(task) (((task)->dyn_handles) ? (task)->dyn_interfaces : (task)->interfaces)
+
 #endif // __CORE_TASK_H__

+ 7 - 0
src/core/workers.c

@@ -1437,3 +1437,10 @@ starpu_driver_deinit(struct starpu_driver *d)
 		return -EINVAL;
 	}
 }
+
+/* Store the StarPU version numbers taken from the STARPU_MAJOR_VERSION,
+ * STARPU_MINOR_VERSION and STARPU_RELEASE_VERSION macros into *major, *minor
+ * and *release. A NULL pointer may be passed for any component the caller
+ * does not need. */
+void starpu_get_version(int *major, int *minor, int *release)
+{
+	if (major)
+		*major = STARPU_MAJOR_VERSION;
+	if (minor)
+		*minor = STARPU_MINOR_VERSION;
+	if (release)
+		*release = STARPU_RELEASE_VERSION;
+}

+ 8 - 7
src/datawizard/coherency.c

@@ -22,6 +22,7 @@
 #include <core/dependencies/data_concurrency.h>
 #include <profiling/profiling.h>
 #include <math.h>
+#include <core/task.h>
 
 static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned src_node, unsigned dst_node, unsigned *handling_node);
 unsigned _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
@@ -591,8 +592,8 @@ int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
 
 	for (index = 0; index < nbuffers; index++)
 	{
-		starpu_data_handle_t handle = task->handles[index];
-		enum starpu_access_mode mode = task->cl->modes[index];
+		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, index);
+		enum starpu_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, index);
 
 		if (mode & (STARPU_SCRATCH|STARPU_REDUX))
 			continue;
@@ -624,7 +625,7 @@ int _starpu_fetch_task_input(struct _starpu_job *j, uint32_t mask)
 	if (profiling && task->profiling_info)
 		_starpu_clock_gettime(&task->profiling_info->acquire_data_start_time);
 
-	struct starpu_buffer_descr *descrs = j->ordered_buffers;
+	struct starpu_buffer_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
 	unsigned nbuffers = task->cl->nbuffers;
 
 	unsigned local_memory_node = _starpu_memory_node_get_local_key();
@@ -656,14 +657,14 @@ int _starpu_fetch_task_input(struct _starpu_job *j, uint32_t mask)
 	/* Now that we have taken the data locks in locking order, fill the codelet interfaces in function order.  */
 	for (index = 0; index < nbuffers; index++)
 	{
-		starpu_data_handle_t handle = task->handles[index];
-		enum starpu_access_mode mode = task->cl->modes[index];
+		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, index);
+		enum starpu_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, index);
 
 		struct _starpu_data_replicate *local_replicate;
 
 		local_replicate = get_replicate(handle, mode, workerid, local_memory_node);
 
-		task->interfaces[index] = local_replicate->data_interface;
+		_STARPU_TASK_SET_INTERFACE(task , local_replicate->data_interface, index);
 
 		if (mode & STARPU_REDUX)
 		{
@@ -699,7 +700,7 @@ void _starpu_push_task_output(struct _starpu_job *j, uint32_t mask)
 	if (profiling && task->profiling_info)
 		_starpu_clock_gettime(&task->profiling_info->release_data_start_time);
 
-        struct starpu_buffer_descr *descrs = j->ordered_buffers;
+        struct starpu_buffer_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
         unsigned nbuffers = task->cl->nbuffers;
 
 	int workerid = starpu_worker_get_id();

+ 1 - 1
src/datawizard/filters.c

@@ -305,7 +305,7 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 				.nbuffers = 1
 			};
 			struct starpu_task *task = starpu_task_create();
-			task->handles[0] = child_handle;
+			STARPU_TASK_SET_HANDLE(task, child_handle, 0);
 			task->cl = &cl;
 			task->synchronous = 1;
 			if (_starpu_task_submit_internally(task) != 0)

+ 1 - 1
src/datawizard/footprint.c

@@ -43,7 +43,7 @@ uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, enum
 	{
 		for (buffer = 0; buffer < task->cl->nbuffers; buffer++)
 		{
-			starpu_data_handle_t handle = task->handles[buffer];
+			starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, buffer);
 
 			uint32_t handle_footprint = _starpu_data_get_footprint(handle);
 

+ 16 - 14
src/datawizard/reduction.c

@@ -217,16 +217,16 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 
 					redux_task->cl = handle->redux_cl;
 					STARPU_ASSERT(redux_task->cl);
-					if (!redux_task->cl->modes[0])
-						redux_task->cl->modes[0] = STARPU_RW;
-					if (!redux_task->cl->modes[1])
-						redux_task->cl->modes[1] = STARPU_R;
+					if (!(STARPU_CODELET_GET_MODE(redux_task->cl, 0)))
+						STARPU_CODELET_SET_MODE(redux_task->cl, STARPU_RW, 0);
+					if (!(STARPU_CODELET_GET_MODE(redux_task->cl, 1)))
+						STARPU_CODELET_SET_MODE(redux_task->cl, STARPU_R, 1);
 
-					STARPU_ASSERT_MSG(redux_task->cl->modes[0] == STARPU_RW, "First parameter of reduction codelet has to be RW");
-					STARPU_ASSERT_MSG(redux_task->cl->modes[1] == STARPU_R, "Second parameter of reduction codelet has to be R");
+					STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(redux_task->cl, 0) == STARPU_RW, "First parameter of reduction codelet has to be RW");
+					STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(redux_task->cl, 1) == STARPU_R, "Second parameter of reduction codelet has to be R");
 
-					redux_task->handles[0] = replicate_array[i];
-					redux_task->handles[1] = replicate_array[i+step];
+					STARPU_TASK_SET_HANDLE(redux_task, replicate_array[i], 0);
+					STARPU_TASK_SET_HANDLE(redux_task, replicate_array[i+step], 1);
 
 					int ndeps = 0;
 					struct starpu_task *task_deps[2];
@@ -278,10 +278,12 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 
 			redux_task->cl = handle->init_cl;
 			STARPU_ASSERT(redux_task->cl);
-			if (!redux_task->cl->modes[0])
-				redux_task->cl->modes[0] = STARPU_W;
-			STARPU_ASSERT_MSG(redux_task->cl->modes[0] == STARPU_W, "Parameter of initialization codelet has to be W");
-			redux_task->handles[0] = handle;
+
+			if (!(STARPU_CODELET_GET_MODE(redux_task->cl, 0)))
+				STARPU_CODELET_SET_MODE(redux_task->cl, STARPU_W, 0);
+			STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(redux_task->cl, 0) == STARPU_W, "Parameter of initialization codelet has to be W");
+
+			STARPU_TASK_SET_HANDLE(redux_task, handle, 0);
 
 			int ret = _starpu_task_submit_internally(redux_task);
 			STARPU_ASSERT(!ret);
@@ -311,8 +313,8 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 			STARPU_ASSERT_MSG(redux_task->cl->modes[0] == STARPU_RW, "First parameter of reduction codelet has to be RW");
 			STARPU_ASSERT_MSG(redux_task->cl->modes[1] == STARPU_R, "Second parameter of reduction codelet has to be R");
 
-			redux_task->handles[0] = handle;
-			redux_task->handles[1] = replicate_array[replicate];
+			STARPU_TASK_SET_HANDLE(redux_task, handle, 0);
+			STARPU_TASK_SET_HANDLE(redux_task, replicate_array[replicate], 1);
 
 			int ret = _starpu_task_submit_internally(redux_task);
 			STARPU_ASSERT(!ret);

+ 23 - 11
src/debug/traces/starpu_fxt.c

@@ -197,6 +197,12 @@ static char *memnode_container_alias(char *output, int len, const char *prefix,
 	return output;
 }
 
+static char *memmanager_container_alias(char *output, int len, const char *prefix, long unsigned int memnodeid)
+{	/* Format the alias "<prefix>mm<memnodeid>" into output (bounded by len) and return output. */
+	snprintf(output, len, "%smm%lu", prefix, memnodeid); /* %lu matches long unsigned int; PRIu64 is only correct for uint64_t */
+	return output;
+}
+
 static char *thread_container_alias(char *output, int len, const char *prefix, long unsigned int threadid)
 {
 	snprintf(output, len, "%st%"PRIu64"", prefix, threadid);
@@ -232,10 +238,10 @@ static void memnode_set_state(double time, const char *prefix, unsigned int memn
 {
 #ifdef STARPU_HAVE_POTI
 	char container[STARPU_POTI_STR_LEN];
-	memnode_container_alias(container, STARPU_POTI_STR_LEN, prefix, memnodeid);
+	memmanager_container_alias(container, STARPU_POTI_STR_LEN, prefix, memnodeid);
 	poti_SetState(time, container, "MS", name);
 #else
-	fprintf(out_paje_file, "10	%.9f	%smn%u	MS	%s\n", time, prefix, memnodeid, name);
+	fprintf(out_paje_file, "10	%.9f	%smm%u	MS	%s\n", time, prefix, memnodeid, name);
 #endif
 }
 
@@ -280,15 +286,21 @@ static void handle_new_mem_node(struct fxt_ev_64 *ev, struct starpu_fxt_options
 		/* TODO: ramkind */
 		snprintf(new_memnode_container_name, STARPU_POTI_STR_LEN, "%sMEMNODE%"PRIu64"", prefix, ev->param[0]);
 		poti_CreateContainer(get_event_time_stamp(ev, options), new_memnode_container_alias, "Mn", program_container, new_memnode_container_name);
+
+		memmanager_container_alias (new_memnode_container_alias, STARPU_POTI_STR_LEN, prefix, ev->param[0]);
+		/* TODO: ramkind */
+		snprintf(new_memnode_container_name, STARPU_POTI_STR_LEN, "%sMEMMANAGER%"PRIu64"", prefix, ev->param[0]);
+		poti_CreateContainer(get_event_time_stamp(ev, options), new_memnode_container_alias, "Mm", program_container, new_memnode_container_name);
 #else
 		fprintf(out_paje_file, "7	%.9f	%smn%"PRIu64"	Mn	%sp	%sMEMNODE%"PRIu64"\n", get_event_time_stamp(ev, options), prefix, ev->param[0], prefix, options->file_prefix, ev->param[0]);
+		fprintf(out_paje_file, "7	%.9f	%smm%"PRIu64"	Mm	%sp	%sMEMMANAGER%"PRIu64"\n", get_event_time_stamp(ev, options), prefix, ev->param[0], prefix, options->file_prefix, ev->param[0]);
 #endif
 
 		if (!options->no_bus)
 #ifdef STARPU_HAVE_POTI
 			poti_SetVariable(get_event_time_stamp(ev, options), new_memnode_container_alias, "bw", 0.0);
 #else
-			fprintf(out_paje_file, "13	%.9f	%smn%"PRIu64"	bw	0.0\n", 0.0f, prefix, ev->param[0]);
+			fprintf(out_paje_file, "13	%.9f	%smm%"PRIu64"	bw	0.0\n", 0.0f, prefix, ev->param[0]);
 #endif
 	}
 }
@@ -703,10 +715,10 @@ static void handle_start_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 			snprintf(paje_value, STARPU_POTI_STR_LEN, "%u", size);
 			snprintf(paje_key, STARPU_POTI_STR_LEN, "com_%u", comid);
 			program_container_alias(program_container, STARPU_POTI_STR_LEN, prefix);
-			memnode_container_alias(src_memnode_container, STARPU_POTI_STR_LEN, prefix, src);
+			memmanager_container_alias(src_memnode_container, STARPU_POTI_STR_LEN, prefix, src);
 			poti_StartLink(time, program_container, "L", src_memnode_container, paje_value, paje_key);
 #else
-			fprintf(out_paje_file, "18	%.9f	L	%sp	%u	%smn%u	com_%u\n", time, prefix, size, prefix, src, comid);
+			fprintf(out_paje_file, "18	%.9f	L	%sp	%u	%smm%u	com_%u\n", time, prefix, size, prefix, src, comid);
 #endif
 		}
 
@@ -743,10 +755,10 @@ static void handle_end_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 			snprintf(paje_value, STARPU_POTI_STR_LEN, "%u", size);
 			snprintf(paje_key, STARPU_POTI_STR_LEN, "com_%u", comid);
 			program_container_alias(program_container, STARPU_POTI_STR_LEN, prefix);
-			memnode_container_alias(dst_memnode_container, STARPU_POTI_STR_LEN, prefix, dst);
+			memmanager_container_alias(dst_memnode_container, STARPU_POTI_STR_LEN, prefix, dst);
 			poti_EndLink(time, program_container, "L", dst_memnode_container, paje_value, paje_key);
 #else
-			fprintf(out_paje_file, "19	%.9f	L	%sp	%u	%smn%u	com_%u\n", time, prefix, size, prefix, dst, comid);
+			fprintf(out_paje_file, "19	%.9f	L	%sp	%u	%smm%u	com_%u\n", time, prefix, size, prefix, dst, comid);
 #endif
 		}
 
@@ -1187,10 +1199,10 @@ void _starpu_fxt_display_bandwidth(struct starpu_fxt_options *options)
 		{
 #ifdef STARPU_HAVE_POTI
 			char src_memnode_container[STARPU_POTI_STR_LEN];
-			memnode_container_alias(src_memnode_container, STARPU_POTI_STR_LEN, prefix, itor->src_node);
+			memmanager_container_alias(src_memnode_container, STARPU_POTI_STR_LEN, prefix, itor->src_node);
 			poti_SetVariable(itor->comm_start, src_memnode_container, "bw", current_bandwidth_per_node[itor->src_node]);
 #else
-			fprintf(out_paje_file, "13	%.9f	%smn%u	bw	%f\n",
+			fprintf(out_paje_file, "13	%.9f	%smm%u	bw	%f\n",
 				itor->comm_start, prefix, itor->src_node, current_bandwidth_per_node[itor->src_node]);
 #endif
 		}
@@ -1200,10 +1212,10 @@ void _starpu_fxt_display_bandwidth(struct starpu_fxt_options *options)
 		{
 #ifdef STARPU_HAVE_POTI
 			char dst_memnode_container[STARPU_POTI_STR_LEN];
-			memnode_container_alias(dst_memnode_container, STARPU_POTI_STR_LEN, prefix, itor->dst_node);
+			memmanager_container_alias(dst_memnode_container, STARPU_POTI_STR_LEN, prefix, itor->dst_node);
 			poti_SetVariable(itor->comm_start, dst_memnode_container, "bw", current_bandwidth_per_node[itor->dst_node]);
 #else
-			fprintf(out_paje_file, "13	%.9f	%smn%u	bw	%f\n",
+			fprintf(out_paje_file, "13	%.9f	%smm%u	bw	%f\n",
 				itor->comm_start, prefix, itor->dst_node, current_bandwidth_per_node[itor->dst_node]);
 #endif
 		}

+ 8 - 6
src/debug/traces/starpu_paje.c

@@ -137,13 +137,14 @@ void _starpu_fxt_write_paje_header(FILE *file)
 	poti_DefineContainerType("P", "MPIP", "Program");
 	poti_DefineContainerType("Mn", "P", "Memory Node");
 	poti_DefineContainerType("T", "Mn", "Thread");
+	poti_DefineContainerType("Mm", "Mn", "Memory Manager");
 	poti_DefineContainerType("W", "T", "Worker");
 	poti_DefineContainerType("MPICt", "T", "MPI Communication Thread");
 	poti_DefineContainerType("Sc", "P", "Scheduler");
 
 	/* Types for the memory node */
-	poti_DefineVariableType("bw", "Mn", "Bandwidth", "0 0 0");
-	poti_DefineStateType("MS", "Mn", "Memory Node State");
+	poti_DefineVariableType("bw", "Mm", "Bandwidth", "0 0 0");
+	poti_DefineStateType("MS", "Mm", "Memory Node State");
 	poti_DefineEntityValue("A", "MS", "Allocating", ".4 .1 .0");
 	poti_DefineEntityValue("Ar", "MS", "AllocatingReuse", ".1 .1 .8");
 	poti_DefineEntityValue("R", "MS", "Reclaiming", ".0 .1 .4");
@@ -196,7 +197,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 
 	/* Link types */
 	poti_DefineLinkType("MPIL", "P", "MPICt", "MPICt", "Links between two MPI Communication Threads");
-	poti_DefineLinkType("L", "P", "Mn", "Mn", "Links between two Memory Nodes");
+	poti_DefineLinkType("L", "P", "Mm", "Mm", "Links between two Memory Managers");
 
 	/* Creating the MPI Program */
 	poti_CreateContainer(0, "MPIroot", "MPIP", "0", "root");
@@ -206,6 +207,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 1       P      MPIP       \"Program\"                      	\n\
 1       Mn      P       \"Memory Node\"                         \n\
 1       T      Mn       \"Thread\"                               \n\
+1       Mm      Mn       \"Memory Manager\"                         \n\
 1       W      T       \"Worker\"                               \n\
 1       MPICt   T       \"MPI Communication Thread\"              \n\
 1       Sc       P       \"Scheduler State\"                        \n\
@@ -216,9 +218,9 @@ void _starpu_fxt_write_paje_header(FILE *file)
 	for (i=1; i<=10; i++)
 		fprintf(file, "3       Ctx%u      T     \"InCtx%u\"         		\n", i, i);
 	fprintf(file, "\
-3       MS       Mn       \"Memory Node State\"                        \n\
+3       MS       Mm       \"Memory Node State\"                        \n\
 4       ntask    Sc       \"Number of tasks\"                        \n\
-4       bw      Mn       \"Bandwidth\"                        \n\
+4       bw      Mm       \"Bandwidth\"                        \n\
 6       I       S      Initializing       \"0.0 .7 1.0\"            \n\
 6       D       S      Deinitializing       \"0.0 .1 .7\"            \n\
 6       Fi       S      FetchingInput       \"1.0 .1 1.0\"            \n\
@@ -255,7 +257,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 6       CoA      MS     DriverCopyAsync         \".1 .3 .1\"		\n\
 6       No       MS     Nothing         \".0 .0 .0\"		\n\
 5       MPIL     P	MPICt	MPICt   MPIL			\n\
-5       L       P	Mn	Mn      L\n");
+5       L       P	Mm	Mm      L\n");
 
 	fprintf(file, "7      0.0 MPIroot      MPIP      0       root\n");
 #endif

+ 1 - 1
src/drivers/cpu/driver_cpu.c

@@ -158,7 +158,7 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 #ifdef STARPU_SIMGRID
 		_starpu_simgrid_execute_job(j, perf_arch, NAN);
 #else
-		func(task->interfaces, task->cl_arg);
+		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
 #endif
 		if (is_parallel_task && cl->type == STARPU_FORKJOIN)
 			/* rebind to single CPU */

+ 1 - 1
src/drivers/cuda/driver_cuda.c

@@ -353,7 +353,7 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 #ifdef STARPU_SIMGRID
 	_starpu_simgrid_execute_job(j, args->perf_arch, NAN);
 #else
-	func(task->interfaces, task->cl_arg);
+	func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
 #endif
 
 	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0, profiling);

+ 3 - 3
src/drivers/gordon/driver_gordon.c

@@ -102,7 +102,7 @@ static void starpu_to_gordon_buffers(struct _starpu_job *j, struct gordon_ppu_jo
 	unsigned nbuffers = cl->nbuffers;
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
-		enum starpu_access_mode mode = cl->modes[buffer];
+		enum starpu_access_mode mode = STARPU_CODELET_GET_MODE(cl, buffer);
 
 		switch (mode)
 		{
@@ -122,7 +122,7 @@ static void starpu_to_gordon_buffers(struct _starpu_job *j, struct gordon_ppu_jo
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
 		unsigned gordon_buffer;
-		enum starpu_access_mode mode = cl->modes[buffer];
+		enum starpu_access_mode mode = STARPU_CODELET_GET_MODE(cl, buffer);
 
 		switch (mode)
 		{
@@ -138,7 +138,7 @@ static void starpu_to_gordon_buffers(struct _starpu_job *j, struct gordon_ppu_jo
 				break;
 		}
 
-		starpu_data_handle_t handle = task->handles[buffer];
+		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, buffer);
 
 		gordon_job->nalloc = 0;
 		gordon_job->nin = nin;

+ 2 - 2
src/drivers/opencl/driver_opencl.c

@@ -824,7 +824,7 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 #ifdef STARPU_SIMGRID
 	double length = NAN;
   #ifdef STARPU_OPENCL_SIMULATOR
-	func(task->interfaces, task->cl_arg);
+	func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
     #ifndef CL_PROFILING_CLOCK_CYCLE_COUNT
       #ifdef CL_PROFILING_COMMAND_SHAVE_CYCLE_COUNT
         #define CL_PROFILING_CLOCK_CYCLE_COUNT CL_PROFILING_COMMAND_SHAVE_CYCLE_COUNT
@@ -838,7 +838,7 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
   #endif
 	_starpu_simgrid_execute_job(j, args->perf_arch, length);
 #else
-	func(task->interfaces, task->cl_arg);
+	func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
 #endif
 
 	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0, profiling);

+ 21 - 15
src/profiling/bound.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -501,10 +501,16 @@ void starpu_bound_print_lp(FILE *output)
 		}
 		fprintf(output, "/* StarPU upper bound linear programming problem, to be run in lp_solve. */\n\n");
 		fprintf(output, "/* !! This is a big system, it will be long to solve !! */\n\n");
+
 		fprintf(output, "/* We want to minimize total execution time (ms) */\n");
 		fprintf(output, "min: tmax;\n\n");
 
-		fprintf(output, "/* Which is the maximum of all task completion times (ms) */\n");
+		fprintf(output, "/* Number of tasks */\n");
+		fprintf(output, "nt = %d;\n", nt);
+		fprintf(output, "/* Number of workers */\n");
+		fprintf(output, "nw = %d;\n", nw);
+
+		fprintf(output, "/* The total execution time is the maximum of all task completion times (ms) */\n");
 		for (t1 = tasks; t1; t1 = t1->next)
 			fprintf(output, "c%lu <= tmax;\n", t1->id);
 
@@ -836,12 +842,12 @@ void starpu_bound_print_mps(FILE *output)
 
 		fprintf(output, "NAME           StarPU theoretical bound\n");
 
-		fprintf(output, "\nROWS\n");
+		fprintf(output, "*\nROWS\n");
 
 		fprintf(output, "* We want to minimize total execution time (ms)\n");
 		fprintf(output, " N  TMAX\n");
 
-		fprintf(output, "\n* Which is the maximum of all worker execution times (ms)\n");
+		fprintf(output, "* Which is the maximum of all worker execution times (ms)\n");
 		for (w = 0; w < nw; w++)
 		{
 			char name[32];
@@ -850,36 +856,36 @@ void starpu_bound_print_mps(FILE *output)
 			fprintf(output, " L  W%d\n", w);
 		}
 
-		fprintf(output, "\n* And we have to have computed exactly all tasks\n");
+		fprintf(output, "*\n* And we have to have computed exactly all tasks\n*\n");
 		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
 		{
 			fprintf(output, "* task %s key %x\n", _starpu_codelet_get_model_name(tp->cl), (unsigned) tp->footprint);
 			fprintf(output, " E  T%d\n", t);
 		}
 
-		fprintf(output, "\nCOLUMNS\n");
+		fprintf(output, "*\nCOLUMNS\n*\n");
 
-		fprintf(output, "\n* Execution times and completion of all tasks\n");
+		fprintf(output, "*\n* Execution times and completion of all tasks\n*\n");
 		for (w = 0; w < nw; w++)
 			for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
 				if (!isnan(times[w*nt+t]))
 				{
 					char name[9];
 					snprintf(name, sizeof(name), "W%dT%d", w, t);
-					fprintf(stderr,"    %-8s  W%-7d  %12f\n", name, w, times[w*nt+t]);
-					fprintf(stderr,"    %-8s  T%-7d  %12d\n", name, t, 1);
+					fprintf(output,"    %-8s  W%-7d  %12f\n", name, w, times[w*nt+t]);
+					fprintf(output,"    %-8s  T%-7d  %12d\n", name, t, 1);
 				}
 
-		fprintf(output, "\n* Total execution time\n");
+		fprintf(output, "*\n* Total execution time\n*\n");
 		for (w = 0; w < nw; w++)
-			fprintf(stderr,"    TMAX      W%-2d       %12d\n", w, -1);
-		fprintf(stderr,"    TMAX      TMAX      %12d\n", 1);
+			fprintf(output,"    TMAX      W%-2d       %12d\n", w, -1);
+		fprintf(output,"    TMAX      TMAX      %12d\n", 1);
 
-		fprintf(output, "\nRHS\n");
+		fprintf(output, "*\nRHS\n*\n");
 
-		fprintf(output, "\n* Total number of tasks\n");
+		fprintf(output, "*\n* Total number of tasks\n*\n");
 		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
-			fprintf(stderr,"    NT%-2d      T%-7d  %12lu\n", t, t, tp->n);
+			fprintf(output,"    NT%-2d      T%-7d  %12lu\n", t, t, tp->n);
 
 		fprintf(output, "ENDATA\n");
 	}

+ 118 - 108
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -27,11 +27,7 @@
 #include <core/workers.h>
 #include <sched_policies/fifo_queues.h>
 #include <core/perfmodel/perfmodel.h>
-#include <starpu_parameters.h>
 #include <core/debug.h>
-#ifdef STARPU_USE_TOP
-#include <top/starpu_top_core.h>
-#endif /* !STARPU_USE_TOP */
 
 #ifndef DBL_MIN
 #define DBL_MIN __DBL_MIN__
@@ -54,12 +50,23 @@ struct _starpu_dmda_data
 	long int ready_task_cnt;
 };
 
-static double alpha = _STARPU_DEFAULT_ALPHA;
-static double beta = _STARPU_DEFAULT_BETA;
-static double _gamma = _STARPU_DEFAULT_GAMMA;
 static double idle_power = 0.0;
 
+/* The dmda scheduling policy uses
+ *
+ * alpha * T_computation + beta * T_communication + gamma * Consumption
+ *
+ * Here are the default values of alpha, beta, gamma
+ */
+
+#define _STARPU_SCHED_ALPHA_DEFAULT 1.0
+#define _STARPU_SCHED_BETA_DEFAULT 1.0
+#define _STARPU_SCHED_GAMMA_DEFAULT 1000.0
+
 #ifdef STARPU_USE_TOP
+static double alpha = _STARPU_SCHED_ALPHA_DEFAULT;
+static double beta = _STARPU_SCHED_BETA_DEFAULT;
+static double _gamma = _STARPU_SCHED_GAMMA_DEFAULT;
 static const float alpha_minimum=0;
 static const float alpha_maximum=10.0;
 static const float beta_minimum=0;
@@ -80,7 +87,7 @@ static int count_non_ready_buffers(struct starpu_task *task, unsigned node)
 	{
 		starpu_data_handle_t handle;
 
-		handle = task->handles[index];
+		handle = STARPU_TASK_GET_HANDLE(task, index);
 
 		int is_valid;
 		starpu_data_query_status(handle, node, NULL, &is_valid, NULL);
@@ -281,15 +288,10 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
 	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
 
-/* Sometimes workers didn't take the tasks as early as we expected */
+        /* Sometimes workers didn't take the tasks as early as we expected */
 	fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
 	fifo->exp_end = fifo->exp_start + fifo->exp_len;
-	if(!isnan(predicted))
-	{
-		fifo->exp_end += predicted;
-		fifo->exp_len += predicted;
-	}
-	
+
 	if (starpu_timing_now() + predicted_transfer < fifo->exp_end)
 	{
 		/* We may hope that the transfer will be finished by
@@ -309,16 +311,21 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 		fifo->exp_len += predicted_transfer;
 	}
 
+	if(!isnan(predicted))
+	{
+		fifo->exp_end += predicted;
+		fifo->exp_len += predicted;
+	}
+
 	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 
 	task->predicted = predicted;
 	task->predicted_transfer = predicted_transfer;
 
 #ifdef STARPU_USE_TOP
-	if (_starpu_top_status_get())
-		_starpu_top_task_prevision(task, best_workerid,
-			(unsigned long long)(fifo->exp_end-predicted)/1000,
-			(unsigned long long)fifo->exp_end/1000);
+	starpu_top_task_prevision(task, best_workerid,
+				  (unsigned long long)(fifo->exp_end-predicted)/1000,
+				  (unsigned long long)fifo->exp_end/1000);
 #endif /* !STARPU_USE_TOP */
 
 	if (starpu_get_prefetch_flag())
@@ -388,6 +395,17 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
 		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
 
+		/* Sometimes workers didn't take the tasks as early as we expected */
+		starpu_pthread_mutex_t *sched_mutex;
+		starpu_pthread_cond_t *sched_cond;
+		starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
+
+		_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
+		fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
+		fifo->exp_end = fifo->exp_start + fifo->exp_len;
+		_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
+
+
 		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 		{
 			if (!starpu_worker_can_execute_task(worker, task, nimpl))
@@ -398,27 +416,40 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 			}
 
 			double exp_end;
-			starpu_pthread_mutex_t *sched_mutex;
-			starpu_pthread_cond_t *sched_cond;
-			starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
-
-			/* Sometimes workers didn't take the tasks as early as we expected */
-			_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
-			fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
-			fifo->exp_end = fifo->exp_start + fifo->exp_len;
-			_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
-
-
 			double local_length = starpu_task_expected_length(task, perf_arch, nimpl);
 			double local_penalty = starpu_task_expected_data_transfer_time(memory_node, task);
 			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
 
 			//_STARPU_DEBUG("Scheduler dm: task length (%lf) worker (%u) kernel (%u) \n", local_length,worker,nimpl);
 
+			/*
+			 * This implements a default greedy scheduler for the
+			 * case of tasks which have no performance model, or
+			 * whose performance model is not calibrated yet.
+			 *
+			 * It simply uses the number of tasks already pushed to
+			 * the workers, divided by the relative performance of
+			 * a CPU and of a GPU.
+			 *
+			 * This is always computed, but the ntasks_best
+			 * selection is only really used if the task indeed has
+			 * no performance model, or is not calibrated yet.
+			 */
 			if (ntasks_best == -1
-			    || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
-			    || (!calibrating && isnan(local_length)) /* Not calibrating but this worker is being calibrated */
-			    || (calibrating && isnan(local_length) && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
+			
+			    /* Always compute the greedy decision, at least for
+			     * the tasks with no performance model. */
+			    || (!calibrating && ntasks_end < ntasks_best_end)
+
+			    /* The performance model of this task is not
+			     * calibrated on this worker, try to run it there
+			     * to calibrate it there. */
+			    || (!calibrating && isnan(local_length))
+
+			    /* the performance model of this task is not
+			     * calibrated on this worker either, rather run it
+			     * there if this one is low on scheduled tasks. */
+			    || (calibrating && isnan(local_length) && ntasks_end < ntasks_best_end)
 				)
 			{
 				ntasks_best_end = ntasks_end;
@@ -509,6 +540,15 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
 
+		/* Sometimes workers didn't take the tasks as early as we expected */
+		starpu_pthread_mutex_t *sched_mutex;
+		starpu_pthread_cond_t *sched_cond;
+		starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
+
+		_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
+		fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
+		_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
+
 		for(nimpl  = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 	 	{
 			if (!starpu_worker_can_execute_task(worker, task, nimpl))
@@ -517,15 +557,7 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 				continue;
 			}
 
-			/* Sometimes workers didn't take the tasks as early as we expected */
-			starpu_pthread_mutex_t *sched_mutex;
-			starpu_pthread_cond_t *sched_cond;
-			starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
-
 			STARPU_ASSERT_MSG(fifo != NULL, "worker %d ctx %d\n", worker, sched_ctx_id);
-			_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
-			fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
-			_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 			exp_end[worker_ctx][nimpl] = fifo->exp_start + fifo->exp_len;
 			if (exp_end[worker_ctx][nimpl] > max_exp_end)
 				max_exp_end = exp_end[worker_ctx][nimpl];
@@ -551,10 +583,34 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 			
 			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
 
+			/*
+			 * This implements a default greedy scheduler for the
+			 * case of tasks which have no performance model, or
+			 * whose performance model is not calibrated yet.
+			 *
+			 * It simply uses the number of tasks already pushed to
+			 * the workers, divided by the relative performance of
+			 * a CPU and of a GPU.
+			 *
+			 * This is always computed, but the ntasks_best
+			 * selection is only really used if the task indeed has
+			 * no performance model, or is not calibrated yet.
+			 */
 			if (ntasks_best == -1
-			    || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better worker */
-			    || (!calibrating && isnan(local_task_length[worker_ctx][nimpl])) /* Not calibrating but this worker is being calibrated */
-			    || (calibrating && isnan(local_task_length[worker_ctx][nimpl]) && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
+
+			    /* Always compute the greedy decision, at least for
+			     * the tasks with no performance model. */
+			    || (!calibrating && ntasks_end < ntasks_best_end)
+
+			    /* The performance model of this task is not
+			     * calibrated on this worker, try to run it there
+			     * to calibrate it there. */
+			    || (!calibrating && isnan(local_task_length[worker_ctx][nimpl]))
+
+			    /* the performance model of this task is not
+			     * calibrated on this worker either, rather run it
+			     * there if this one is low on scheduled tasks. */
+			    || (calibrating && isnan(local_task_length[worker_ctx][nimpl]) && ntasks_end < ntasks_best_end)
 				)
 			{
 				ntasks_best_end = ntasks_end;
@@ -722,64 +778,18 @@ static int dmda_push_sorted_task(struct starpu_task *task)
 #ifdef STARPU_DEVEL
 #warning TODO: after defining a scheduling window, use that instead of empty_ctx_tasks
 #endif
-	unsigned sched_ctx_id = task->sched_ctx;
-	starpu_pthread_mutex_t *changing_ctx_mutex = starpu_sched_ctx_get_changing_ctx_mutex(sched_ctx_id);
-	unsigned nworkers;
-	int ret_val = -1;
-
-	_STARPU_PTHREAD_MUTEX_LOCK(changing_ctx_mutex);
-	nworkers = starpu_sched_ctx_get_nworkers(sched_ctx_id);
-	if(nworkers == 0)
-	{
-		_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
-		return ret_val;
-	}
-
-	ret_val = _dmda_push_task(task, 1, sched_ctx_id);
-	_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
-	return ret_val;
-
+	return _dmda_push_task(task, 1, task->sched_ctx);
 }
 
 static int dm_push_task(struct starpu_task *task)
 {
-	unsigned sched_ctx_id = task->sched_ctx;
-	starpu_pthread_mutex_t *changing_ctx_mutex = starpu_sched_ctx_get_changing_ctx_mutex(sched_ctx_id);
-	unsigned nworkers;
-	int ret_val = -1;
-
-	_STARPU_PTHREAD_MUTEX_LOCK(changing_ctx_mutex);
-	nworkers = starpu_sched_ctx_get_nworkers(sched_ctx_id);
-	if(nworkers == 0)
-	{
-		_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
-		return ret_val;
-	}
-
-	ret_val = _dm_push_task(task, 0, sched_ctx_id);
-	_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
-	return ret_val;
+	return _dm_push_task(task, 0, task->sched_ctx);
 }
 
 static int dmda_push_task(struct starpu_task *task)
 {
-	unsigned sched_ctx_id = task->sched_ctx;
-	starpu_pthread_mutex_t *changing_ctx_mutex = starpu_sched_ctx_get_changing_ctx_mutex(sched_ctx_id);
-	unsigned nworkers;
-	int ret_val = -1;
-
-	_STARPU_PTHREAD_MUTEX_LOCK(changing_ctx_mutex);
-	nworkers = starpu_sched_ctx_get_nworkers(sched_ctx_id);
-	if(nworkers == 0)
-	{
-		_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
-		return ret_val;
-	}
-
 	STARPU_ASSERT(task);
-	ret_val = _dmda_push_task(task, 0, sched_ctx_id);
-	_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
-	return ret_val;
+	return _dmda_push_task(task, 0, task->sched_ctx);
 }
 
 static void dmda_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
@@ -820,9 +830,9 @@ static void initialize_dmda_policy(unsigned sched_ctx_id)
 	starpu_sched_ctx_create_worker_collection(sched_ctx_id, STARPU_WORKER_LIST);
 
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)malloc(sizeof(struct _starpu_dmda_data));
-	dt->alpha = _STARPU_DEFAULT_ALPHA;
-	dt->beta = _STARPU_DEFAULT_BETA;
-	dt->_gamma = _STARPU_DEFAULT_GAMMA;
+	dt->alpha = _STARPU_SCHED_ALPHA_DEFAULT;
+	dt->beta = _STARPU_SCHED_BETA_DEFAULT;
+	dt->_gamma = _STARPU_SCHED_GAMMA_DEFAULT;
 	dt->idle_power = 0.0;
 
 	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)dt);
@@ -851,13 +861,13 @@ static void initialize_dmda_policy(unsigned sched_ctx_id)
 
 #ifdef STARPU_USE_TOP
 	starpu_top_register_parameter_float("DMDA_ALPHA", &alpha,
-		alpha_minimum, alpha_maximum, param_modified);
+					    alpha_minimum, alpha_maximum, param_modified);
 	starpu_top_register_parameter_float("DMDA_BETA", &beta,
-		beta_minimum, beta_maximum, param_modified);
+					    beta_minimum, beta_maximum, param_modified);
 	starpu_top_register_parameter_float("DMDA_GAMMA", &_gamma,
-		gamma_minimum, gamma_maximum, param_modified);
+					    gamma_minimum, gamma_maximum, param_modified);
 	starpu_top_register_parameter_float("DMDA_IDLE_POWER", &idle_power,
-		idle_power_minimum, idle_power_maximum, param_modified);
+					    idle_power_minimum, idle_power_maximum, param_modified);
 #endif /* !STARPU_USE_TOP */
 }
 
@@ -933,14 +943,6 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid, unsign
 	fifo->exp_end = fifo->exp_start + fifo->exp_len;
 
 	/* If there is no prediction available, we consider the task has a null length */
-	if (!isnan(predicted))
-	{
-		task->predicted = predicted;
-		fifo->exp_end += predicted;
-		fifo->exp_len += predicted;
-	}
-
-	/* If there is no prediction available, we consider the task has a null length */
 	if (!isnan(predicted_transfer))
 	{
 		if (starpu_timing_now() + predicted_transfer < fifo->exp_end)
@@ -960,6 +962,14 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid, unsign
 		fifo->exp_len += predicted_transfer;
 	}
 
+	/* If there is no prediction available, we consider the task has a null length */
+	if (!isnan(predicted))
+	{
+		task->predicted = predicted;
+		fifo->exp_end += predicted;
+		fifo->exp_len += predicted;
+	}
+
 	fifo->ntasks++;
 
 	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);

+ 3 - 4
src/sched_policies/deque_queues.c

@@ -19,11 +19,10 @@
 /* Deque queues, ready for use by schedulers */
 
 #include <starpu.h>
-#include <common/config.h>
-#include <core/workers.h>
+#include <starpu_scheduler.h>
 #include <sched_policies/deque_queues.h>
-#include <errno.h>
-#include <common/utils.h>
+
+#include <core/workers.h>
 
 struct _starpu_deque_jobq *_starpu_create_deque(void)
 {

+ 0 - 1
src/sched_policies/deque_queues.h

@@ -20,7 +20,6 @@
 #define __DEQUE_QUEUES_H__
 
 #include <starpu.h>
-#include <common/config.h>
 #include <core/jobs.h>
 
 struct _starpu_deque_jobq

+ 1 - 2
src/sched_policies/detect_combined_workers.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2013  Université de Bordeaux 1
- * Copyright (C) 2011, 2012       Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012, 2013       Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,7 +15,6 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include <common/config.h>
 #include <starpu.h>
 #include <common/utils.h>
 #include <core/workers.h>

+ 2 - 15
src/sched_policies/eager_central_policy.c

@@ -21,8 +21,9 @@
  *	JOB QUEUE.
  */
 
-#include <core/workers.h>
+#include <starpu_scheduler.h>
 #include <sched_policies/fifo_queues.h>
+#include <common/thread.h>
 
 struct _starpu_eager_center_policy_data
 {
@@ -63,18 +64,7 @@ static int push_task_eager_policy(struct starpu_task *task)
  {
 	unsigned sched_ctx_id = task->sched_ctx;
 	struct _starpu_eager_center_policy_data *data = (struct _starpu_eager_center_policy_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
-	starpu_pthread_mutex_t *changing_ctx_mutex = starpu_sched_ctx_get_changing_ctx_mutex(sched_ctx_id);
-	unsigned nworkers;
 	int ret_val = -1;
-
-	_STARPU_PTHREAD_MUTEX_LOCK(changing_ctx_mutex);
-	nworkers = starpu_sched_ctx_get_nworkers(sched_ctx_id);
-	if(nworkers == 0)
-	{
-		_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
-		return ret_val;
-	}
-
 		
 	_STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
 	ret_val = _starpu_fifo_push_task(data->fifo, task);
@@ -82,7 +72,6 @@ static int push_task_eager_policy(struct starpu_task *task)
 	starpu_push_task_end(task);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 
-
 	/*if there are no tasks block */
 	/* wake people waiting for a task */
 	unsigned worker = 0;
@@ -103,8 +92,6 @@ static int push_task_eager_policy(struct starpu_task *task)
 		_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 	}
 
-		
-	_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
 	return ret_val;
 }
 

+ 0 - 13
src/sched_policies/eager_central_priority_policy.c

@@ -109,20 +109,8 @@ static int _starpu_priority_push_task(struct starpu_task *task)
 	struct _starpu_eager_central_prio_data *data = (struct _starpu_eager_central_prio_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 
 	struct _starpu_priority_taskq *taskq = data->taskq;
-
-	/* if the context has no workers return */
-	starpu_pthread_mutex_t *changing_ctx_mutex = starpu_sched_ctx_get_changing_ctx_mutex(sched_ctx_id);
-	unsigned nworkers;
 	int ret_val = -1;
 	
-	_STARPU_PTHREAD_MUTEX_LOCK(changing_ctx_mutex);
-	nworkers = starpu_sched_ctx_get_nworkers(sched_ctx_id);
-	if(nworkers == 0)
-	{
-		_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
-		return ret_val;
-	}
-
 
 	_STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
 	unsigned priolevel = task->priority - STARPU_MIN_PRIO;
@@ -153,7 +141,6 @@ static int _starpu_priority_push_task(struct starpu_task *task)
 		_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 	}
 
-	_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
 	return 0;
 }
 

+ 0 - 2
src/sched_policies/fifo_queues.h

@@ -20,8 +20,6 @@
 #define __FIFO_QUEUES_H__
 
 #include <starpu.h>
-#include <common/config.h>
-#include <common/utils.h>
 
 struct _starpu_fifo_taskq
 {

+ 40 - 61
src/sched_policies/parallel_eager.c

@@ -15,12 +15,10 @@
  *
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
-
-#include <core/workers.h>
 #include <sched_policies/fifo_queues.h>
-#include <common/barrier.h>
 #include <sched_policies/detect_combined_workers.h>
-#include <core/parallel_task.h>
+#include <starpu_scheduler.h>
+#include <core/workers.h>
 
 struct _starpu_peager_data
 {
@@ -28,12 +26,14 @@ struct _starpu_peager_data
 	struct _starpu_fifo_taskq *local_fifo[STARPU_NMAXWORKERS];
 
 	int master_id[STARPU_NMAXWORKERS];
+        starpu_pthread_mutex_t policy_mutex;
 };
 
+#define STARPU_NMAXCOMBINED_WORKERS 10
 /* XXX instead of 10, we should use some "MAX combination .."*/
 static int possible_combinations_cnt[STARPU_NMAXWORKERS];
-static int possible_combinations[STARPU_NMAXWORKERS][10];
-static int possible_combinations_size[STARPU_NMAXWORKERS][10];
+static int possible_combinations[STARPU_NMAXWORKERS][STARPU_NMAXCOMBINED_WORKERS];
+static int possible_combinations_size[STARPU_NMAXWORKERS][STARPU_NMAXCOMBINED_WORKERS];
 
 
 /*!!!!!!! It doesn't work with several contexts because the combined workers are constructed
@@ -135,6 +135,7 @@ static void initialize_peager_policy(unsigned sched_ctx_id)
 	data->fifo = _starpu_create_fifo();
 
 	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)data);
+        _STARPU_PTHREAD_MUTEX_INIT(&data->policy_mutex, NULL);
 }
 
 static void deinitialize_peager_policy(unsigned sched_ctx_id)
@@ -146,6 +147,7 @@ static void deinitialize_peager_policy(unsigned sched_ctx_id)
 	_starpu_destroy_fifo(data->fifo);
 
 	starpu_sched_ctx_delete_worker_collection(sched_ctx_id);
+        _STARPU_PTHREAD_MUTEX_DESTROY(&data->policy_mutex);
 
 	free(data);
 }
@@ -153,44 +155,24 @@ static void deinitialize_peager_policy(unsigned sched_ctx_id)
 static int push_task_peager_policy(struct starpu_task *task)
 {
 	unsigned sched_ctx_id = task->sched_ctx;
-	starpu_pthread_mutex_t *changing_ctx_mutex = starpu_sched_ctx_get_changing_ctx_mutex(sched_ctx_id);
-	unsigned nworkers;
 	int ret_val = -1;
 	
-	/* if the context has no workers return */
-	_STARPU_PTHREAD_MUTEX_LOCK(changing_ctx_mutex);
-	nworkers = starpu_sched_ctx_get_nworkers(sched_ctx_id);
-	
-   	if(nworkers == 0)
-	{
-   		_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
-		return ret_val;
-	}
 	struct _starpu_peager_data *data = (struct _starpu_peager_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
-	int worker = 0;
-	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
-	
-	struct starpu_sched_ctx_iterator it;
-	if(workers->init_iterator)
-		workers->init_iterator(workers, &it);
-	
-	while(workers->has_next(workers, &it))
-	{
-		worker = workers->get_next(workers, &it);
-		int master = data->master_id[worker];
-		/* If this is not a CPU, then the worker simply grabs tasks from the fifo */
-		if (starpu_worker_get_type(worker) != STARPU_CPU_WORKER  || master == worker)
-		{
-			starpu_pthread_mutex_t *sched_mutex;
-			starpu_pthread_cond_t *sched_cond;
-			starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
-			_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
-		}
-	}
-	
 	
+	_STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
 	ret_val = _starpu_fifo_push_task(data->fifo, task);
 	starpu_push_task_end(task);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
+
+        /*if there are no tasks block */
+        /* wake people waiting for a task */
+        int worker = -1;
+        struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
+
+        struct starpu_sched_ctx_iterator it;
+        if(workers->init_iterator)
+                workers->init_iterator(workers, &it);
+
 
 	while(workers->has_next(workers, &it))
 	{
@@ -202,12 +184,11 @@ static int push_task_peager_policy(struct starpu_task *task)
 			starpu_pthread_mutex_t *sched_mutex;
 			starpu_pthread_cond_t *sched_cond;
 			starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
+			_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
 			_STARPU_PTHREAD_COND_SIGNAL(sched_cond);
 			_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 		}
 	}
-	
-	_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
 
 	return ret_val;
 }
@@ -220,14 +201,24 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 
 	/* If this is not a CPU, then the worker simply grabs tasks from the fifo */
 	if (starpu_worker_get_type(workerid) != STARPU_CPU_WORKER)
-		return _starpu_fifo_pop_task(data->fifo, workerid);
+	{
+		struct starpu_task *task = NULL;
+		_STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
+		task = _starpu_fifo_pop_task(data->fifo, workerid);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
+
+		return task;
+	}
 
 	int master = data->master_id[workerid];
 
 	if (master == workerid)
 	{
 		/* The worker is a master */
-		struct starpu_task *task = _starpu_fifo_pop_task(data->fifo, workerid);
+		struct starpu_task *task = NULL;
+		_STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
+		task = _starpu_fifo_pop_task(data->fifo, workerid);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 
 		if (!task)
 			return NULL;
@@ -266,29 +257,17 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 		}
 		else
 		{
-			/* The master needs to dispatch the task between the
-			 * different combined workers */
-			struct _starpu_combined_worker *combined_worker;
-			combined_worker = _starpu_get_combined_worker_struct(best_workerid);
-			int worker_size = combined_worker->worker_size;
-			int *combined_workerid = combined_worker->combined_workerid;
-
-			struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
-			j->task_size = worker_size;
-			j->combined_workerid = best_workerid;
-			j->active_task_alias_count = 0;
-
-			//fprintf(stderr, "POP -> size %d best_size %d\n", worker_size, best_size);
-
-			_STARPU_PTHREAD_BARRIER_INIT(&j->before_work_barrier, NULL, worker_size);
-			_STARPU_PTHREAD_BARRIER_INIT(&j->after_work_barrier, NULL, worker_size);
+			starpu_parallel_task_barrier_init(task, best_workerid);
+			int worker_size = 0;
+			int *combined_workerid;
+			starpu_combined_worker_get_description(best_workerid, &worker_size, &combined_workerid);
 
 			/* Dispatch task aliases to the different slaves */
 			for (i = 1; i < worker_size; i++)
 			{
-				struct starpu_task *alias = _starpu_create_task_alias(task);
+				struct starpu_task *alias = starpu_task_dup(task);
 				int local_worker = combined_workerid[i];
-				
+
 				starpu_pthread_mutex_t *sched_mutex;
 				starpu_pthread_cond_t *sched_cond;
 				starpu_worker_get_sched_condition(local_worker, &sched_mutex, &sched_cond);
@@ -303,7 +282,7 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 			}
 
 			/* The master also manipulated an alias */
-			struct starpu_task *master_alias = _starpu_create_task_alias(task);
+			struct starpu_task *master_alias = starpu_task_dup(task);
 			return master_alias;
 		}
 	}

+ 17 - 40
src/sched_policies/parallel_heft.c

@@ -23,9 +23,7 @@
 #include <core/workers.h>
 #include <core/perfmodel/perfmodel.h>
 #include <starpu_parameters.h>
-#include <common/barrier.h>
 #include <sched_policies/detect_combined_workers.h>
-#include <core/parallel_task.h>
 
 #ifndef DBL_MIN
 #define DBL_MIN __DBL_MIN__
@@ -39,6 +37,14 @@
 //static enum starpu_perf_archtype applicable_perf_archtypes[STARPU_NARCH_VARIATIONS];
 //static unsigned napplicable_perf_archtypes = 0;
 
+/*
+ * Here are the default values of alpha, beta, gamma
+ */
+
+#define _STARPU_SCHED_ALPHA_DEFAULT 1.0
+#define _STARPU_SCHED_BETA_DEFAULT 1.0
+#define _STARPU_SCHED_GAMMA_DEFAULT 1000.0
+
 struct _starpu_pheft_data
 {
 	double alpha;
@@ -128,33 +134,25 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 	}
 	else
 	{
-		/* This is a combined worker so we create task aliases */
-		struct _starpu_combined_worker *combined_worker;
-		combined_worker = _starpu_get_combined_worker_struct(best_workerid);
-		int worker_size = combined_worker->worker_size;
-		int *combined_workerid = combined_worker->combined_workerid;
-
-		struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
-		j->task_size = worker_size;
-		j->combined_workerid = best_workerid;
-		j->active_task_alias_count = 0;
-
 		/* This task doesn't belong to an actual worker, it belongs
 		 * to a combined worker and thus the scheduler doesn't care
 		 * of its predicted values which are insignificant */
 		task->predicted = 0;
 		task->predicted_transfer = 0;
 
-		_STARPU_PTHREAD_BARRIER_INIT(&j->before_work_barrier, NULL, worker_size);
-		_STARPU_PTHREAD_BARRIER_INIT(&j->after_work_barrier, NULL, worker_size);
+		starpu_parallel_task_barrier_init(task, best_workerid);
+		int worker_size = 0;
+		int *combined_workerid;
+		starpu_combined_worker_get_description(best_workerid, &worker_size, &combined_workerid);
 
 		/* All cpu workers must be locked at once */
 		_STARPU_PTHREAD_MUTEX_LOCK(&hd->global_push_mutex);
 
+		/* This is a combined worker so we create task aliases */
 		int i;
 		for (i = 0; i < worker_size; i++)
 		{
-			struct starpu_task *alias = _starpu_create_task_alias(task);
+			struct starpu_task *alias = starpu_task_dup(task);
 			int local_worker = combined_workerid[i];
 
 			alias->predicted = exp_end_predicted - worker_exp_end[local_worker];
@@ -489,36 +487,15 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 static int parallel_heft_push_task(struct starpu_task *task)
 {
 	unsigned sched_ctx_id = task->sched_ctx;
-	starpu_pthread_mutex_t *changing_ctx_mutex = starpu_sched_ctx_get_changing_ctx_mutex(sched_ctx_id);
-	unsigned nworkers;
 	int ret_val = -1;
 
 	if (task->priority == STARPU_MAX_PRIO)
 	{
-		_STARPU_PTHREAD_MUTEX_LOCK(changing_ctx_mutex);
-                nworkers = starpu_sched_ctx_get_nworkers(sched_ctx_id);
-                if(nworkers == 0)
-                {
-                        _STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
-                        return ret_val;
-                }
-
 		ret_val = _parallel_heft_push_task(task, 1, sched_ctx_id);
-		_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
-                return ret_val;
-        }
-
-
-	_STARPU_PTHREAD_MUTEX_LOCK(changing_ctx_mutex);
-	nworkers = starpu_sched_ctx_get_nworkers(sched_ctx_id);
-        if(nworkers == 0)
-	{
-		_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
                 return ret_val;
         }
 
         ret_val = _parallel_heft_push_task(task, 0, sched_ctx_id);
-	_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
 	return ret_val;
 }
 
@@ -575,9 +552,9 @@ static void initialize_parallel_heft_policy(unsigned sched_ctx_id)
 {
 	starpu_sched_ctx_create_worker_collection(sched_ctx_id, STARPU_WORKER_LIST);
 	struct _starpu_pheft_data *hd = (struct _starpu_pheft_data*)malloc(sizeof(struct _starpu_pheft_data));
-	hd->alpha = _STARPU_DEFAULT_ALPHA;
-	hd->beta = _STARPU_DEFAULT_BETA;
-	hd->_gamma = _STARPU_DEFAULT_GAMMA;
+	hd->alpha = _STARPU_SCHED_ALPHA_DEFAULT;
+	hd->beta = _STARPU_SCHED_BETA_DEFAULT;
+	hd->_gamma = _STARPU_SCHED_GAMMA_DEFAULT;
 	hd->idle_power = 0.0;
 
 	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)hd);

+ 1 - 16
src/sched_policies/random_policy.c

@@ -83,22 +83,7 @@ static int _random_push_task(struct starpu_task *task, unsigned prio)
 
 static int random_push_task(struct starpu_task *task)
 {
-	unsigned sched_ctx_id = task->sched_ctx;
-	starpu_pthread_mutex_t *changing_ctx_mutex = starpu_sched_ctx_get_changing_ctx_mutex(sched_ctx_id);
-	unsigned nworkers;
-        int ret_val = -1;
-
-        _STARPU_PTHREAD_MUTEX_LOCK(changing_ctx_mutex);
-	nworkers = starpu_sched_ctx_get_nworkers(sched_ctx_id);
-        if(nworkers == 0)
-        {
-		_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
-                return ret_val;
-        }
-
-        ret_val = _random_push_task(task, !!task->priority);
-        _STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
-        return ret_val;
+        return _random_push_task(task, !!task->priority);
 }
 
 static void initialize_random_policy(unsigned sched_ctx_id)

+ 0 - 1
src/sched_policies/stack_queues.h

@@ -20,7 +20,6 @@
 #define __STACK_QUEUES_H__
 
 #include <starpu.h>
-#include <common/config.h>
 #include <core/jobs.h>
 
 struct _starpu_stack_jobq

+ 0 - 15
src/sched_policies/work_stealing_policy.c

@@ -336,19 +336,6 @@ int ws_push_task(struct starpu_task *task)
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 	int workerid = starpu_worker_get_id();
 
-	starpu_pthread_mutex_t *changing_ctx_mutex = starpu_sched_ctx_get_changing_ctx_mutex(sched_ctx_id);
-        unsigned nworkers;
-        int ret_val = -1;
-
-	/* if the context has no workers return */
-        _STARPU_PTHREAD_MUTEX_LOCK(changing_ctx_mutex);
-        nworkers = starpu_sched_ctx_get_nworkers(sched_ctx_id);
-        if(nworkers == 0)
-        {
-                _STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
-                return ret_val;
-        }
-
 	unsigned worker = 0;
 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
 	struct starpu_sched_ctx_iterator it;
@@ -394,8 +381,6 @@ int ws_push_task(struct starpu_task *task)
 		_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 	}
 		
-        _STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
-
 	return 0;
 }
 

+ 0 - 11
src/starpu_parameters.h

@@ -20,17 +20,6 @@
 /* Parameters which are not worth being added to ./configure options, but
  * still interesting to easily change */
 
-/* The dmda scheduling policy uses
- *
- * alpha * T_computation + beta * T_communication + gamma * Consumption
- *
- * Here are the default values of alpha, beta, gamma
- */
-
-#define _STARPU_DEFAULT_ALPHA 1.0
-#define _STARPU_DEFAULT_BETA 1.0
-#define _STARPU_DEFAULT_GAMMA 1000.0
-
 /* How many executions a codelet will have to be measured before we
  * consider that calibration will provide a value good enough for scheduling */
 #define _STARPU_CALIBRATION_MINIMUM 10

+ 2 - 2
src/top/starpu_top_core.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011 William Braik, Yann Courtois, Jean-Marie Couteyen, Anthony Roy
- * Copyright (C) 2011, 2012 Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012, 2013 Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -48,7 +48,7 @@ void __starpu_top_task_prevision_timespec(struct starpu_task *task,
 					int devid,
 					const struct timespec* start,
 					const struct timespec* end);
-void _starpu_top_task_prevision(struct starpu_task *task,
+void starpu_top_task_prevision(struct starpu_task *task,
 			       int devid, unsigned long long start,
 			       unsigned long long end);
 

+ 6 - 3
src/top/starpu_top_task.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011 William Braik, Yann Courtois, Jean-Marie Couteyen, Anthony Roy
- * Copyright (C) 2011 Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2013 Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -66,17 +66,20 @@ void __starpu_top_task_prevision_timespec(struct starpu_task *task,
 					const struct timespec* start,
 					const struct timespec* end)
 {
-	_starpu_top_task_prevision(task,
+	starpu_top_task_prevision(task,
 				  devid,
 				  _starpu_top_timing_timespec_to_ms(start),
 				  _starpu_top_timing_timespec_to_ms(end));
 }
 
-void _starpu_top_task_prevision(struct starpu_task *task,
+void starpu_top_task_prevision(struct starpu_task *task,
 			       int devid,
 			       unsigned long long start,
 			       unsigned long long end)
 {
+	if (!_starpu_top_status_get())
+		return;
+
 	unsigned long long taskid = _starpu_get_job_associated_to_task(task)->job_id;
 	STARPU_ASSERT(_starpu_top_status_get());
 	struct timespec now;

+ 2 - 2
src/util/starpu_data_cpy.c

@@ -103,8 +103,8 @@ int _starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_h
 	task->callback_func = callback_func;
 	task->callback_arg = callback_arg;
 
-	task->handles[0] = dst_handle;
-	task->handles[1] = src_handle;
+	STARPU_TASK_SET_HANDLE(task, dst_handle, 0);
+	STARPU_TASK_SET_HANDLE(task, src_handle, 1);
 
 	task->synchronous = !asynchronous;
 

+ 12 - 6
src/util/starpu_insert_task.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012  Université de Bordeaux 1
- * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,7 +23,7 @@
 #include <stdarg.h>
 #include <util/starpu_insert_task_utils.h>
 
-void starpu_codelet_pack_args(char **arg_buffer, size_t *arg_buffer_size, ...)
+void starpu_codelet_pack_args(void **arg_buffer, size_t *arg_buffer_size, ...)
 {
 	va_list varg_list;
 
@@ -32,7 +32,7 @@ void starpu_codelet_pack_args(char **arg_buffer, size_t *arg_buffer_size, ...)
 	*arg_buffer_size = _starpu_insert_task_get_arg_size(varg_list);
 
 	va_start(varg_list, arg_buffer_size);
-	_starpu_codelet_pack_args(*arg_buffer_size, arg_buffer, varg_list);
+	_starpu_codelet_pack_args(arg_buffer, *arg_buffer_size, varg_list);
 }
 
 void starpu_codelet_unpack_args(void *_cl_arg, ...)
@@ -66,7 +66,7 @@ void starpu_codelet_unpack_args(void *_cl_arg, ...)
 int starpu_insert_task(struct starpu_codelet *cl, ...)
 {
 	va_list varg_list;
-	char *arg_buffer = NULL;
+	void *arg_buffer = NULL;
 
 	/* Compute the size */
 	size_t arg_buffer_size = 0;
@@ -76,11 +76,17 @@ int starpu_insert_task(struct starpu_codelet *cl, ...)
 	if (arg_buffer_size)
 	{
 		va_start(varg_list, cl);
-		_starpu_codelet_pack_args(arg_buffer_size, &arg_buffer, varg_list);
+		_starpu_codelet_pack_args(&arg_buffer, arg_buffer_size, varg_list);
 	}
 
-	va_start(varg_list, cl);
 	struct starpu_task *task = starpu_task_create();
+
+	if (cl && cl->nbuffers > STARPU_NMAXBUFS)
+	{
+		task->dyn_handles = malloc(cl->nbuffers * sizeof(starpu_data_handle_t));
+	}
+
+	va_start(varg_list, cl);
 	int ret = _starpu_insert_task_create_and_submit(arg_buffer, arg_buffer_size, cl, &task, varg_list);
 
 	if (ret == -ENODEV)

+ 19 - 14
src/util/starpu_insert_task_utils.c

@@ -18,6 +18,7 @@
 #include <util/starpu_insert_task_utils.h>
 #include <common/config.h>
 #include <common/utils.h>
+#include <core/task.h>
 
 typedef void (*_starpu_callback_func_t)(void *);
 
@@ -120,15 +121,16 @@ size_t _starpu_insert_task_get_arg_size(va_list varg_list)
 	return arg_buffer_size;
 }
 
-int _starpu_codelet_pack_args(size_t arg_buffer_size, char **arg_buffer, va_list varg_list)
+int _starpu_codelet_pack_args(void **arg_buffer, size_t arg_buffer_size, va_list varg_list)
 {
 	int arg_type;
 	unsigned current_arg_offset = 0;
 	unsigned char nargs = 0;
+	char *_arg_buffer;
 
 	/* The buffer will contain : nargs, {size, content} (x nargs)*/
 
-	*arg_buffer = (char *) malloc(arg_buffer_size);
+	_arg_buffer = malloc(arg_buffer_size);
 
 	/* We will begin the buffer with the number of args (which is stored as a char) */
 	current_arg_offset += sizeof(char);
@@ -150,10 +152,10 @@ int _starpu_codelet_pack_args(size_t arg_buffer_size, char **arg_buffer, va_list
 			void *ptr = va_arg(varg_list, void *);
 			size_t cst_size = va_arg(varg_list, size_t);
 
-			*(size_t *)(&(*arg_buffer)[current_arg_offset]) = cst_size;
+			*(size_t *)(&(_arg_buffer)[current_arg_offset]) = cst_size;
 			current_arg_offset += sizeof(size_t);
 
-			memcpy(&(*arg_buffer)[current_arg_offset], ptr, cst_size);
+			memcpy(&_arg_buffer[current_arg_offset], ptr, cst_size);
 			current_arg_offset += cst_size;
 
 			nargs++;
@@ -205,19 +207,20 @@ int _starpu_codelet_pack_args(size_t arg_buffer_size, char **arg_buffer, va_list
 
 	if (nargs)
 	{
-		(*arg_buffer)[0] = nargs;
+		_arg_buffer[0] = nargs;
 	}
 	else
 	{
-		free(*arg_buffer);
-		*arg_buffer = NULL;
+		free(_arg_buffer);
+		_arg_buffer = NULL;
 	}
 
+	*arg_buffer = _arg_buffer;
 	va_end(varg_list);
 	return 0;
 }
 
-int _starpu_insert_task_create_and_submit(char *arg_buffer, size_t arg_buffer_size, struct starpu_codelet *cl, struct starpu_task **task, va_list varg_list)
+int _starpu_insert_task_create_and_submit(void *arg_buffer, size_t arg_buffer_size, struct starpu_codelet *cl, struct starpu_task **task, va_list varg_list)
 {
 	int arg_type;
 	unsigned current_buffer = 0;
@@ -239,18 +242,20 @@ int _starpu_insert_task_create_and_submit(char *arg_buffer, size_t arg_buffer_si
 
 			STARPU_ASSERT(cl != NULL);
 
-			(*task)->handles[current_buffer] = handle;
-			if (cl->modes[current_buffer])
+			STARPU_TASK_SET_HANDLE((*task), handle, current_buffer);
+			if (STARPU_CODELET_GET_MODE(cl, current_buffer))
 			{
-				STARPU_ASSERT_MSG(cl->modes[current_buffer] == mode, "The codelet <%s> defines the access mode %d for the buffer %d which is different from the mode %d given to starpu_insert_task\n",
-						  cl->name, cl->modes[current_buffer], current_buffer, mode);
+				STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(cl, current_buffer) == mode,
+						   "The codelet <%s> defines the access mode %d for the buffer %d which is different from the mode %d given to starpu_insert_task\n",
+						  cl->name, STARPU_CODELET_GET_MODE(cl, current_buffer),
+						  current_buffer, mode);
 			}
 			else
 			{
 #ifdef STARPU_DEVEL
 #  warning shall we print a warning to the user
 #endif
-				cl->modes[current_buffer] = mode;
+				STARPU_CODELET_SET_MODE(cl, mode, current_buffer);
 			}
 
 			current_buffer++;
@@ -264,7 +269,7 @@ int _starpu_insert_task_create_and_submit(char *arg_buffer, size_t arg_buffer_si
 			int i;
 			for(i=0 ; i<nb_handles ; i++)
 			{
-				(*task)->handles[current_buffer] = handles[i];
+				STARPU_TASK_SET_HANDLE((*task), handles[i], current_buffer);
 				current_buffer++;
 			}
 

+ 4 - 4
src/util/starpu_insert_task_utils.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,9 +22,9 @@
 #include <starpu.h>
 
 size_t _starpu_insert_task_get_arg_size(va_list varg_list);
-int _starpu_codelet_pack_args(size_t arg_buffer_size, char **arg_buffer, va_list varg_list);
-int _starpu_insert_task_create_and_submit(char *arg_buffer, size_t arg_buffer_size, struct starpu_codelet *cl, struct starpu_task **task, va_list varg_list);
-int _starpu_insert_task_create_and_submit_array(char *arg_buffer, size_t arg_buffer_size, struct starpu_codelet *cl, struct starpu_task **task, starpu_data_handle_t *handles, unsigned nb_handles, va_list varg_list);
+int _starpu_codelet_pack_args(void **arg_buffer, size_t arg_buffer_size, va_list varg_list);
+int _starpu_insert_task_create_and_submit(void *arg_buffer, size_t arg_buffer_size, struct starpu_codelet *cl, struct starpu_task **task, va_list varg_list);
+int _starpu_insert_task_create_and_submit_array(void *arg_buffer, size_t arg_buffer_size, struct starpu_codelet *cl, struct starpu_task **task, starpu_data_handle_t *handles, unsigned nb_handles, va_list varg_list);
 
 #endif // __STARPU_INSERT_TASK_UTILS_H__
 

+ 1 - 0
tests/Makefile.am

@@ -208,6 +208,7 @@ noinst_PROGRAMS =				\
 	parallel_tasks/explicit_combined_worker	\
 	parallel_tasks/parallel_kernels		\
 	parallel_tasks/parallel_kernels_spmd	\
+	parallel_tasks/spmd_peager		\
 	perfmodels/regression_based		\
 	perfmodels/non_linear_regression_based	\
 	perfmodels/feed				\

+ 2 - 6
tests/main/insert_task.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -93,14 +93,10 @@ int main(int argc, char **argv)
 	task->cl = &mycodelet;
 	task->handles[0] = data_handles[0];
 	task->handles[1] = data_handles[1];
-	char *arg_buffer;
-	size_t arg_buffer_size;
-	starpu_codelet_pack_args(&arg_buffer, &arg_buffer_size,
+	starpu_codelet_pack_args(&task->cl_arg, &task->cl_arg_size,
 			    STARPU_VALUE, &ifactor, sizeof(ifactor),
 			    STARPU_VALUE, &ffactor, sizeof(ffactor),
 			    0);
-	task->cl_arg = arg_buffer;
-	task->cl_arg_size = arg_buffer_size;
 
 	ret = starpu_task_submit(task);
 	if (ret == -ENODEV) goto enodev;

+ 5 - 3
tools/Makefile.am

@@ -86,7 +86,8 @@ bin_PROGRAMS += 			\
 	starpu_perfmodel_display	\
 	starpu_perfmodel_plot 		\
 	starpu_calibrate_bus		\
-	starpu_machine_display
+	starpu_machine_display		\
+	starpu_lp2paje
 
 starpu_perfmodel_plot_CPPFLAGS = $(AM_CFLAGS) $(AM_CPPFLAGS) $(FXT_CFLAGS)
 
@@ -104,8 +105,6 @@ STARPU_TOOLS	+=			\
 	starpu_perfmodel_plot
 endif
 
-noinst_PROGRAMS =	cbc2paje lp2paje
-
 dist_bin_SCRIPTS +=			\
 	starpu_workers_activity		\
 	starpu_codelet_histo_profile	\
@@ -129,6 +128,8 @@ starpu_perfmodel_display.1: starpu_perfmodel_display$(EXEEXT)
 	help2man --no-discard-stderr -N --output=$@ ./$<
 starpu_perfmodel_plot.1: starpu_perfmodel_plot$(EXEEXT)
 	help2man --no-discard-stderr -N --output=$@ ./$<
+starpu_lp2paje.1: starpu_lp2paje$(EXEEXT)
+	help2man --no-discard-stderr -N --output=$@ ./$<
 starpu_workers_activity.1: starpu_workers_activity$(EXEEXT)
 	chmod +x $<
 	help2man --no-discard-stderr -N --output=$@ ./$<
@@ -153,6 +154,7 @@ dist_man1_MANS =\
 	starpu_machine_display.1 \
 	starpu_perfmodel_display.1 \
 	starpu_perfmodel_plot.1	\
+	starpu_lp2paje.1	\
 	starpu_workers_activity.1 \
 	starpu_codelet_profile.1 \
 	starpu_codelet_histo_profile.1

+ 0 - 156
tools/cbc2paje.c

@@ -1,156 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010  Université de Bordeaux 1
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-struct task {
-	double start;
-	double stop;
-	int worker;
-};
-
-int main(int argc, char *argv[]) {
-	int nw, nt;
-	double tmax;
-	int i, w, t, t2;
-	int foo;
-	double bar;
-	unsigned long num;
-	int b;
-	unsigned long next = 1;
-
-	if (argc != 3) {
-		fprintf(stderr,"usage: %s nb_workers nb_tasks\n", argv[0]);
-		exit(1);
-	}
-	nw = atoi(argv[1]);
-	nt = atoi(argv[2]);
-	fprintf(stderr,"%d workers, %d tasks\n", nw, nt);
-	assert(scanf("Optimal - objective value       %lf", &tmax) == 1);
-	printf(
-"%%EventDef PajeDefineContainerType 1\n"
-"%%  Alias         string\n"
-"%%  ContainerType string\n"
-"%%  Name          string\n"
-"%%EndEventDef\n"
-"%%EventDef PajeCreateContainer     2\n"
-"%%  Time          date\n"
-"%%  Alias         string\n"
-"%%  Type          string\n"
-"%%  Container     string\n"
-"%%  Name          string\n"
-"%%EndEventDef\n"
-"%%EventDef PajeDefineStateType     3\n"
-"%%  Alias         string\n"
-"%%  ContainerType string\n"
-"%%  Name          string\n"
-"%%EndEventDef\n"
-"%%EventDef PajeDestroyContainer    4\n"
-"%%  Time          date\n"
-"%%  Name          string\n"
-"%%  Type          string\n"
-"%%EndEventDef\n"
-"%%EventDef PajeDefineEntityValue 5\n"
-"%%  Alias         string\n"
-"%%  EntityType    string\n"
-"%%  Name          string\n"
-"%%  Color         color\n"
-"%%EndEventDef\n"
-"%%EventDef PajeSetState 6\n"
-"%%  Time          date\n"
-"%%  Type          string\n"
-"%%  Container     string\n"
-"%%  Value         string\n"
-"%%EndEventDef\n"
-"1 W 0 Worker\n"
-);
-	printf("3 S W \"Worker State\"\n");
-	printf("5 S S Running \"0.0 1.0 0.0\"\n");
-	printf("5 F S Idle \"1.0 0.0 0.0\"\n");
-	for (i = 0; i < nw; i++)
-		printf("2 0 W%d W 0 \"%d\"\n", i, i);
-
-	for (w = 0; w < nw; w++)
-		printf("4 %f W%d W\n", tmax, w);
-
-	assert(scanf("%d C%d %lf %lf", &foo, &foo, &tmax, &bar) == 4);
-	next++;
-	{
-		struct task task[nt];
-		memset(&task, 0, sizeof(task));
-		for (t = 0; t < nt; t++) {
-			assert(scanf("%d C%d %lf %lf", &foo, &foo, &task[t].stop, &bar) == 4);
-			next++;
-		}
-
-		while (1) {
-			assert(scanf("%d C%lu", &foo, &num) == 2);
-			if (num >= next +
-
-				/* FIXME */
-				//nw*nt
-				8*20 + 5*16
-
-				) {
-				next+= 8*20+5*16;
-				break;
-			}
-			/* FIXME */
-			if (num-next < 8*20) {
-				t = (num - next) / nw;
-				w = (num - next) % nw;
-			} else {
-				unsigned long nnum = (num-next)-8*20;
-				t = (nnum / 5) + 20;
-				w = (nnum % 5)+3;
-			}
-
-			assert(scanf("%d %lf", &b, &bar) == 2);
-			if (b) {
-				task[t].worker = w;
-				fprintf(stderr,"%lu: task %d on %d: %f\n", num, t, w, task[t].stop);
-			}
-		}
-		while(1) {
-			t = num - next;
-			if (t > nt)
-				break;
-			assert(scanf("%lf %lf", &task[t].start, &bar) == 2);
-			assert(scanf("%d C%lu", &foo, &num) == 2);
-		}
-
-		for (t = 0; t < nt; t++) {
-			printf("6 %f S W%d S\n", task[t].start, task[t].worker);
-			printf("6 %f S W%d F\n", task[t].stop, task[t].worker);
-		}
-
-		for (t = 0; t < nt; t++) {
-			for (t2 = 0; t2 < nt; t2++) {
-				if (t != t2 && task[t].worker == task[t2].worker) {
-					if (!(task[t].start >= task[t2].stop
-					    || task[t2].start >= task[t].stop)) {
-						fprintf(stderr,"oops, %d and %d sharing worker %d !!\n", t, t2, task[t].worker);
-					}
-				}
-			}
-		}
-	}
-
-	return 0;
-}

+ 42 - 47
tools/lp2paje.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -14,34 +14,48 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <config.h>
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
+#define PROGNAME "starpu_lp2paje"
+
 struct task {
 	double start;
 	double stop;
+	int num;
 	int worker;
 };
 
 int main(int argc, char *argv[]) {
 	int nw, nt;
 	double tmax;
-	int i, w, t, t2;
+	int i, w, ww, t, tt, t2;
 	int foo;
 	double bar;
-	unsigned long num;
-	unsigned long next = 1;
 
-	if (argc != 3) {
-		fprintf(stderr,"usage: %s nb_workers nb_tasks\n", argv[0]);
-		exit(1);
+	if (argc != 1) {
+		if (strcmp(argv[1], "-v") == 0
+		 || strcmp(argv[1], "--version") == 0)
+		{
+			fprintf(stderr, PROGNAME " (" PACKAGE_NAME ") " PACKAGE_VERSION "\n");
+			exit(EXIT_SUCCESS);
+		}
+		fprintf(stderr, "Convert schedule optimized by lp into the Paje format\n\n");
+		fprintf(stderr, "Usage: lp_solve file.lp | %s > paje.trace\n", PROGNAME);
+		fprintf(stderr, "Reports bugs to <"PACKAGE_BUGREPORT">.");
+		fprintf(stderr, "\n");
+		exit(EXIT_SUCCESS);
 	}
-	nw = atoi(argv[1]);
-	nt = atoi(argv[2]);
-	fprintf(stderr,"%d workers, %d tasks\n", nw, nt);
+	scanf("Suboptimal solution\n");
 	assert(scanf("\nValue of objective function: %lf\n", &tmax) == 1);
+
+	assert(scanf("Actual values of the variables:\n") == 0);
+	assert(scanf("tmax %lf\n", &tmax) == 1);
+	assert(scanf("nt %d\n", &nt) == 1);
+	assert(scanf("nw %d\n", &nw) == 1);
 	printf(
 "%%EventDef PajeDefineContainerType 1\n"
 "%%  Alias         string\n"
@@ -80,7 +94,8 @@ int main(int argc, char *argv[]) {
 "1 W 0 Worker\n"
 );
 	printf("3 S W \"Worker State\"\n");
-	printf("5 S S Running \"0.0 1.0 0.0\"\n");
+	for (t = 0; t < nt; t++)
+		printf("5 R%d S Running_%d \"0.0 1.0 0.0\"\n", t, t);
 	printf("5 F S Idle \"1.0 0.0 0.0\"\n");
 	for (i = 0; i < nw; i++)
 		printf("2 0 W%d W 0 \"%d\"\n", i, i);
@@ -88,52 +103,32 @@ int main(int argc, char *argv[]) {
 	for (w = 0; w < nw; w++)
 		printf("4 %f W%d W\n", tmax, w);
 
-	assert(scanf("Actual values of the variables:\n") == 0);
-	assert(scanf("tmax %lf\n", &tmax) == 1);
-	next++;
+	fprintf(stderr,"%d workers, %d tasks\n", nw, nt);
 	{
 		struct task task[nt];
 		memset(&task, 0, sizeof(task));
-		for (t = 0; t < nt; t++) {
+		for (t = nt-1; t >= 0; t--) {
 			assert(scanf("c%d %lf\n", &foo, &task[t].stop) == 2);
-			next++;
 		}
 
-		num = next;
-		while (1) {
-			if (num >= next +
-
-				/* FIXME */
-				//nw*nt
-				8*84 + 5*49
-
-				) {
-				next+= 8*84+5*49;
-				break;
-			}
-			assert(scanf("t%dw%d %lf\n", &foo, &foo, &bar) == 3);
-			/* FIXME */
-			if (num-next < 8*84) {
-				t = (num - next) / nw;
-				w = (num - next) % nw;
-			} else {
-				unsigned long nnum = (num-next)-8*84;
-				t = (nnum / 5) + 84;
-				w = (nnum % 5)+3;
-			}
+		for (t = nt-1; t >= 0; t--)
+			for (w = 0; w < nw; w++) {
+				assert(scanf("t%dw%d %lf\n", &tt, &ww, &bar) == 3);
+				assert(ww == w);
 
-			if (bar > 0.5) {
-				task[t].worker = w;
-				fprintf(stderr,"%lu: task %d on %d: %f\n", num, t, w, task[t].stop);
-			}
-			num++;
+				if (bar > 0.5) {
+					task[t].num = tt;
+					task[t].worker = w;
+				}
 		}
-		for (t = 0; t < nt; t++) {
-			assert(scanf("s%d %lf\n", &foo, &task[t].start) == 2);
+		for (t = nt-1; t >= 0; t--) {
+			assert(scanf("s%d %lf\n", &tt, &task[t].start) == 2);
+			fprintf(stderr,"%d: task %d on %d: %f - %f\n", nt-1-t, tt, task[t].worker, task[t].start, task[t].stop);
+			assert(tt == task[t].num);
 		}
 
 		for (t = 0; t < nt; t++) {
-			printf("6 %f S W%d S\n", task[t].start, task[t].worker);
+			printf("6 %f S W%d R%d\n", task[t].start, task[t].worker, t);
 			printf("6 %f S W%d F\n", task[t].stop, task[t].worker);
 		}
 
@@ -142,7 +137,7 @@ int main(int argc, char *argv[]) {
 				if (t != t2 && task[t].worker == task[t2].worker) {
 					if (!(task[t].start >= task[t2].stop
 					    || task[t2].start >= task[t].stop)) {
-						fprintf(stderr,"oops, %d and %d sharing worker %d !!\n", t, t2, task[t].worker);
+						fprintf(stderr,"oops, %d and %d sharing worker %d !!\n", task[t].num, task[t2].num, task[t].worker);
 					}
 				}
 			}