Browse Source

merge trunk@9615:9720

Nathalie Furmento 12 years ago
parent
commit
c4b5f1398a
100 changed files with 1472 additions and 571 deletions
  1. 24 10
      ChangeLog
  2. 2 2
      TODO
  3. 18 2
      configure.ac
  4. 7 3
      doc/Makefile.am
  5. 4 4
      doc/chapters/advanced-examples.texi
  6. 142 62
      doc/chapters/api.texi
  7. 6 55
      doc/chapters/fft-support.texi
  8. 1 1
      doc/chapters/installing.texi
  9. 26 1
      doc/chapters/mpi-support.texi
  10. 14 6
      doc/chapters/perf-feedback.texi
  11. 10 3
      examples/Makefile.am
  12. 1 1
      examples/basic_examples/dynamic_handles.c
  13. 1 1
      examples/cg/cg_kernels.c
  14. 6 6
      examples/cholesky/cholesky_models.c
  15. 1 1
      examples/filters/custom_mf/custom_interface.c
  16. 8 8
      examples/heat/lu_kernels_model.c
  17. 1 1
      examples/interface/complex_interface.c
  18. 1 1
      examples/lu/lu_example.c
  19. 3 3
      examples/profiling/profiling.c
  20. 1 1
      examples/reductions/dot_product.c
  21. 126 0
      examples/sched_ctx/parallel_code.c
  22. 3 9
      gcc-plugin/examples/stencil5.c
  23. 1 1
      gcc-plugin/include/starpu-gcc/tasks.h
  24. 1 1
      gcc-plugin/src/tasks.c
  25. 2 2
      gcc-plugin/tests/mocks.h
  26. 11 8
      include/starpu_data.h
  27. 6 6
      include/starpu_data_interfaces.h
  28. 27 3
      include/starpu_deprecated_api.h
  29. 1 1
      include/starpu_driver.h
  30. 2 2
      include/starpu_fxt.h
  31. 3 3
      include/starpu_hash.h
  32. 12 12
      include/starpu_perfmodel.h
  33. 9 9
      include/starpu_profiling.h
  34. 2 0
      include/starpu_sched_ctx.h
  35. 8 8
      include/starpu_scheduler.h
  36. 4 4
      include/starpu_task.h
  37. 2 2
      include/starpu_util.h
  38. 6 6
      include/starpu_worker.h
  39. 4 0
      mpi/include/starpu_mpi.h
  40. 394 87
      mpi/src/starpu_mpi.c
  41. 2 2
      mpi/src/starpu_mpi_datatype.c
  42. 16 9
      mpi/src/starpu_mpi_insert_task.c
  43. 12 0
      mpi/src/starpu_mpi_private.c
  44. 5 0
      mpi/src/starpu_mpi_private.h
  45. 1 1
      mpi/tests/insert_task_owner2.c
  46. 1 1
      mpi/tests/insert_task_owner_data.c
  47. 1 1
      mpi/tests/user_defined_datatype_value.h
  48. 6 6
      sc_hypervisor/examples/cholesky/cholesky_models.c
  49. 3 3
      sc_hypervisor/include/sc_hypervisor_monitoring.h
  50. 5 5
      sc_hypervisor/include/sc_hypervisor_policy.h
  51. 2 2
      sc_hypervisor/src/hypervisor_policies/debit_lp_policy.c
  52. 2 2
      sc_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c
  53. 2 2
      sc_hypervisor/src/hypervisor_policies/ispeed_policy.c
  54. 4 4
      sc_hypervisor/src/policies_utils/lp_tools.c
  55. 16 16
      sc_hypervisor/src/policies_utils/policy_tools.c
  56. 8 8
      sc_hypervisor/src/sc_hypervisor.c
  57. 1 1
      socl/src/cl_enqueuemapbuffer.c
  58. 1 1
      src/common/fxt.c
  59. 11 11
      src/common/hash.c
  60. 1 1
      src/core/combined_workers.c
  61. 8 8
      src/core/dependencies/data_concurrency.c
  62. 1 1
      src/core/dependencies/data_concurrency.h
  63. 5 5
      src/core/dependencies/implicit_data_deps.c
  64. 2 2
      src/core/dependencies/implicit_data_deps.h
  65. 1 1
      src/core/jobs.c
  66. 3 3
      src/core/jobs.h
  67. 15 15
      src/core/perfmodel/perfmodel.c
  68. 6 6
      src/core/perfmodel/perfmodel.h
  69. 10 10
      src/core/perfmodel/perfmodel_history.c
  70. 6 6
      src/core/perfmodel/perfmodel_print.c
  71. 202 3
      src/core/sched_ctx.c
  72. 21 1
      src/core/sched_ctx.h
  73. 1 1
      src/core/sched_policy.c
  74. 1 1
      src/core/simgrid.c
  75. 1 1
      src/core/simgrid.h
  76. 3 3
      src/core/task.c
  77. 2 2
      src/core/task_bundle.c
  78. 2 2
      src/core/task_bundle.h
  79. 4 4
      src/core/topology.c
  80. 18 15
      src/core/workers.c
  81. 14 5
      src/core/workers.h
  82. 16 16
      src/datawizard/coherency.c
  83. 5 5
      src/datawizard/coherency.h
  84. 3 3
      src/datawizard/data_request.c
  85. 3 3
      src/datawizard/data_request.h
  86. 1 1
      src/datawizard/datastats.c
  87. 1 1
      src/datawizard/filters.c
  88. 7 7
      src/datawizard/footprint.c
  89. 1 1
      src/datawizard/footprint.h
  90. 3 3
      src/datawizard/interfaces/bcsr_interface.c
  91. 3 3
      src/datawizard/interfaces/block_interface.c
  92. 1 1
      src/datawizard/interfaces/coo_interface.c
  93. 1 1
      src/datawizard/interfaces/csr_interface.c
  94. 94 13
      src/datawizard/interfaces/data_interface.c
  95. 1 1
      src/datawizard/interfaces/matrix_interface.c
  96. 1 1
      src/datawizard/interfaces/multiformat_interface.c
  97. 1 1
      src/datawizard/interfaces/variable_interface.c
  98. 1 1
      src/datawizard/interfaces/vector_interface.c
  99. 2 2
      src/datawizard/memalloc.c
  100. 0 0
      src/datawizard/sort_data_handles.c

+ 24 - 10
ChangeLog

@@ -18,6 +18,16 @@ StarPU 1.2.0 (svn revision xxxx)
 ==============================================
 
 New features:
+  * New function starpu_sched_ctx_exec_parallel_code to execute a
+    parallel code on the workers of the given scheduler context
+  * MPI:
+        - New internal communication system: a unique tag
+	  is now used for all communications, and a system
+	  of hashmaps on each node which stores pending receives has been
+	  implemented. Every message is now coupled with an envelope, sent
+	  before the corresponding data, which allows the receiver to
+	  allocate data correctly, and to submit the matching receive of
+	  the envelope.
 
 StarPU 1.1.0 (svn revision xxxx)
 ==============================================
@@ -131,8 +141,20 @@ Small features:
   * New macro STARPU_RELEASE_VERSION
   * New function starpu_get_version() to return as 3 integers the
     release version of StarPU.
+  * Enable by default data allocation cache
 
 Changes:
+  * Rename all filter functions to follow the pattern
+    starpu_DATATYPE_filter_FILTERTYPE. The script
+    tools/dev/rename_filter.sh is provided to update your existing
+    applications to use the new filter function names.
+  * Renaming of diverse functions and datatypes. The script
+    tools/dev/rename.sh is provided to update your existing
+    applications to use the new names. It is also possible to compile
+    with the pkg-config package starpu-1.0 to keep using the old
+    names. It is however recommended to update your code and to use
+    the package starpu-1.1.
+
   * Fix the block filter functions.
   * Fix StarPU-MPI on Darwin.
   * The FxT code can now be used on systems other than Linux.
@@ -165,16 +187,6 @@ Changes:
     instead.
   * StarPU can now use poti to generate paje traces.
   * Rename scheduling policy "parallel greedy" to "parallel eager"
-  * Rename all filter functions to follow the pattern
-    starpu_DATATYPE_filter_FILTERTYPE. The script
-    tools/dev/rename_filter.sh is provided to rename your existing
-    applications using filters.
-  * Rename function starpu_helper_cublas_init to starpu_cublas_init
-  * Rename function starpu_helper_cublas_shutdown to starpu_cublas_shutdown
-  * Rename function starpu_allocate_buffer_on_node to starpu_malloc_on_node
-  * Rename function starpu_free_buffer_on_node to starpu_free_on_node
-  * Rename getter and setter functions for minimum and maximum task
-    priorities
   * starpu_scheduler.h is no longer automatically included by
     starpu.h, it has to be manually included when needed
   * New batch files to run StarPU applications with Microsoft Visual C
@@ -182,6 +194,8 @@ Changes:
     installed version of StarPU. That can also be used to test
     examples using a previous API.
   * Tutorial is installed in ${docdir}/tutorial
+  * Schedulers eager_central_policy, dm and dmda no longer erroneously respect
+    priorities. dmdas has to be used to respect priorities.
 
 Small changes:
   * STARPU_NCPU should now be used instead of STARPU_NCPUS. STARPU_NCPUS is

+ 2 - 2
TODO

@@ -4,6 +4,6 @@ Moving access modes for data handles from struct starpu_task to struct starpu_co
 
 TODO list
 
-- Make struct starpu_buffer_descr private (or not, as it can still be used in tests and examples)
+- Make struct starpu_data_descr private (or not, as it can still be used in tests and examples)
 
-- When cost_model is provided, but not cost_function, need to rebuild a struct starpu_buffer_descr
+- When cost_model is provided, but not cost_function, need to rebuild a struct starpu_data_descr

+ 18 - 2
configure.ac

@@ -283,9 +283,9 @@ else
    build_sc_hypervisor="no"
 fi
 
-
 AM_CONDITIONAL([STARPU_BUILD_SC_HYPERVISOR], [test "x$build_sc_hypervisor" = "xyes"])
 AM_CONDITIONAL([STARPU_USE_SC_HYPERVISOR], [test "x$build_sc_hypervisor" = "xyes"])
+
 ###############################################################################
 #                                                                             #
 #                                 CPUs settings                               #
@@ -1000,6 +1000,15 @@ if test x$use_fxt = xyes; then
 	FXT_LIBS="$FXT_LIBS $POTI_LIBS"
 fi
 
+AC_MSG_CHECKING(whether additional locking systems FxT traces should be enabled)
+AC_ARG_ENABLE(fxt-lock, [AS_HELP_STRING([--enable-fxt-lock],
+			[enable additional locking systems FxT traces])],
+			enable_fxt_lock=$enableval, enable_fxt_lock=no)
+AC_MSG_RESULT($enable_fxt_lock)
+if  test x$enable_fxt_lock = xyes; then
+	AC_DEFINE(STARPU_FXT_LOCK_TRACES, [1], [enable additional locking systems FxT traces])
+fi
+
 AC_MSG_CHECKING(whether performance debugging should be enabled)
 AC_ARG_ENABLE(perf-debug, [AS_HELP_STRING([--enable-perf-debug],
 			[enable performance debugging through gprof])],
@@ -1092,7 +1101,7 @@ AC_DEFINE_UNQUOTED(STARPU_MAXNODES, [$maxnodes],
 AC_MSG_CHECKING(whether allocation cache should be used)
 AC_ARG_ENABLE(allocation-cache, [AS_HELP_STRING([--enable-allocation-cache],
 			[enable data allocation cache])],
-			enable_allocation_cache=$enableval, enable_allocation_cache=no)
+			enable_allocation_cache=$enableval, enable_allocation_cache=yes)
 AC_MSG_RESULT($enable_allocation_cache)
 if test x$enable_allocation_cache = xyes; then
 	AC_DEFINE(STARPU_USE_ALLOCATION_CACHE, [1], [enable data allocation cache])
@@ -1829,6 +1838,13 @@ m4_ifdef([AM_SILENT_RULES],
 AC_ARG_ENABLE(build-doc, [AS_HELP_STRING([--disable-build-doc],
 			[disable building of documentation])],
 			enable_build_doc=$enableval, enable_build_doc=yes)
+
+# Check whether texi2dvi is installed
+AC_PATH_PROG(texi2dvicommand, texi2dvi)
+if test "$texi2dvicommand" = "" ; then
+	enable_build_doc="no"
+fi
+
 AM_CONDITIONAL(BUILD_DOC, [test x$enable_build_doc != xno])
 
 ###############################################################################

+ 7 - 3
doc/Makefile.am

@@ -82,8 +82,10 @@ chapters/version.texi: $(chapters)
 	@-for f in $(starpu_TEXINFOS) ; do \
                 if test -f $(top_srcdir)/doc/$$f ; then stat --format=%Y $(top_srcdir)/doc/$$f 2>/dev/null ; fi \
         done | sort -r | head -1 > timestamp
-	@-LC_ALL=C date --date=@`cat timestamp` +"%d %B %Y" > timestamp_updated 2>/dev/null
-	@-LC_ALL=C date --date=@`cat timestamp` +"%B %Y" > timestamp_updated_month 2>/dev/null
+	@if test -s timestamp ; then \
+		LC_ALL=C date --date=@`cat timestamp` +"%d %B %Y" > timestamp_updated 2>/dev/null;\
+		LC_ALL=C date --date=@`cat timestamp` +"%B %Y" > timestamp_updated_month 2>/dev/null;\
+	fi
 	@if test -s timestamp_updated ; then \
 		echo "@set UPDATED " `cat timestamp_updated` > $(top_srcdir)/doc/chapters/version.texi;\
 		echo "@set UPDATED-MONTH" `cat timestamp_updated_month` >> $(top_srcdir)/doc/chapters/version.texi;\
@@ -93,7 +95,9 @@ chapters/version.texi: $(chapters)
 	fi
 	@echo "@set EDITION $(VERSION)" >> $(top_srcdir)/doc/chapters/version.texi
 	@echo "@set VERSION $(VERSION)" >> $(top_srcdir)/doc/chapters/version.texi
-	@$(RM) timestamp timestamp_updated timestamp_updated_month
+	@-for f in timestamp timestamp_updated timestamp_updated_month ; do \
+		if test -f $$f ; then $(RM) $$f ; fi ;\
+	done
 
 #$(top_srcdir)/doc/starpu.texi: vector_scal_c.texi vector_scal_cuda.texi vector_scal_opencl.texi vector_scal_opencl_codelet.texi
 #vector_scal_c.texi: $(top_srcdir)/examples/basic_examples/vector_scal.c

+ 4 - 4
doc/chapters/advanced-examples.texi

@@ -175,7 +175,7 @@ task->destroy = 0;
 starpu_task_submit(task);
 
 /* The task is finished, get profiling information */
-struct starpu_task_profiling_info *info = task->profiling_info;
+struct starpu_profiling_task_info *info = task->profiling_info;
 
 /* How much time did it take before the task started ? */
 double delay += starpu_timing_timespec_delay_us(&info->submit_time, &info->start_time);
@@ -194,8 +194,8 @@ starpu_task_destroy(task);
 int worker;
 for (worker = 0; worker < starpu_worker_get_count(); worker++)
 @{
-        struct starpu_worker_profiling_info worker_info;
-        int ret = starpu_worker_get_profiling_info(worker, &worker_info);
+        struct starpu_profiling_worker_info worker_info;
+        int ret = starpu_profiling_worker_get_info(worker, &worker_info);
         STARPU_ASSERT(!ret);
 
         double total_time = starpu_timing_timespec_to_us(&worker_info.total_time);
@@ -1278,7 +1278,7 @@ the field @code{dyn_handles} when defining a task and the field
 
 @cartouche
 @smallexample
-enum starpu_access_mode modes[STARPU_NMAXBUFS+1] = @{
+enum starpu_data_access_mode modes[STARPU_NMAXBUFS+1] = @{
 	STARPU_R, STARPU_R, ...
 @};
 

+ 142 - 62
doc/chapters/api.texi

@@ -26,6 +26,7 @@
 * OpenCL extensions::
 * Miscellaneous helpers::
 * FXT Support::
+* FFT Support::
 * MPI::
 * Task Bundles::
 * Task Lists::
@@ -60,7 +61,7 @@ Return as 3 integers the release version of StarPU.
 
 @deftp {Data Type} {struct starpu_driver}
 @table @asis
-@item @code{enum starpu_archtype type}
+@item @code{enum starpu_worker_archtype type}
 The type of the driver. Only STARPU_CPU_DRIVER, STARPU_CUDA_DRIVER and
 STARPU_OPENCL_DRIVER are currently supported.
 @item @code{union id} Anonymous union
@@ -312,7 +313,7 @@ return the amount of available memory on the node. Otherwise return
 @node Workers' Properties
 @section Workers' Properties
 
-@deftp {Data Type} {enum starpu_archtype}
+@deftp {Data Type} {enum starpu_worker_archtype}
 The different values are:
 @table @asis
 @item @code{STARPU_CPU_WORKER}
@@ -326,7 +327,7 @@ This function returns the number of workers (i.e. processing units executing
 StarPU tasks). The returned value should be at most @code{STARPU_NMAXWORKERS}.
 @end deftypefun
 
-@deftypefun int starpu_worker_get_count_by_type ({enum starpu_archtype} @var{type})
+@deftypefun int starpu_worker_get_count_by_type ({enum starpu_worker_archtype} @var{type})
 Returns the number of workers of the given @var{type}. A non-negative
 value is returned in case of success; @code{-EINVAL} indicates that
 the type is not valid.
@@ -354,7 +355,7 @@ worker (i.e. when called from the application outside a task or a callback), or
 an integer between 0 and @code{starpu_worker_get_count() - 1}.
 @end deftypefun
 
-@deftypefun int starpu_worker_get_ids_by_type ({enum starpu_archtype} @var{type}, int *@var{workerids}, int @var{maxsize})
+@deftypefun int starpu_worker_get_ids_by_type ({enum starpu_worker_archtype} @var{type}, int *@var{workerids}, int @var{maxsize})
 This function gets the list of identifiers of workers with the given
 type. It fills the workerids array with the identifiers of the workers that have the type
 indicated in the first argument. The maxsize argument indicates the size of the
@@ -366,12 +367,12 @@ chosen by the means of the @code{starpu_worker_get_count_by_type} function, or
 by passing a value greater or equal to @code{STARPU_NMAXWORKERS}.
 @end deftypefun
 
-@deftypefun int starpu_worker_get_by_type ({enum starpu_archtype} @var{type}, int @var{num})
+@deftypefun int starpu_worker_get_by_type ({enum starpu_worker_archtype} @var{type}, int @var{num})
 This returns the identifier of the @var{num}-th worker that has the specified type
 @var{type}. If there is no such worker, -1 is returned.
 @end deftypefun
 
-@deftypefun int starpu_worker_get_by_devid ({enum starpu_archtype} @var{type}, int @var{devid})
+@deftypefun int starpu_worker_get_by_devid ({enum starpu_worker_archtype} @var{type}, int @var{devid})
 This returns the identifier of the worker that has the specified type
 @var{type} and devid @var{devid} (which may not be the n-th, if some devices are
 skipped for instance). If there is no such worker, -1 is returned.
@@ -387,7 +388,7 @@ worker was bound; this identifier is either provided by the OS or by the
 @code{hwloc} library in case it is available.
 @end deftypefun
 
-@deftypefun {enum starpu_archtype} starpu_worker_get_type (int @var{id})
+@deftypefun {enum starpu_worker_archtype} starpu_worker_get_type (int @var{id})
 This function returns the type of processing unit associated to a
 worker. The worker identifier is a value returned by the
 @code{starpu_worker_get_id} function. The returned value
@@ -474,7 +475,7 @@ data).
 @node Basic Data Management API
 @subsection Basic Data Management API
 
-@deftp {Data Type} {enum starpu_access_mode}
+@deftp {Data Type} {enum starpu_data_access_mode}
 This datatype describes a data access mode. The different available modes are:
 @table @asis
 @item @code{STARPU_R}: read-only mode.
@@ -602,10 +603,19 @@ codelet, and reduction between per-worker buffers will be done with the
 @var{redux_cl} codelet.
 @end deftypefun
 
+@deftypefun struct starpu_data_interface_ops* starpu_data_get_interface_ops (starpu_data_handle_t @var{handle})
+Get a pointer to the structure describing the different methods used
+to manipulate the given data. See @ref{struct starpu_data_interface_ops} for more details on this structure.
+@end deftypefun
+
+@deftypefun unsigned starpu_data_get_sequential_consistency_flag (starpu_data_handle_t @var{handle})
+Return the sequential consistency flag of the given data.
+@end deftypefun
+
 @node Access registered data from the application
 @subsection Access registered data from the application
 
-@deftypefun int starpu_data_acquire (starpu_data_handle_t @var{handle}, {enum starpu_access_mode} @var{mode})
+@deftypefun int starpu_data_acquire (starpu_data_handle_t @var{handle}, {enum starpu_data_access_mode} @var{mode})
 The application must call this function prior to accessing registered data from
 main memory outside tasks. StarPU ensures that the application will get an
 up-to-date copy of the data in main memory located where the data was
@@ -624,7 +634,7 @@ tasks or from their callbacks (in that case, @code{starpu_data_acquire} returns
 @end deftypefun
 
 
-@deftypefun int starpu_data_acquire_cb (starpu_data_handle_t @var{handle}, {enum starpu_access_mode} @var{mode}, void (*@var{callback})(void *), void *@var{arg})
+@deftypefun int starpu_data_acquire_cb (starpu_data_handle_t @var{handle}, {enum starpu_data_access_mode} @var{mode}, void (*@var{callback})(void *), void *@var{arg})
 @code{starpu_data_acquire_cb} is the asynchronous equivalent of
 @code{starpu_data_acquire}. When the data specified in the first argument is
 available in the appropriate access mode, the callback function is executed.
@@ -638,17 +648,17 @@ be called from task callbacks. Upon successful completion, this function
 returns 0.
 @end deftypefun
 
-@deftypefun int starpu_data_acquire_on_node (starpu_data_handle_t @var{handle}, unsigned @var{node}, {enum starpu_access_mode} @var{mode})
+@deftypefun int starpu_data_acquire_on_node (starpu_data_handle_t @var{handle}, unsigned @var{node}, {enum starpu_data_access_mode} @var{mode})
 This is the same as @code{starpu_data_acquire}, except that the data will be
 available on the given memory node instead of main memory.
 @end deftypefun
 
-@deftypefun int starpu_data_acquire_on_node_cb (starpu_data_handle_t @var{handle}, unsigned @var{node}, {enum starpu_access_mode} @var{mode}, void (*@var{callback})(void *), void *@var{arg})
+@deftypefun int starpu_data_acquire_on_node_cb (starpu_data_handle_t @var{handle}, unsigned @var{node}, {enum starpu_data_access_mode} @var{mode}, void (*@var{callback})(void *), void *@var{arg})
 This is the same as @code{starpu_data_acquire_cb}, except that the data will be
 available on the given memory node instead of main memory.
 @end deftypefun
 
-@defmac STARPU_DATA_ACQUIRE_CB (starpu_data_handle_t @var{handle}, {enum starpu_access_mode} @var{mode}, code)
+@defmac STARPU_DATA_ACQUIRE_CB (starpu_data_handle_t @var{handle}, {enum starpu_data_access_mode} @var{mode}, code)
 @code{STARPU_DATA_ACQUIRE_CB} is the same as @code{starpu_data_acquire_cb},
 except that the code to be executed in a callback is directly provided as a
 macro parameter, and the data handle is automatically released after it. This
@@ -819,26 +829,26 @@ The different values are:
 @node Accessing Handle
 @subsubsection Handle
 
-@deftypefun {void *} starpu_handle_to_pointer (starpu_data_handle_t @var{handle}, unsigned @var{node})
+@deftypefun {void *} starpu_data_handle_to_pointer (starpu_data_handle_t @var{handle}, unsigned @var{node})
 Return the pointer associated with @var{handle} on node @var{node} or
 @code{NULL} if @var{handle}'s interface does not support this
 operation or data for this handle is not allocated on that node.
 @end deftypefun
 
-@deftypefun {void *} starpu_handle_get_local_ptr (starpu_data_handle_t @var{handle})
+@deftypefun {void *} starpu_data_get_local_ptr (starpu_data_handle_t @var{handle})
 Return the local pointer associated with @var{handle} or @code{NULL}
 if @var{handle}'s interface does not have data allocated locally
 @end deftypefun
 
-@deftypefun {enum starpu_data_interface_id} starpu_handle_get_interface_id (starpu_data_handle_t @var{handle})
+@deftypefun {enum starpu_data_interface_id} starpu_data_get_interface_id (starpu_data_handle_t @var{handle})
 Return the unique identifier of the interface associated with the given @var{handle}.
 @end deftypefun
 
-@deftypefun size_t starpu_handle_get_size (starpu_data_handle_t @var{handle})
+@deftypefun size_t starpu_data_get_size (starpu_data_handle_t @var{handle})
 Return the size of the data associated with @var{handle}
 @end deftypefun
 
-@deftypefun int starpu_handle_pack_data (starpu_data_handle_t @var{handle}, {void **}@var{ptr}, {starpu_ssize_t *}@var{count})
+@deftypefun int starpu_data_pack (starpu_data_handle_t @var{handle}, {void **}@var{ptr}, {starpu_ssize_t *}@var{count})
 Execute the packing operation of the interface of the data registered
 at @var{handle} (@pxref{struct starpu_data_interface_ops}). This
 packing operation must allocate a buffer large enough at @var{ptr} and
@@ -851,7 +861,7 @@ would have been allocated. The special value @code{-1} indicates the
 size is yet unknown.
 @end deftypefun
 
-@deftypefun int starpu_handle_unpack_data (starpu_data_handle_t @var{handle}, {void *}@var{ptr}, size_t @var{count})
+@deftypefun int starpu_data_unpack (starpu_data_handle_t @var{handle}, {void *}@var{ptr}, size_t @var{count})
 Unpack in @var{handle} the data located at @var{ptr} of size
 @var{count} as described by the interface of the data. The interface
 registered at @var{handle} must define a unpacking operation
@@ -1398,21 +1408,21 @@ completed.
 @end deftypefun
 
 
-@deftypefun uint32_t starpu_crc32_be_n ({void *}@var{input}, size_t @var{n}, uint32_t @var{inputcrc})
+@deftypefun uint32_t starpu_hash_crc32c_be_n ({void *}@var{input}, size_t @var{n}, uint32_t @var{inputcrc})
 Compute the CRC of a byte buffer seeded by the inputcrc "current
 state". The return value should be considered as the new "current
 state" for future CRC computation. This is used for computing data size
 footprint.
 @end deftypefun
 
-@deftypefun uint32_t starpu_crc32_be (uint32_t @var{input}, uint32_t @var{inputcrc})
+@deftypefun uint32_t starpu_hash_crc32c_be (uint32_t @var{input}, uint32_t @var{inputcrc})
 Compute the CRC of a 32bit number seeded by the inputcrc "current
 state". The return value should be considered as the new "current
 state" for future CRC computation. This is used for computing data size
 footprint.
 @end deftypefun
 
-@deftypefun uint32_t starpu_crc32_string ({char *}@var{str}, uint32_t @var{inputcrc})
+@deftypefun uint32_t starpu_hash_crc32c_string ({char *}@var{str}, uint32_t @var{inputcrc})
 Compute the CRC of a string seeded by the inputcrc "current state".
 The return value should be considered as the new "current state" for
 future CRC computation. This is used for computing data size footprint.
@@ -1888,8 +1898,8 @@ array. The constant argument passed with the @code{cl_arg} field of the
 @code{starpu_task} structure is not counted in this number.  This value should
 not be above @code{STARPU_NMAXBUFS}.
 
-@item @code{enum starpu_access_mode modes[STARPU_NMAXBUFS]}
-Is an array of @code{enum starpu_access_mode}. It describes the
+@item @code{enum starpu_data_access_mode modes[STARPU_NMAXBUFS]}
+Is an array of @code{enum starpu_data_access_mode}. It describes the
 required access modes to the data needed by the codelet (e.g.
 @code{STARPU_RW}). The number of entries in this array must be
 specified in the @code{nbuffers} field (defined above), and should not
@@ -1897,8 +1907,8 @@ exceed @code{STARPU_NMAXBUFS}.
 If insufficient, this value can be set with the @code{--enable-maxbuffers}
 option when configuring StarPU.
 
-@item @code{enum starpu_access_mode *dyn_modes}
-Is an array of @code{enum starpu_access_mode}. It describes the
+@item @code{enum starpu_data_access_mode *dyn_modes}
+Is an array of @code{enum starpu_data_access_mode}. It describes the
 required access modes to the data needed by the codelet (e.g.
 @code{STARPU_RW}). The number of entries in this array must be
 specified in the @code{nbuffers} field (defined above).
@@ -1953,12 +1963,12 @@ State of a task, can be either of
 @end table
 @end deftp
 
-@deftp {Data Type} {struct starpu_buffer_descr}
+@deftp {Data Type} {struct starpu_data_descr}
 This type is used to describe a data handle along with an
 access mode.
 @table @asis
 @item @code{starpu_data_handle_t handle} describes a data,
-@item @code{enum starpu_access_mode mode} describes its access mode
+@item @code{enum starpu_data_access_mode mode} describes its access mode
 @end table
 @end deftp
 
@@ -1979,7 +1989,7 @@ describes where the kernel should be executed, and supplies the appropriate
 implementations. When set to @code{NULL}, no code is executed during the tasks,
 such empty tasks can be useful for synchronization purposes.
 
-@item @code{struct starpu_buffer_descr buffers[STARPU_NMAXBUFS]}
+@item @code{struct starpu_data_descr buffers[STARPU_NMAXBUFS]}
 This field is deprecated. One should use instead the
 @code{handles} field to specify the handles to the data accessed by
 the task. The access modes are now defined in the @code{mode} field of
@@ -2109,7 +2119,7 @@ executed. This flag must not be set if the destroy flag is set too.
 @item @code{enum starpu_task_status status} (optional)
 Current state of the task.
 
-@item @code{struct starpu_task_profiling_info *profiling_info} (optional)
+@item @code{struct starpu_profiling_task_info *profiling_info} (optional)
 Profiling information for the task.
 
 @item @code{double predicted} (output field)
@@ -2183,7 +2193,7 @@ i-th element of the field @code{dyn_modes} (@pxref{Setting the Data
 Handles for a Task})
 @end defmac
 
-@defmac STARPU_CODELET_SET_MODE ({struct starpu_codelet *}@var{codelet}codelet, {enum starpu_access_mode} @var{mode}, int @var{i})
+@defmac STARPU_CODELET_SET_MODE ({struct starpu_codelet *}@var{codelet}, {enum starpu_data_access_mode} @var{mode}, int @var{i})
 Set the access mode of the i-th data handle of the given codelet.
 If the codelet is defined with a static or dynamic number of handles,
 will either set the i-th element of the field @code{modes} or the
@@ -2202,7 +2212,7 @@ by the task have to be freed by calling
 @code{starpu_task_destroy}.
 @end deftypefun
 
-@deftypefun {struct starpu_task *}starpu_task_dup ({struct starpu_task *}@var{task})
+@deftypefun {struct starpu_task *} starpu_task_dup ({struct starpu_task *}@var{task})
 Allocate a task structure which is the exact duplicate of the given task.
 @end deftypefun
 
@@ -2317,7 +2327,7 @@ The arguments following the codelets can be of the following types:
 @item
 the specific values @code{STARPU_VALUE}, @code{STARPU_CALLBACK},
 @code{STARPU_CALLBACK_ARG}, @code{STARPU_CALLBACK_WITH_ARG},
-@code{STARPU_PRIORITY}, @code{STARPU_TAG}, @code{STARPU_FLOPS}, followed by the appropriated objects
+@code{STARPU_PRIORITY}, @code{STARPU_TAG}, @code{STARPU_FLOPS}, @code{STARPU_SCHED_CTX} followed by the appropriate objects
 as defined below.
 @end itemize
 
@@ -2370,6 +2380,11 @@ by an amount of floating point operations, as a double. The user may have to
 explicitly cast into double, otherwise parameter passing will not work.
 @end defmac
 
+@defmac STARPU_SCHED_CTX
+this macro is used when calling @code{starpu_insert_task}, and must be followed
+by the id of the scheduling context to which we want to submit the task.
+@end defmac
+
 @deftypefun void starpu_codelet_pack_args ({void **}@var{arg_buffer}, {size_t *}@var{arg_buffer_size}, ...)
 Pack arguments of type @code{STARPU_VALUE} into a buffer which can be
 given to a codelet and later unpacked with the function
@@ -2521,7 +2536,7 @@ be set with @code{starpu_data_set_default_sequential_consistency_flag}.
 @node Performance Model API
 @section Performance Model API
 
-@deftp {Data Type} {enum starpu_perf_archtype}
+@deftp {Data Type} {enum starpu_perfmodel_archtype}
 Enumerates the various types of architectures.
 CPU types range within STARPU_CPU_DEFAULT (1 CPU), STARPU_CPU_DEFAULT+1 (2 CPUs), ... STARPU_CPU_DEFAULT + STARPU_MAXCPUS - 1 (STARPU_MAXCPUS CPUs).
 CUDA types range within STARPU_CUDA_DEFAULT (GPU number 0), STARPU_CUDA_DEFAULT + 1 (GPU number 1), ..., STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS - 1 (GPU number STARPU_MAXCUDADEVS - 1).
@@ -2570,7 +2585,7 @@ is the symbol name for the performance model, which will be used as
 file name to store the model. It must be set otherwise the model will
 be ignored.
 
-@item @code{double (*cost_model)(struct starpu_buffer_descr *)}
+@item @code{double (*cost_model)(struct starpu_data_descr *)}
 This field is deprecated. Use instead the @code{cost_function} field.
 
 @item @code{double (*cost_function)(struct starpu_task *, unsigned nimpl)}
@@ -2621,17 +2636,16 @@ Lock to protect concurrency between loading from disk (W), updating the values
 contains information about the performance model of a given arch.
 
 @table @asis
-@item @code{double (*cost_model)(struct starpu_buffer_descr *t)}
+@item @code{double (*cost_model)(struct starpu_data_descr *t)}
 This field is deprecated. Use instead the @code{cost_function} field.
 
-@item @code{double (*cost_function)(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)}
+@item @code{double (*cost_function)(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)}
 Used by @code{STARPU_PER_ARCH}, must point to functions which take a task, the
 target arch and implementation number (as a mere convenience, since the array
 is already indexed by these), and must return a task duration estimation in
 micro-seconds.
 
-@item @code{size_t (*size_base)(struct starpu_task *, enum
-starpu_perf_archtype arch, unsigned nimpl)}
+@item @code{size_t (*size_base)(struct starpu_task *, enum starpu_perfmodel_archtype arch, unsigned nimpl)}
 Same as in @ref{struct starpu_perfmodel}, but per-arch, in
 case it depends on the architecture-specific implementation.
 
@@ -2658,15 +2672,15 @@ loads a given performance model. The @var{model} structure has to be completely
 unloads the given model which has been previously loaded through the function @code{starpu_perfmodel_load_symbol}
 @end deftypefun
 
-@deftypefun void starpu_perfmodel_debugfilepath ({struct starpu_perfmodel} *@var{model}, {enum starpu_perf_archtype} @var{arch}, char *@var{path}, size_t @var{maxlen}, unsigned nimpl)
+@deftypefun void starpu_perfmodel_debugfilepath ({struct starpu_perfmodel} *@var{model}, {enum starpu_perfmodel_archtype} @var{arch}, char *@var{path}, size_t @var{maxlen}, unsigned nimpl)
 returns the path to the debugging information for the performance model.
 @end deftypefun
 
-@deftypefun void starpu_perfmodel_get_arch_name ({enum starpu_perf_archtype} @var{arch}, char *@var{archname}, size_t @var{maxlen}, unsigned nimpl)
+@deftypefun void starpu_perfmodel_get_arch_name ({enum starpu_perfmodel_archtype} @var{arch}, char *@var{archname}, size_t @var{maxlen}, unsigned nimpl)
 returns the architecture name for @var{arch}.
 @end deftypefun
 
-@deftypefun {enum starpu_perf_archtype} starpu_worker_get_perf_archtype (int @var{workerid})
+@deftypefun {enum starpu_perfmodel_archtype} starpu_worker_get_perf_archtype (int @var{workerid})
 returns the architecture type of a given worker.
 @end deftypefun
 
@@ -2674,7 +2688,7 @@ returns the architecture type of a given worker.
 prints a list of all performance models on @var{output}.
 @end deftypefun
 
-@deftypefun void starpu_perfmodel_print ({struct starpu_perfmodel *}@var{model}, {enum starpu_perf_archtype} @var{arch}, unsigned @var{nimpl}, {char *}@var{parameter}, {uint32_t *}footprint, {FILE *}@var{output})
+@deftypefun void starpu_perfmodel_print ({struct starpu_perfmodel *}@var{model}, {enum starpu_perfmodel_archtype} @var{arch}, unsigned @var{nimpl}, {char *}@var{parameter}, {uint32_t *}footprint, {FILE *}@var{output})
 todo
 @end deftypefun
 
@@ -2694,7 +2708,7 @@ prints the affinity devices on @var{f}.
 prints a description of the topology on @var{f}.
 @end deftypefun
 
-@deftypefun void starpu_perfmodel_update_history ({struct starpu_perfmodel *}@var{model}, {struct starpu_task *}@var{task}, {enum starpu_perf_archtype} @var{arch}, unsigned @var{cpuid}, unsigned @var{nimpl}, double @var{measured});
+@deftypefun void starpu_perfmodel_update_history ({struct starpu_perfmodel *}@var{model}, {struct starpu_task *}@var{task}, {enum starpu_perfmodel_archtype} @var{arch}, unsigned @var{cpuid}, unsigned @var{nimpl}, double @var{measured});
 This feeds the performance model @var{model} with an explicit measurement
 @var{measured}, in addition to measurements done by StarPU itself. This can be
 useful when the application already has an existing set of measurements done
@@ -2711,7 +2725,7 @@ Thie function sets the profiling status. Profiling is activated by passing
 @code{STARPU_PROFILING_DISABLE} disables profiling. Calling this function
 resets all profiling measurements. When profiling is enabled, the
 @code{profiling_info} field of the @code{struct starpu_task} structure points
-to a valid @code{struct starpu_task_profiling_info} structure containing
+to a valid @code{struct starpu_profiling_task_info} structure containing
 information about the execution of the task.
 
 Negative return values indicate an error, otherwise the previous status is
@@ -2722,12 +2736,12 @@ returned.
 Return the current profiling status or a negative value in case there was an error.
 @end deftypefun
 
-@deftypefun void starpu_set_profiling_id (int @var{new_id})
+@deftypefun void starpu_profiling_set_id (int @var{new_id})
 This function sets the ID used for profiling trace filename. It needs to be
 called before starpu_init.
 @end deftypefun
 
-@deftp {Data Type} {struct starpu_task_profiling_info}
+@deftp {Data Type} {struct starpu_profiling_task_info}
 This structure contains information about the execution of a task. It is
 accessible from the @code{.profiling_info} field of the @code{starpu_task}
 structure if profiling was enabled. The different fields are:
@@ -2788,7 +2802,7 @@ Power consumed by the task, only available in the MoviSim
 @end table
 @end deftp
 
-@deftp {Data Type} {struct starpu_worker_profiling_info}
+@deftp {Data Type} {struct starpu_profiling_worker_info}
 This structure contains the profiling information associated to a
 worker. The different fields are:
 
@@ -2820,7 +2834,7 @@ Power consumed by the worker, only available in the MoviSim
 @end table
 @end deftp
 
-@deftypefun int starpu_worker_get_profiling_info (int @var{workerid}, {struct starpu_worker_profiling_info *}@var{worker_info})
+@deftypefun int starpu_profiling_worker_get_info (int @var{workerid}, {struct starpu_profiling_worker_info *}@var{worker_info})
 Get the profiling info associated to the worker identified by @var{workerid},
 and reset the profiling measurements. If the @var{worker_info} argument is
 NULL, only reset the counters associated to worker @var{workerid}.
@@ -2829,7 +2843,7 @@ Upon successful completion, this function returns 0. Otherwise, a negative
 value is returned.
 @end deftypefun
 
-@deftp {Data Type} {struct starpu_bus_profiling_info}
+@deftp {Data Type} {struct starpu_profiling_bus_info}
 The different fields are:
 @table @asis
 @item @code{struct timespec start_time}
@@ -2846,7 +2860,7 @@ Number of transfers during profiling.
 @end table
 @end deftp
 
-@deftypefun int starpu_bus_get_profiling_info (int @var{busid}, {struct starpu_bus_profiling_info *}@var{bus_info})
+@deftypefun int starpu_bus_get_profiling_info (int @var{busid}, {struct starpu_profiling_bus_info *}@var{bus_info})
 Get the profiling info associated to the bus designated by @var{busid},
 and reset the profiling measurements. If @var{bus_info} is NULL, only reset the
 counters.
@@ -2876,19 +2890,19 @@ Returns the time elapsed between @var{start} and @var{end} in microseconds.
 Converts the given timespec @var{ts} into microseconds.
 @end deftypefun
 
-@deftypefun void starpu_bus_profiling_helper_display_summary (void)
+@deftypefun void starpu_profiling_bus_helper_display_summary (void)
 Displays statistics about the bus on stderr if the environment
 variable @code{STARPU_BUS_STATS} is defined. The function is called
 automatically by @code{starpu_shutdown()}.
 @end deftypefun
 
-@deftypefun void starpu_worker_profiling_helper_display_summary (void)
+@deftypefun void starpu_profiling_worker_helper_display_summary (void)
 Displays statistics about the workers on stderr if the environment
 variable @code{STARPU_WORKER_STATS} is defined. The function is called
 automatically by @code{starpu_shutdown()}.
 @end deftypefun
 
-@deftypefun void starpu_memory_display_stats ()
+@deftypefun void starpu_data_display_memory_stats ()
 Display statistics about the current data handles registered within
 StarPU. StarPU must have been configured with the option
 @code{--enable-memory-stats} (@pxref{Memory feedback}).
@@ -3296,6 +3310,58 @@ used to stop it earlier. @code{starpu_fxt_start_profiling} can then be called to
 start recording it again, etc.
 @end deftypefun
 
+@node FFT Support
+@section FFT Support
+
+@deftypefun {void *} starpufft_malloc (size_t @var{n})
+Allocates memory for @var{n} bytes. This is preferred over @code{malloc}, since
+it allocates pinned memory, which allows overlapped transfers.
+@end deftypefun
+
+@deftypefun void starpufft_free (void *@var{p})
+Release memory previously allocated.
+@end deftypefun
+
+@deftypefun {struct starpufft_plan *} starpufft_plan_dft_1d (int @var{n}, int @var{sign}, unsigned @var{flags})
+Initializes a plan for 1D FFT of size @var{n}. @var{sign} can be
+@code{STARPUFFT_FORWARD} or @code{STARPUFFT_INVERSE}. @var{flags} must be 0.
+@end deftypefun
+
+@deftypefun {struct starpufft_plan *} starpufft_plan_dft_2d (int @var{n}, int @var{m}, int @var{sign}, unsigned @var{flags})
+Initializes a plan for 2D FFT of size (@var{n}, @var{m}). @var{sign} can be
+@code{STARPUFFT_FORWARD} or @code{STARPUFFT_INVERSE}. @var{flags} must be 0.
+@end deftypefun
+
+@deftypefun {struct starpu_task *} starpufft_start (starpufft_plan @var{p}, void *@var{in}, void *@var{out})
+Start an FFT previously planned as @var{p}, using @var{in} and @var{out} as
+input and output. This only submits the task and does not wait for it.
+The application should call @code{starpufft_cleanup} to unregister the data.
+@end deftypefun
+
+@deftypefun {struct starpu_task *} starpufft_start_handle (starpufft_plan @var{p}, starpu_data_handle_t @var{in}, starpu_data_handle_t @var{out})
+Start an FFT previously planned as @var{p}, using data handles @var{in} and
+@var{out} as input and output (assumed to be vectors of elements of the expected
+types). This only submits the task and does not wait for it.
+@end deftypefun
+
+@deftypefun void starpufft_execute (starpufft_plan @var{p}, void *@var{in}, void *@var{out})
+Execute an FFT previously planned as @var{p}, using @var{in} and @var{out} as
+input and output. This submits and waits for the task.
+@end deftypefun
+
+@deftypefun void starpufft_execute_handle (starpufft_plan @var{p}, starpu_data_handle_t @var{in}, starpu_data_handle_t @var{out})
+Execute an FFT previously planned as @var{p}, using data handles @var{in} and
+@var{out} as input and output (assumed to be vectors of elements of the expected
+types). This submits and waits for the task.
+@end deftypefun
+
+@deftypefun void starpufft_cleanup (starpufft_plan @var{p})
+Releases data for plan @var{p}, in the @code{starpufft_start} case.
+@end deftypefun
+
+@deftypefun void starpufft_destroy_plan (starpufft_plan @var{p})
+Destroys plan @var{p}, i.e. release all CPU (fftw) and GPU (cufft) resources.
+@end deftypefun
 
 @node MPI
 @section MPI
@@ -3345,6 +3411,16 @@ to the world size. Communications statistics must be enabled
 (@pxref{STARPU_COMM_STATS}).
 @end deftypefun
 
+@deftypefun void starpu_mpi_set_communication_tag (int @var{tag})
+@anchor{starpu_mpi_set_communication_tag}
+Tell StarPU-MPI which MPI tag to use for all its communications.
+@end deftypefun
+
+@deftypefun int starpu_mpi_get_communication_tag (void)
+@anchor{starpu_mpi_get_communication_tag}
+Returns the MPI tag which will be used for all StarPU-MPI communications.
+@end deftypefun
+
 @node Communication
 @subsection Communication
 
@@ -3475,6 +3551,10 @@ to it.
 Returns the last value set by @code{starpu_data_set_rank}.
 @end deftypefun
 
+@deftypefun starpu_data_handle_t starpu_data_get_data_handle_from_tag (int @var{tag})
+Returns the data handle associated to the MPI tag @var{tag}, or NULL if there is none.
+@end deftypefun
+
 @defmac STARPU_EXECUTE_ON_NODE
 this macro is used when calling @code{starpu_mpi_insert_task}, and
 must be followed by an integer value which specifies the node on which
@@ -3601,11 +3681,11 @@ This function mustn't be called if @var{bundle} is already closed and/or @var{ta
 Inform the runtime that the user won't modify @var{bundle} anymore, it means no more inserting or removing task. Thus the runtime can destroy it when possible.
 @end deftypefun
 
-@deftypefun double starpu_task_bundle_expected_length (starpu_task_bundle_t @var{bundle}, {enum starpu_perf_archtype} @var{arch}, unsigned @var{nimpl})
+@deftypefun double starpu_task_bundle_expected_length (starpu_task_bundle_t @var{bundle}, {enum starpu_perfmodel_archtype} @var{arch}, unsigned @var{nimpl})
 Return the expected duration of the entire task bundle in µs.
 @end deftypefun
 
-@deftypefun double starpu_task_bundle_expected_power (starpu_task_bundle_t @var{bundle}, enum starpu_perf_archtype @var{arch}, unsigned @var{nimpl})
+@deftypefun double starpu_task_bundle_expected_power (starpu_task_bundle_t @var{bundle}, enum starpu_perfmodel_archtype @var{arch}, unsigned @var{nimpl})
 Return the expected power consumption of the entire task bundle in J.
 @end deftypefun
 
@@ -4010,15 +4090,15 @@ Check if the worker specified by workerid can execute the codelet. Schedulers ne
 Return the current date in µs
 @end deftypefun
 
-@deftypefun uint32_t starpu_task_footprint ({struct starpu_perfmodel *}@var{model}, {struct starpu_task *} @var{task}, {enum starpu_perf_archtype} @var{arch}, unsigned @var{nimpl})
+@deftypefun uint32_t starpu_task_footprint ({struct starpu_perfmodel *}@var{model}, {struct starpu_task *} @var{task}, {enum starpu_perfmodel_archtype} @var{arch}, unsigned @var{nimpl})
 Returns the footprint for a given task
 @end deftypefun
 
-@deftypefun double starpu_task_expected_length ({struct starpu_task *}@var{task}, {enum starpu_perf_archtype} @var{arch}, unsigned @var{nimpl})
+@deftypefun double starpu_task_expected_length ({struct starpu_task *}@var{task}, {enum starpu_perfmodel_archtype} @var{arch}, unsigned @var{nimpl})
 Returns expected task duration in µs
 @end deftypefun
 
-@deftypefun double starpu_worker_get_relative_speedup ({enum starpu_perf_archtype} @var{perf_archtype})
+@deftypefun double starpu_worker_get_relative_speedup ({enum starpu_perfmodel_archtype} @var{perf_archtype})
 Returns an estimated speedup factor relative to CPU speed
 @end deftypefun
 
@@ -4026,15 +4106,15 @@ Returns an estimated speedup factor relative to CPU speed
 Returns expected data transfer time in µs
 @end deftypefun
 
-@deftypefun double starpu_data_expected_transfer_time (starpu_data_handle_t @var{handle}, unsigned @var{memory_node}, {enum starpu_access_mode} @var{mode})
+@deftypefun double starpu_data_expected_transfer_time (starpu_data_handle_t @var{handle}, unsigned @var{memory_node}, {enum starpu_data_access_mode} @var{mode})
 Predict the transfer time (in µs) to move a handle to a memory node
 @end deftypefun
 
-@deftypefun double starpu_task_expected_power ({struct starpu_task *}@var{task}, {enum starpu_perf_archtype} @var{arch}, unsigned @var{nimpl})
+@deftypefun double starpu_task_expected_power ({struct starpu_task *}@var{task}, {enum starpu_perfmodel_archtype} @var{arch}, unsigned @var{nimpl})
 Returns expected power consumption in J
 @end deftypefun
 
-@deftypefun double starpu_task_expected_conversion_time ({struct starpu_task *}@var{task}, {enum starpu_perf_archtype} @var{arch}, unsigned {nimpl})
+@deftypefun double starpu_task_expected_conversion_time ({struct starpu_task *}@var{task}, {enum starpu_perfmodel_archtype} @var{arch}, unsigned @var{nimpl})
 Returns expected conversion time in ms (multiformat interface only)
 @end deftypefun
 

+ 6 - 55
doc/chapters/fft-support.texi

@@ -9,10 +9,10 @@
 StarPU provides @code{libstarpufft}, a library whose design is very similar to
 both fftw and cufft, the difference being that it takes benefit from both CPUs
 and GPUs. It should however be noted that GPUs do not have the same precision as
-CPUs, so the results may different by a negligible amount
+CPUs, so the results may differ by a negligible amount.
 
-float, double and long double precisions are available, with the fftw naming
-convention:
+Different precisions are available, namely float, double and long
+double precisions, with the following fftw naming conventions:
 
 @enumerate
 @item double precision structures and functions are named e.g. @code{starpufft_execute}
@@ -20,7 +20,7 @@ convention:
 @item long double precision structures and functions are named e.g. @code{starpufftl_execute}
 @end enumerate
 
-The documentation below uses names for double precision, replace
+The documentation below is given with names for double precision, replace
 @code{starpufft_} with @code{starpufftf_} or @code{starpufftl_} as appropriate.
 
 Only complex numbers are supported at the moment.
@@ -42,6 +42,8 @@ plan. @code{starpufft_start_handle} is preferrable since it does not wait for
 the task completion, and thus permits to enqueue a series of tasks.
 @end enumerate
 
+All functions are defined in @ref{FFT Support}.
+
 @section Compilation
 
 The flags required to compile or link against the FFT library are accessible
@@ -54,54 +56,3 @@ $ pkg-config --libs starpufft-1.0    # options for the linker
 
 Also pass the @code{--static} option if the application is to be linked statically.
 
-@section Initialisation
-
-@deftypefun {void *} starpufft_malloc (size_t @var{n})
-Allocates memory for @var{n} bytes. This is preferred over @code{malloc}, since
-it allocates pinned memory, which allows overlapped transfers.
-@end deftypefun
-
-@deftypefun {void *} starpufft_free (void *@var{p})
-Release memory previously allocated.
-@end deftypefun
-
-@deftypefun {struct starpufft_plan *} starpufft_plan_dft_1d (int @var{n}, int @var{sign}, unsigned @var{flags})
-Initializes a plan for 1D FFT of size @var{n}. @var{sign} can be
-@code{STARPUFFT_FORWARD} or @code{STARPUFFT_INVERSE}. @var{flags} must be 0.
-@end deftypefun
-
-@deftypefun {struct starpufft_plan *} starpufft_plan_dft_2d (int @var{n}, int @var{m}, int @var{sign}, unsigned @var{flags})
-Initializes a plan for 2D FFT of size (@var{n}, @var{m}). @var{sign} can be
-@code{STARPUFFT_FORWARD} or @code{STARPUFFT_INVERSE}. @var{flags} must be 0.
-@end deftypefun
-
-@deftypefun {struct starpu_task *} starpufft_start (starpufft_plan @var{p}, void *@var{in}, void *@var{out})
-Start an FFT previously planned as @var{p}, using @var{in} and @var{out} as
-input and output. This only submits the task and does not wait for it.
-The application should call @code{starpufft_cleanup} to unregister the data.
-@end deftypefun
-
-@deftypefun {struct starpu_task *} starpufft_start_handle (starpufft_plan @var{p}, starpu_data_handle_t @var{in}, starpu_data_handle_t @var{out})
-Start an FFT previously planned as @var{p}, using data handles @var{in} and
-@var{out} as input and output (assumed to be vectors of elements of the expected
-types). This only submits the task and does not wait for it.
-@end deftypefun
-
-@deftypefun void starpufft_execute (starpufft_plan @var{p}, void *@var{in}, void *@var{out})
-Execute an FFT previously planned as @var{p}, using @var{in} and @var{out} as
-input and output. This submits and waits for the task.
-@end deftypefun
-
-@deftypefun void starpufft_execute_handle (starpufft_plan @var{p}, starpu_data_handle_t @var{in}, starpu_data_handle_t @var{out})
-Execute an FFT previously planned as @var{p}, using data handles @var{in} and
-@var{out} as input and output (assumed to be vectors of elements of the expected
-types). This submits and waits for the task.
-@end deftypefun
-
-@deftypefun void starpufft_cleanup (starpufft_plan @var{p})
-Releases data for plan @var{p}, in the @code{starpufft_start} case.
-@end deftypefun
-
-@deftypefun void starpufft_destroy_plan (starpufft_plan @var{p})
-Destroys plan @var{p}, i.e. release all CPU (fftw) and GPU (cufft) resources.
-@end deftypefun

+ 1 - 1
doc/chapters/installing.texi

@@ -1,4 +1,4 @@
-w@c -*-texinfo-*-
+@c -*-texinfo-*-
 
 @c This file is part of the StarPU Handbook.
 @c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1

+ 26 - 1
doc/chapters/mpi-support.texi

@@ -148,7 +148,23 @@ creation of a StarPU-MPI request, the function
 @code{starpu_data_acquire_cb} is then called to asynchronously request
 StarPU to fetch the data in main memory; when the data is available in
 main memory, a StarPU-MPI function is called to put the new request in
-the list of the ready requests.
+the list of the ready requests if it is a send request, or in a
+hashmap if it is a receive request.
+
+Internally, all MPI communications submitted by StarPU use a unique
+tag which has a default value, and can be accessed with the functions
+@ref{starpu_mpi_get_communication_tag} and
+@ref{starpu_mpi_set_communication_tag}.
+
+The matching of tags with corresponding requests is done within StarPU-MPI.
+To handle this, every communication is a double communication based on an
+envelope + data system. Before any data is sent, an envelope describing
+that data (particularly its tag) is sent first, so that the receiver can
+look up the matching pending receive request in the hashmap and submit it
+to receive the data correctly.
+
+To this end, the StarPU-MPI progression thread keeps a permanently submitted
+request dedicated to receiving incoming envelopes from all sources.
 
 The StarPU-MPI progression thread regularly polls this list of ready
 requests. For each new ready request, the appropriate function is
@@ -162,6 +178,15 @@ requests. For each detached request, it regularly tests the completion
 of the MPI request by calling @code{MPI_Test}. On completion, the data
 handle is released, and if a callback was defined, it is called.
 
+Finally, the StarPU-MPI progression thread checks whether an envelope has
+arrived. If so, it checks whether the corresponding receive has already
+been submitted by the application. If it has, it submits the request
+just as it does with those on the list of ready requests.
+If it has not, it allocates a temporary handle to store the data that
+will arrive just afterwards, so that when the application eventually submits
+the corresponding receive request, this temporary handle is copied
+into the application's handle instead of submitting a new StarPU-MPI request.
+
 @ref{Communication} gives the list of all the point to point
 communications defined in StarPU-MPI.
 

+ 14 - 6
doc/chapters/perf-feedback.texi

@@ -67,13 +67,13 @@ More details about the performance monitoring API are available in section
 @node Task feedback
 @subsection Per-task feedback
 
-If profiling is enabled, a pointer to a @code{starpu_task_profiling_info}
-structure is put in the @code{.profiling_info} field of the @code{starpu_task}
+If profiling is enabled, a pointer to a @code{struct starpu_profiling_task_info}
+is put in the @code{.profiling_info} field of the @code{starpu_task}
 structure when a task terminates.
 This structure is automatically destroyed when the task structure is destroyed,
 either automatically or by calling @code{starpu_task_destroy}.
 
-The @code{starpu_task_profiling_info} structure indicates the date when the
+The @code{struct starpu_profiling_task_info} indicates the date when the
 task was submitted (@code{submit_time}), started (@code{start_time}), and
 terminated (@code{end_time}), relative to the initialization of
 StarPU with @code{starpu_init}. It also specifies the identifier of the worker
@@ -98,8 +98,8 @@ This array is not reinitialized when profiling is enabled or disabled.
 @node Worker feedback
 @subsection Per-worker feedback
 
-The second argument returned by the @code{starpu_worker_get_profiling_info}
-function is a @code{starpu_worker_profiling_info} structure that gives
+The second argument returned by the @code{starpu_profiling_worker_get_info}
+function is a @code{struct starpu_profiling_worker_info} that gives
 statistics about the specified worker. This structure specifies when StarPU
 started collecting profiling information for that worker (@code{start_time}),
 the duration of the profiling measurement interval (@code{total_time}), the
@@ -110,7 +110,7 @@ These values give an estimation of the proportion of time spent do real work,
 and the time spent either sleeping because there are not enough executable
 tasks or simply wasted in pure StarPU overhead.
 
-Calling @code{starpu_worker_get_profiling_info} resets the profiling
+Calling @code{starpu_profiling_worker_get_info} resets the profiling
 information associated to a worker.
 
 When an FxT trace is generated (see @ref{Generating traces}), it is also
@@ -319,6 +319,14 @@ $ starpu_fxt_tool -i filename1 -i filename2
 By default, all tasks are displayed using a green color. To display tasks with
 varying colors, pass option @code{-c} to @code{starpu_fxt_tool}.
 
+Traces can also be inspected by hand by using the @code{fxt_print} tool, for instance:
+
+@smallexample
+$ fxt_print -o -f filename
+@end smallexample
+
+Timings are in nanoseconds (while timings as seen in @code{vite} are in milliseconds).
+
 @node DAG
 @subsection Creating a DAG with graphviz
 

+ 10 - 3
examples/Makefile.am

@@ -46,8 +46,9 @@ EXTRA_DIST = 					\
 	lu/xlu_implicit_pivot.c			\
 	lu/xlu_kernels.c			\
 	lu/lu_example.c				\
-	sched_ctx_utils/sched_ctx_utils.c		\
-	sched_ctx/sched_ctx.c		\
+	sched_ctx_utils/sched_ctx_utils.c			\
+	sched_ctx/sched_ctx.c					\
+	sched_ctx/parallel_code.c				\
 	incrementer/incrementer_kernels_opencl_kernel.cl 	\
 	basic_examples/variable_kernels_opencl_kernel.cl	\
 	matvecmult/matvecmult_kernel.cl				\
@@ -181,6 +182,7 @@ examplebin_PROGRAMS +=				\
 	matvecmult/matvecmult			\
 	profiling/profiling			\
 	sched_ctx/sched_ctx			\
+	sched_ctx/parallel_code			\
 	reductions/dot_product			\
 	reductions/minmax_reduction		\
 	mandelbrot/mandelbrot			\
@@ -253,7 +255,8 @@ STARPU_EXAMPLES +=				\
 	matvecmult/matvecmult			\
 	profiling/profiling			\
 	scheduler/dummy_sched			\
-	sched_ctx/sched_ctx				\
+	sched_ctx/sched_ctx			\
+	sched_ctx/parallel_code			\
 	reductions/dot_product			\
 	reductions/minmax_reduction
 
@@ -886,6 +889,10 @@ examplebin_PROGRAMS +=		\
 
 openmp_vector_scal_omp_CFLAGS = \
 	$(AM_CFLAGS) -fopenmp
+
+sched_ctx_parallel_code_CFLAGS = \
+	$(AM_CFLAGS) -fopenmp
+
 endif
 
 showcheck:

+ 1 - 1
examples/basic_examples/dynamic_handles.c

@@ -79,7 +79,7 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	dummy_big_cl.dyn_modes = malloc(dummy_big_cl.nbuffers * sizeof(enum starpu_access_mode));
+	dummy_big_cl.dyn_modes = malloc(dummy_big_cl.nbuffers * sizeof(enum starpu_data_access_mode));
 	for(i=0 ; i<dummy_big_cl.nbuffers ; i++)
 	     dummy_big_cl.dyn_modes[i] = STARPU_RW;
 

+ 1 - 1
examples/cg/cg_kernels.c

@@ -46,7 +46,7 @@ static void print_matrix_from_descr(unsigned nx, unsigned ny, unsigned ld, TYPE
 
 static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
 {
-	enum starpu_archtype type = starpu_worker_get_type(workerid);
+	enum starpu_worker_archtype type = starpu_worker_get_type(workerid);
 	if (type == STARPU_CPU_WORKER || type == STARPU_OPENCL_WORKER)
 		return 1;
 

+ 6 - 6
examples/cholesky/cholesky_models.c

@@ -36,7 +36,7 @@
 #define PERTURBATE(a)	(a)
 #endif
 
-static double cpu_chol_task_11_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+static double cpu_chol_task_11_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -51,7 +51,7 @@ static double cpu_chol_task_11_cost(struct starpu_task *task, enum starpu_perf_a
 	return PERTURBATE(cost);
 }
 
-static double cuda_chol_task_11_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+static double cuda_chol_task_11_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -66,7 +66,7 @@ static double cuda_chol_task_11_cost(struct starpu_task *task, enum starpu_perf_
 	return PERTURBATE(cost);
 }
 
-static double cpu_chol_task_21_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+static double cpu_chol_task_21_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -81,7 +81,7 @@ static double cpu_chol_task_21_cost(struct starpu_task *task, enum starpu_perf_a
 	return PERTURBATE(cost);
 }
 
-static double cuda_chol_task_21_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+static double cuda_chol_task_21_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -96,7 +96,7 @@ static double cuda_chol_task_21_cost(struct starpu_task *task, enum starpu_perf_
 	return PERTURBATE(cost);
 }
 
-static double cpu_chol_task_22_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+static double cpu_chol_task_22_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -111,7 +111,7 @@ static double cpu_chol_task_22_cost(struct starpu_task *task, enum starpu_perf_a
 	return PERTURBATE(cost);
 }
 
-static double cuda_chol_task_22_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+static double cuda_chol_task_22_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 

+ 1 - 1
examples/filters/custom_mf/custom_interface.c

@@ -237,7 +237,7 @@ static size_t custom_interface_get_size(starpu_data_handle_t handle)
 
 static uint32_t footprint_custom_interface_crc32(starpu_data_handle_t handle)
 {
-	return starpu_crc32_be(custom_get_nx(handle), 0);
+	return starpu_hash_crc32c_be(custom_get_nx(handle), 0);
 }
 
 static void display_custom_interface(starpu_data_handle_t handle, FILE *f)

+ 8 - 8
examples/heat/lu_kernels_model.c

@@ -102,7 +102,7 @@ double task_22_cost(struct starpu_task *task, unsigned nimpl)
  */
 
 
-double task_11_cost_cuda(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+double task_11_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -114,7 +114,7 @@ double task_11_cost_cuda(struct starpu_task *task, enum starpu_perf_archtype arc
 	return PERTURBATE(cost);
 }
 
-double task_12_cost_cuda(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+double task_12_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -127,7 +127,7 @@ double task_12_cost_cuda(struct starpu_task *task, enum starpu_perf_archtype arc
 }
 
 
-double task_21_cost_cuda(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+double task_21_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -141,7 +141,7 @@ double task_21_cost_cuda(struct starpu_task *task, enum starpu_perf_archtype arc
 
 
 
-double task_22_cost_cuda(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+double task_22_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 {
 	uint32_t nx, ny, nz;
 
@@ -161,7 +161,7 @@ double task_22_cost_cuda(struct starpu_task *task, enum starpu_perf_archtype arc
  *
  */
 
-double task_11_cost_cpu(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+double task_11_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -173,7 +173,7 @@ double task_11_cost_cpu(struct starpu_task *task, enum starpu_perf_archtype arch
 	return PERTURBATE(cost);
 }
 
-double task_12_cost_cpu(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+double task_12_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -186,7 +186,7 @@ double task_12_cost_cpu(struct starpu_task *task, enum starpu_perf_archtype arch
 }
 
 
-double task_21_cost_cpu(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+double task_21_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -200,7 +200,7 @@ double task_21_cost_cpu(struct starpu_task *task, enum starpu_perf_archtype arch
 
 
 
-double task_22_cost_cpu(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+double task_22_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 {
 	uint32_t nx, ny, nz;
 

+ 1 - 1
examples/interface/complex_interface.c

@@ -105,7 +105,7 @@ static size_t complex_get_size(starpu_data_handle_t handle)
 
 static uint32_t complex_footprint(starpu_data_handle_t handle)
 {
-	return starpu_crc32_be(starpu_complex_get_nx(handle), 0);
+	return starpu_hash_crc32c_be(starpu_complex_get_nx(handle), 0);
 }
 
 static void *complex_handle_to_pointer(starpu_data_handle_t handle, unsigned node)

+ 1 - 1
examples/lu/lu_example.c

@@ -369,7 +369,7 @@ int main(int argc, char **argv)
 	{
 		FPRINTF(stderr, "Setting profile\n");
 		starpu_profiling_status_set(STARPU_PROFILING_DISABLE);
-		starpu_bus_profiling_helper_display_summary();
+		starpu_profiling_bus_helper_display_summary();
 	}
 
 	if (bound)

+ 3 - 3
examples/profiling/profiling.c

@@ -89,7 +89,7 @@ int main(int argc, char **argv)
 	for (i = 0; i < niter; i++)
 	{
 		struct starpu_task *task = tasks[i];
-		struct starpu_task_profiling_info *info = task->profiling_info;
+		struct starpu_profiling_task_info *info = task->profiling_info;
 
 		/* How much time did it take before the task started ? */
 		delay_sum += starpu_timing_timespec_delay_us(&info->submit_time, &info->start_time);
@@ -110,8 +110,8 @@ int main(int argc, char **argv)
 	unsigned worker;
 	for (worker = 0; worker < starpu_worker_get_count(); worker++)
 	{
-		struct starpu_worker_profiling_info worker_info;
-		ret = starpu_worker_get_profiling_info(worker, &worker_info);
+		struct starpu_profiling_worker_info worker_info;
+		ret = starpu_profiling_worker_get_info(worker, &worker_info);
 		STARPU_ASSERT(!ret);
 
 		double total_time = starpu_timing_timespec_to_us(&worker_info.total_time);

+ 1 - 1
examples/reductions/dot_product.c

@@ -44,7 +44,7 @@ static starpu_data_handle_t _dot_handle;
 
 static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
 {
-	enum starpu_archtype type = starpu_worker_get_type(workerid);
+	enum starpu_worker_archtype type = starpu_worker_get_type(workerid);
 	if (type == STARPU_CPU_WORKER || type == STARPU_OPENCL_WORKER)
 		return 1;
 

+ 126 - 0
examples/sched_ctx/parallel_code.c

@@ -0,0 +1,126 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+#ifdef STARPU_QUICK_CHECK
+#define NTASKS 64
+#else
+#define NTASKS 1000
+#endif
+
+int tasks_executed = 0;
+starpu_pthread_mutex_t mut;
+/* Kernel registered in sched_ctx_codelet: bump the executed-task counter while holding the mutex. */
+static void sched_ctx_func(void *descr[] __attribute__ ((unused)), void *arg __attribute__ ((unused)))
+{
+	starpu_pthread_mutex_lock(&mut);
+	tasks_executed += 1;
+	starpu_pthread_mutex_unlock(&mut);
+}
+/* Codelet with no data buffers and no perfmodel; the same kernel is registered for CPU, CUDA and OpenCL. */
+static struct starpu_codelet sched_ctx_codelet =
+{
+	.name = "sched_ctx",
+	.nbuffers = 0,
+	.model = NULL,
+	.cpu_funcs = {sched_ctx_func, NULL},
+	.cuda_funcs = {sched_ctx_func, NULL},
+	.opencl_funcs = {sched_ctx_func, NULL}
+};
+/* Count NTASKS loop iterations with an OpenMP team of nprocs threads and return the total. */
+int parallel_code(int nprocs)
+{
+	int i;
+	int tasks = 0;
+	/* reduction: a bare tasks++ on the shared counter would be a data race */
+#pragma omp parallel for num_threads(nprocs) reduction(+:tasks)
+	for (i = 0; i < NTASKS; i++)
+		tasks++;
+	return tasks;
+}
+/* Split the CPU workers into two contexts: ctx1 runs StarPU tasks, ctx2 runs the OpenMP loop. */
+int main(int argc, char **argv)
+{
+	int ntasks = NTASKS;
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_pthread_mutex_init(&mut, NULL);
+	int nprocs1 = 1;
+	int nprocs2 = 1;
+	int procs1[STARPU_NMAXWORKERS], procs2[STARPU_NMAXWORKERS]; /* a fixed [20] overflowed on larger machines */
+	procs1[0] = 0;
+	procs2[0] = 0;
+
+#ifdef STARPU_USE_CPU
+	unsigned ncpus =  starpu_cpu_worker_get_count();
+	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
+	/* ctx1 takes the first half of the CPU worker ids, ctx2 the following nprocs1 ids */
+	nprocs1 = ncpus/2;
+	nprocs2 =  nprocs1;
+	int j, k = 0;
+	for(j = nprocs1; j < nprocs1+nprocs2; j++)
+		procs2[k++] = procs1[j]; /* the worker id itself, not its index in procs1 */
+#endif
+
+	/*create contexts however you want*/
+	unsigned sched_ctx1 = starpu_sched_ctx_create("dmda", procs1, nprocs1, "ctx1");
+	unsigned sched_ctx2 = starpu_sched_ctx_create("dmda", procs2, nprocs2, "ctx2");
+
+	/*indicate what to do with the resources when context 2 finishes (it depends on your application)*/
+	starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);
+
+	int i;
+	for (i = 0; i < ntasks; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &sched_ctx_codelet;
+		task->cl_arg = NULL;
+
+		/*submit tasks to context*/
+		ret = starpu_task_submit_to_ctx(task,sched_ctx1);
+
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	/* tell starpu when you finished submitting tasks to this context
+	   in order to allow moving resources from this context to the inheritor one
+	   when its corresponding tasks finished executing */
+
+	starpu_sched_ctx_finished_submit(sched_ctx1);
+
+	/* execute an openmp code */
+	int ret_ntasks = (int)starpu_sched_ctx_exec_parallel_code((void*)parallel_code, (void*)nprocs2, sched_ctx2);
+	starpu_sched_ctx_finished_submit(sched_ctx2);
+
+	/* wait for all tasks at the end*/
+	starpu_task_wait_for_all();
+
+	starpu_sched_ctx_delete(sched_ctx1);
+	starpu_sched_ctx_delete(sched_ctx2);
+	printf("ctx%u: tasks starpu executed %d out of %d\n", sched_ctx1, tasks_executed, ntasks); /* ids are unsigned */
+	printf("ctx%u: tasks openmp executed %d out of %d\n", sched_ctx2, ret_ntasks, NTASKS);
+	starpu_shutdown();
+
+	return 0;
+}

+ 3 - 9
gcc-plugin/examples/stencil5.c

@@ -34,15 +34,9 @@ static void stencil5_cpu(float *xy, const float *xm1y, const float *xp1y, const
 	*xy = (*xy + *xm1y + *xp1y + *xym1 + *xyp1) / 5;
 }
 
-#ifdef STARPU_QUICK_CHECK
-#  define NITER_DEF	5
-#  define X         	3
-#  define Y         	3
-#else
-#  define NITER_DEF	500
-#  define X         	20
-#  define Y         	20
-#endif
+#define NITER_DEF	10
+#define X         	4
+#define Y         	4
 
 int display = 0;
 int niter = NITER_DEF;

+ 1 - 1
gcc-plugin/include/starpu-gcc/tasks.h

@@ -44,7 +44,7 @@ extern tree task_implementation_list (const_tree task_decl);
 extern tree task_pointer_parameter_types (const_tree task_decl);
 extern int task_where (const_tree task_decl);
 extern tree task_implementation_wrapper (const_tree task_impl);
-extern enum starpu_access_mode access_mode (const_tree type);
+extern enum starpu_data_access_mode access_mode (const_tree type);
 extern bool output_type_p (const_tree type);
 
 extern tree codelet_type (void);

+ 1 - 1
gcc-plugin/src/tasks.c

@@ -234,7 +234,7 @@ codelet_type (void)
 
 /* Return the access mode for POINTER, a PARM_DECL of a task.  */
 
-enum starpu_access_mode
+enum starpu_data_access_mode
 access_mode (const_tree type)
 {
   gcc_assert (POINTER_TYPE_P (type));

+ 2 - 2
gcc-plugin/tests/mocks.h

@@ -264,7 +264,7 @@ starpu_data_lookup (const void *ptr)
 }
 
 void *
-starpu_handle_get_local_ptr (starpu_data_handle_t handle)
+starpu_data_get_local_ptr (starpu_data_handle_t handle)
 {
   return handle_to_pointer (handle);
 }
@@ -347,7 +347,7 @@ struct data_acquire_arguments expected_acquire_arguments;
 struct data_release_arguments expected_release_arguments;
 
 int
-starpu_data_acquire (starpu_data_handle_t handle, enum starpu_access_mode mode)
+starpu_data_acquire (starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 {
   /* XXX: Currently only `STARPU_RW'.  */
   assert (mode == STARPU_RW);

+ 11 - 8
include/starpu_data.h

@@ -28,7 +28,7 @@ extern "C"
 struct _starpu_data_state;
 typedef struct _starpu_data_state* starpu_data_handle_t;
 
-enum starpu_access_mode
+enum starpu_data_access_mode
 {
 	STARPU_NONE=0,
 	STARPU_R=(1<<0),
@@ -38,10 +38,10 @@ enum starpu_access_mode
 	STARPU_REDUX=(1<<3)
 };
 
-struct starpu_buffer_descr
+struct starpu_data_descr
 {
 	starpu_data_handle_t handle;
-	enum starpu_access_mode mode;
+	enum starpu_data_access_mode mode;
 };
 
 struct starpu_data_interface_ops;
@@ -65,10 +65,10 @@ void starpu_data_invalidate_submit(starpu_data_handle_t handle);
 
 void starpu_data_advise_as_important(starpu_data_handle_t handle, unsigned is_important);
 
-int starpu_data_acquire(starpu_data_handle_t handle, enum starpu_access_mode mode);
-int starpu_data_acquire_on_node(starpu_data_handle_t handle, unsigned node, enum starpu_access_mode mode);
-int starpu_data_acquire_cb(starpu_data_handle_t handle, enum starpu_access_mode mode, void (*callback)(void *), void *arg);
-int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, unsigned node, enum starpu_access_mode mode, void (*callback)(void *), void *arg);
+int starpu_data_acquire(starpu_data_handle_t handle, enum starpu_data_access_mode mode);
+int starpu_data_acquire_on_node(starpu_data_handle_t handle, unsigned node, enum starpu_data_access_mode mode);
+int starpu_data_acquire_cb(starpu_data_handle_t handle, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg);
+int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, unsigned node, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg);
 #ifdef __GCC__
 #  define STARPU_DATA_ACQUIRE_CB(handle, mode, code) do \
 	{ \						\
@@ -85,7 +85,7 @@ int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, unsigned node, e
 void starpu_data_release(starpu_data_handle_t handle);
 void starpu_data_release_on_node(starpu_data_handle_t handle, unsigned node);
 
-void starpu_memory_display_stats();
+void starpu_data_display_memory_stats();
 
 /* XXX These macros are provided to avoid breaking old codes. But consider
  * these function names as deprecated. */
@@ -117,6 +117,7 @@ enum starpu_node_kind starpu_node_get_kind(unsigned node);
 void starpu_data_set_wt_mask(starpu_data_handle_t handle, uint32_t wt_mask);
 
 void starpu_data_set_sequential_consistency_flag(starpu_data_handle_t handle, unsigned flag);
+unsigned starpu_data_get_sequential_consistency_flag(starpu_data_handle_t handle);
 unsigned starpu_data_get_default_sequential_consistency_flag(void);
 void starpu_data_set_default_sequential_consistency_flag(unsigned flag);
 
@@ -132,6 +133,8 @@ int starpu_data_get_rank(starpu_data_handle_t handle);
 
 int starpu_data_set_tag(starpu_data_handle_t handle, int tag);
 int starpu_data_get_tag(starpu_data_handle_t handle);
+starpu_data_handle_t starpu_data_get_data_handle_from_tag(int tag);
+struct starpu_data_interface_ops* starpu_data_get_interface_ops(starpu_data_handle_t handle);
 
 unsigned starpu_data_test_if_allocated_on_node(starpu_data_handle_t handle, unsigned memory_node);
 

+ 6 - 6
include/starpu_data_interfaces.h

@@ -142,11 +142,11 @@ void starpu_data_register_same(starpu_data_handle_t *handledst, starpu_data_hand
 /* Return the pointer associated with HANDLE on node NODE or NULL if HANDLE's
  * interface does not support this operation or data for this handle is not
  * allocated on that node. */
-void *starpu_handle_to_pointer(starpu_data_handle_t handle, unsigned node);
+void *starpu_data_handle_to_pointer(starpu_data_handle_t handle, unsigned node);
 
 /* Return the local pointer associated with HANDLE or NULL if HANDLE's
  * interface does not have data allocated locally */
-void *starpu_handle_get_local_ptr(starpu_data_handle_t handle);
+void *starpu_data_get_local_ptr(starpu_data_handle_t handle);
 
 /* "node" means memory node: 0 for main RAM, then 1, 2, etc. for various GPUs,
  * etc.
@@ -424,11 +424,11 @@ void starpu_multiformat_data_register(starpu_data_handle_t *handle, unsigned hom
 #define STARPU_MULTIFORMAT_GET_OPENCL_PTR(interface) (((struct starpu_multiformat_interface *)(interface))->opencl_ptr)
 #define STARPU_MULTIFORMAT_GET_NX(interface)  (((struct starpu_multiformat_interface *)(interface))->nx)
 
-enum starpu_data_interface_id starpu_handle_get_interface_id(starpu_data_handle_t handle);
+enum starpu_data_interface_id starpu_data_get_interface_id(starpu_data_handle_t handle);
 
-int starpu_handle_pack_data(starpu_data_handle_t handle, void **ptr, starpu_ssize_t *count);
-int starpu_handle_unpack_data(starpu_data_handle_t handle, void *ptr, size_t count);
-size_t starpu_handle_get_size(starpu_data_handle_t handle);
+int starpu_data_pack(starpu_data_handle_t handle, void **ptr, starpu_ssize_t *count);
+int starpu_data_unpack(starpu_data_handle_t handle, void *ptr, size_t count);
+size_t starpu_data_get_size(starpu_data_handle_t handle);
 
 /* Lookup a ram pointer into a StarPU handle */
 extern starpu_data_handle_t starpu_data_lookup(const void *ptr);

+ 27 - 3
include/starpu_deprecated_api.h

@@ -47,13 +47,13 @@ typedef struct starpu_multiformat_interface starpu_multiformat_interface_t;
 #define starpu_sched_policy_s starpu_sched_policy
 #define starpu_data_interface_ops_t starpu_data_interface_ops
 
-typedef struct starpu_buffer_descr starpu_buffer_descr;
+typedef struct starpu_data_descr starpu_buffer_descr;
 typedef struct starpu_codelet starpu_codelet;
 typedef struct starpu_codelet starpu_codelet_t;
-typedef enum starpu_access_mode starpu_access_mode;
+typedef enum starpu_data_access_mode starpu_access_mode;
 
 #define starpu_print_bus_bandwidth     starpu_bus_print_bandwidth
-#define starpu_get_handle_interface_id starpu_handle_get_interface_id
+#define starpu_get_handle_interface_id starpu_data_get_interface_id
 #define starpu_get_current_task        starpu_task_get_current
 #define starpu_unpack_cl_args          starpu_codelet_unpack_args
 #define starpu_pack_cl_args   	       starpu_codelet_pack_args
@@ -90,6 +90,30 @@ typedef enum starpu_access_mode starpu_access_mode;
 
 #define starpu_display_codelet_stats		starpu_codelet_display_stats
 
+#define starpu_access_mode				starpu_data_access_mode /* each line maps an old 1.0 name to its new name */
+#define starpu_buffer_descr				starpu_data_descr
+#define starpu_memory_display_stats			starpu_data_display_memory_stats
+#define starpu_handle_to_pointer			starpu_data_handle_to_pointer
+#define starpu_handle_get_local_ptr			starpu_data_get_local_ptr
+#define starpu_crc32_be_n				starpu_hash_crc32c_be_n
+#define starpu_crc32_be					starpu_hash_crc32c_be
+#define starpu_crc32_string				starpu_hash_crc32c_string
+#define starpu_perf_archtype				starpu_perfmodel_archtype
+#define starpu_history_based_expected_perf		starpu_permodel_history_based_expected_perf
+#define starpu_task_profiling_info			starpu_profiling_task_info
+#define starpu_worker_profiling_info			starpu_profiling_worker_info
+#define starpu_bus_profiling_info			starpu_profiling_bus_info
+#define starpu_set_profiling_id				starpu_profiling_set_id
+#define starpu_worker_get_profiling_info		starpu_profiling_worker_get_info
+#define starpu_bus_profiling_helper_display_summary	starpu_profiling_bus_helper_display_summary
+#define starpu_worker_profiling_helper_display_summary	starpu_profiling_worker_helper_display_summary
+#define starpu_archtype					starpu_worker_archtype
+/* starpu_handle_* data accessors, now named starpu_data_* */
+#define starpu_handle_get_interface_id		starpu_data_get_interface_id
+#define starpu_handle_get_size			starpu_data_get_size
+#define starpu_handle_pack_data			starpu_data_pack
+#define starpu_handle_unpack_data		starpu_data_unpack
+
 #endif /* STARPU_USE_DEPRECATED_ONE_ZERO_API */
 
 #ifdef __cplusplus

+ 1 - 1
include/starpu_driver.h

@@ -30,7 +30,7 @@ extern "C"
 
 struct starpu_driver
 {
-	enum starpu_archtype type;
+	enum starpu_worker_archtype type;
 	union
 	{
 		unsigned cpu_id;

+ 2 - 2
include/starpu_fxt.h

@@ -31,7 +31,7 @@ struct starpu_fxt_codelet_event
 {
 	char symbol[256]; /* name of the codelet */
 	int workerid;
-	enum starpu_perf_archtype archtype;
+	enum starpu_perfmodel_archtype archtype;
 	uint32_t hash;
 	size_t size;
 	float time;
@@ -60,7 +60,7 @@ struct starpu_fxt_options
 	 */
 
 	char worker_names[STARPU_NMAXWORKERS][256];
-	enum starpu_perf_archtype worker_archtypes[STARPU_NMAXWORKERS];
+	enum starpu_perfmodel_archtype worker_archtypes[STARPU_NMAXWORKERS];
 	int nworkers;
 
 	/* In case we want to dump the list of codelets to an external tool */

+ 3 - 3
include/starpu_hash.h

@@ -29,17 +29,17 @@ extern "C"
 /* Compute the CRC of a byte buffer seeded by the inputcrc "current state".
  * The return value should be considered as the new "current state" for future
  * CRC computation. */
-uint32_t starpu_crc32_be_n(void *input, size_t n, uint32_t inputcrc);
+uint32_t starpu_hash_crc32c_be_n(void *input, size_t n, uint32_t inputcrc);
 
 /* Compute the CRC of a 32bit number seeded by the inputcrc "current state".
  * The return value should be considered as the new "current state" for future
  * CRC computation. */
-uint32_t starpu_crc32_be(uint32_t input, uint32_t inputcrc);
+uint32_t starpu_hash_crc32c_be(uint32_t input, uint32_t inputcrc);
 
 /* Compute the CRC of a string seeded by the inputcrc "current state".  The
  * return value should be considered as the new "current state" for future CRC
  * computation. */
-uint32_t starpu_crc32_string(char *str, uint32_t inputcrc);
+uint32_t starpu_hash_crc32c_string(char *str, uint32_t inputcrc);
 
 #ifdef __cplusplus
 }

+ 12 - 12
include/starpu_perfmodel.h

@@ -30,7 +30,7 @@ extern "C"
 #endif
 
 struct starpu_task;
-struct starpu_buffer_descr;
+struct starpu_data_descr;
 
 /*
    it is possible that we have multiple versions of the same kind of workers,
@@ -38,7 +38,7 @@ struct starpu_buffer_descr;
    so we do not use the archtype enum type directly for performance models
 */
 
-enum starpu_perf_archtype
+enum starpu_perfmodel_archtype
 {
 	STARPU_CPU_DEFAULT = 0,
 	/* CPU combined workers between 0 and STARPU_MAXCPUS-1 */
@@ -142,9 +142,9 @@ struct starpu_perfmodel_history_table;
 
 struct starpu_perfmodel_per_arch
 {
-	double (*cost_model)(struct starpu_buffer_descr *t) STARPU_DEPRECATED; /* returns expected duration in µs */
-	double (*cost_function)(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl); /* returns expected duration in µs */
-	size_t (*size_base)(struct starpu_task *, enum starpu_perf_archtype arch, unsigned nimpl);
+	double (*cost_model)(struct starpu_data_descr *t) STARPU_DEPRECATED; /* returns expected duration in µs */
+	double (*cost_function)(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl); /* returns expected duration in µs */
+	size_t (*size_base)(struct starpu_task *, enum starpu_perfmodel_archtype arch, unsigned nimpl);
 
 	/* internal variables */
 	struct starpu_perfmodel_history_table *history;
@@ -170,7 +170,7 @@ struct starpu_perfmodel
 	enum starpu_perfmodel_type type;
 
 	/* single cost model (STARPU_COMMON), returns expected duration in µs */
-	double (*cost_model)(struct starpu_buffer_descr *) STARPU_DEPRECATED;
+	double (*cost_model)(struct starpu_data_descr *) STARPU_DEPRECATED;
 	double (*cost_function)(struct starpu_task *, unsigned nimpl);
 
 	size_t (*size_base)(struct starpu_task *, unsigned nimpl);
@@ -188,22 +188,22 @@ struct starpu_perfmodel
 	starpu_pthread_rwlock_t model_rwlock;
 };
 
-enum starpu_perf_archtype starpu_worker_get_perf_archtype(int workerid);
+enum starpu_perfmodel_archtype starpu_worker_get_perf_archtype(int workerid);
 
 /* This function is intended to be used by external tools that should read the
  * performance model files */
 int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *model);
 int starpu_perfmodel_unload_model(struct starpu_perfmodel *model);
 
-void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, char *path, size_t maxlen, unsigned nimpl);
-void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch, char *archname, size_t maxlen, unsigned nimpl);
+void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, char *path, size_t maxlen, unsigned nimpl);
+void starpu_perfmodel_get_arch_name(enum starpu_perfmodel_archtype arch, char *archname, size_t maxlen, unsigned nimpl);
 
-double starpu_history_based_expected_perf(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, uint32_t footprint);
+double starpu_permodel_history_based_expected_perf(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, uint32_t footprint);
 int starpu_perfmodel_list(FILE *output);
-void starpu_perfmodel_print(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output);
+void starpu_perfmodel_print(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output);
 int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char *parameter, uint32_t *footprint, FILE *output);
 
-void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perf_archtype arch, unsigned cpuid, unsigned nimpl, double measured);
+void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned cpuid, unsigned nimpl, double measured);
 
 void starpu_bus_print_bandwidth(FILE *f);
 void starpu_bus_print_affinity(FILE *f);

+ 9 - 9
include/starpu_profiling.h

@@ -30,7 +30,7 @@ extern "C"
 #define STARPU_PROFILING_DISABLE	0
 #define STARPU_PROFILING_ENABLE		1
 
-struct starpu_task_profiling_info
+struct starpu_profiling_task_info
 {
 	/* Task submission */
 	struct timespec submit_time;
@@ -65,8 +65,8 @@ struct starpu_task_profiling_info
 	double power_consumed;
 };
 
-/* The timing is provided since the previous call to starpu_worker_get_profiling_info */
-struct starpu_worker_profiling_info
+/* The timing is provided since the previous call to starpu_profiling_worker_get_info */
+struct starpu_profiling_worker_info
 {
 	struct timespec start_time;
 	struct timespec total_time;
@@ -79,7 +79,7 @@ struct starpu_worker_profiling_info
 	double power_consumed;
 };
 
-struct starpu_bus_profiling_info
+struct starpu_profiling_bus_info
 {
 	struct timespec start_time;
 	struct timespec total_time;
@@ -88,7 +88,7 @@ struct starpu_bus_profiling_info
 };
 
 /* This function sets the ID used for profiling trace filename */
-void starpu_set_profiling_id(int new_id);
+void starpu_profiling_set_id(int new_id);
 
 /* This function sets the profiling status:
  * - enable with STARPU_PROFILING_ENABLE
@@ -114,14 +114,14 @@ extern int _starpu_profiling;
 
 /* Get the profiling info associated to a worker, and reset the profiling
  * measurements. If worker_info is NULL, we only reset the counters. */
-int starpu_worker_get_profiling_info(int workerid, struct starpu_worker_profiling_info *worker_info);
+int starpu_profiling_worker_get_info(int workerid, struct starpu_profiling_worker_info *worker_info);
 
 int starpu_bus_get_count(void);
 int starpu_bus_get_id(int src, int dst);
 int starpu_bus_get_src(int busid);
 int starpu_bus_get_dst(int busid);
 
-int starpu_bus_get_profiling_info(int busid, struct starpu_bus_profiling_info *bus_info);
+int starpu_bus_get_profiling_info(int busid, struct starpu_profiling_bus_info *bus_info);
 
 /* Some helper functions to manipulate profiling API output */
 /* Reset timespec */
@@ -182,8 +182,8 @@ static __starpu_inline void starpu_timespec_sub(const struct timespec *a,
 double starpu_timing_timespec_delay_us(struct timespec *start, struct timespec *end);
 double starpu_timing_timespec_to_us(struct timespec *ts);
 
-void starpu_bus_profiling_helper_display_summary(void);
-void starpu_worker_profiling_helper_display_summary(void);
+void starpu_profiling_bus_helper_display_summary(void);
+void starpu_profiling_worker_helper_display_summary(void);
 
 #ifdef __cplusplus
 }

+ 2 - 0
include/starpu_sched_ctx.h

@@ -181,6 +181,8 @@ int starpu_sched_ctx_set_max_priority(unsigned sched_ctx_id, int max_prio);
  * statically allocate tasks with a default priority. */
 #define STARPU_DEFAULT_PRIO	0
 
+/* execute any parallel code on the workers of the sched_ctx (workers are blocked) */
+void* starpu_sched_ctx_exec_parallel_code(void* (*func)(void*), void* param, unsigned sched_ctx_id);
 
 #ifdef __cplusplus
 }

+ 8 - 8
include/starpu_scheduler.h

@@ -125,25 +125,25 @@ int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node);
  */
 
 /* Returns the perfmodel footprint for the task */
-uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
+uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
 /* Returns expected task duration in us */
-double starpu_task_expected_length(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
+double starpu_task_expected_length(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
 /* Returns an estimated speedup factor relative to CPU speed */
-double starpu_worker_get_relative_speedup(enum starpu_perf_archtype perf_archtype);
+double starpu_worker_get_relative_speedup(enum starpu_perfmodel_archtype perf_archtype);
 /* Returns expected data transfer time in us */
 double starpu_task_expected_data_transfer_time(unsigned memory_node, struct starpu_task *task);
 /* Predict the transfer time (in us) to move a handle to a memory node */
-double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned memory_node, enum starpu_access_mode mode);
+double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned memory_node, enum starpu_data_access_mode mode);
 /* Returns expected power consumption in J */
-double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
+double starpu_task_expected_power(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
 /* Returns expected conversion time in ms (multiformat interface only) */
-double starpu_task_expected_conversion_time(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
+double starpu_task_expected_conversion_time(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
 /* Return the expected duration of the entire task bundle in us. */
-double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum starpu_perf_archtype arch, unsigned nimpl);
+double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl);
 /* Return the time (in us) expected to transfer all data used within the bundle */
 double starpu_task_bundle_expected_data_transfer_time(starpu_task_bundle_t bundle, unsigned memory_node);
 /* Return the expected power consumption of the entire task bundle in J. */
-double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, enum starpu_perf_archtype arch, unsigned nimpl);
+double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl);
 
 #ifdef __cplusplus
 }

+ 4 - 4
include/starpu_task.h

@@ -95,8 +95,8 @@ struct starpu_codelet
 	/* how many buffers do the codelet takes as argument ? */
 	unsigned nbuffers;
 	/* which are the access modes for these buffers */
-	enum starpu_access_mode modes[STARPU_NMAXBUFS];
-	enum starpu_access_mode *dyn_modes;
+	enum starpu_data_access_mode modes[STARPU_NMAXBUFS];
+	enum starpu_data_access_mode *dyn_modes;
 
 	/* performance model of the codelet */
 	struct starpu_perfmodel *model;
@@ -117,7 +117,7 @@ struct starpu_task
 	struct starpu_codelet *cl;
 
 	/* arguments managed by the DSM */
-	struct starpu_buffer_descr buffers[STARPU_NMAXBUFS] STARPU_DEPRECATED;
+	struct starpu_data_descr buffers[STARPU_NMAXBUFS] STARPU_DEPRECATED;
 	starpu_data_handle_t handles[STARPU_NMAXBUFS];
 	void *interfaces[STARPU_NMAXBUFS];
 
@@ -176,7 +176,7 @@ struct starpu_task
 
 	/* This gets filled when profiling is enabled by using
 	 * starpu_profiling_status_set */
-	struct starpu_task_profiling_info *profiling_info;
+	struct starpu_profiling_task_info *profiling_info;
 
 	/* Predicted duration of the task in µs. This field is only valid if the
 	 * scheduling strategy uses performance models. */

+ 2 - 2
include/starpu_util.h

@@ -68,8 +68,8 @@ extern "C"
 #define STARPU_MAX(a,b)	((a)<(b)?(b):(a))
 
 #ifdef STARPU_NO_ASSERT
-#define STARPU_ASSERT(x)		do { (void) (x);} while(0)
-#define STARPU_ASSERT_MSG(x, msg, ...)	do { (void) (x);} while(0)
+#define STARPU_ASSERT(x)		do { } while(0)
+#define STARPU_ASSERT_MSG(x, msg, ...)	do { } while(0)
 #else
 #  if defined(__CUDACC__) && defined(STARPU_HAVE_WINDOWS)
 #    define STARPU_ASSERT(x)		do { if (STARPU_UNLIKELY(!(x))) *(int*)NULL = 0; } while(0)

+ 6 - 6
include/starpu_worker.h

@@ -31,7 +31,7 @@ extern "C"
 {
 #endif
 
-enum starpu_archtype
+enum starpu_worker_archtype
 {
 	STARPU_ANY_WORKER,    /* any worker, used in the hypervisor */
 	STARPU_CPU_WORKER,    /* CPU core */
@@ -128,12 +128,12 @@ int starpu_combined_worker_get_rank(void);
  * the architecture of the worker: STARPU_CPU_WORKER for a CPU core,
  * STARPU_CUDA_WORKER for a CUDA device. The value returned for an
  * invalid identifier is unspecified.  */
-enum starpu_archtype starpu_worker_get_type(int id);
+enum starpu_worker_archtype starpu_worker_get_type(int id);
 
 /* Returns the number of workers of the type indicated by the argument. A
  * positive (or null) value is returned in case of success, -EINVAL indicates
  * that the type is not valid otherwise. */
-int starpu_worker_get_count_by_type(enum starpu_archtype type);
+int starpu_worker_get_count_by_type(enum starpu_worker_archtype type);
 
 /* Fill the workerids array with the identifiers of the workers that have the
  * type indicated in the first argument. The maxsize argument indicates the
@@ -144,13 +144,13 @@ int starpu_worker_get_count_by_type(enum starpu_archtype type);
  * overflows, the value of maxsize can be chosen by the means of the
  * starpu_worker_get_count_by_type function, or by passing a value greater or
  * equal to STARPU_NMAXWORKERS. */
-int starpu_worker_get_ids_by_type(enum starpu_archtype type, int *workerids, int maxsize);
+int starpu_worker_get_ids_by_type(enum starpu_worker_archtype type, int *workerids, int maxsize);
 
 /* Return the identifier of the n-th worker of a specific type */
-int starpu_worker_get_by_type(enum starpu_archtype type, int num);
+int starpu_worker_get_by_type(enum starpu_worker_archtype type, int num);
 
 /* Return the identifier of the worker devid of a specific type */
-int starpu_worker_get_by_devid(enum starpu_archtype type, int devid);
+int starpu_worker_get_by_devid(enum starpu_worker_archtype type, int devid);
 
 /* StarPU associates a unique human readable string to each processing unit.
  * This function copies at most the "maxlen" first bytes of the unique

+ 4 - 0
mpi/include/starpu_mpi.h

@@ -70,6 +70,10 @@ void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts);
 void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle);
 void starpu_mpi_cache_flush_all_data(MPI_Comm comm);
 
+/* getter/setter for communication tag used for all communications in StarPU-MPI. */
+int starpu_mpi_get_communication_tag(void);
+void starpu_mpi_set_communication_tag(int tag);
+
 #ifdef __cplusplus
 }
 #endif

+ 394 - 87
mpi/src/starpu_mpi.c

@@ -57,6 +57,124 @@ static int posted_requests = 0, newer_requests, barrier_running = 0;
 
 #define _STARPU_MPI_INC_POSTED_REQUESTS(value) { STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); }
 
+/* Envelope message sent as raw bytes (under the common _starpu_mpi_tag)
+ * ahead of a payload, so that the receiving side can look up the receive
+ * request registered under mpi_tag. */
+struct _starpu_mpi_envelope
+{
+	/* size announced by the sender: req->count, or the size reported by
+	 * starpu_data_pack() for user-defined datatypes */
+	ssize_t psize;
+	/* tag of the sender's request (req->mpi_tag) */
+	int mpi_tag;
+};
+
+/* Entry of _starpu_mpi_copy_handle_hashmap: a temporary handle created by
+ * the progression thread when an envelope arrives while no receive request
+ * with the same tag is registered yet. */
+struct _starpu_mpi_copy_handle
+{
+	/* temporary handle, registered with starpu_data_register_same() */
+	starpu_data_handle_t handle;
+	/* envelope that announced the incoming data */
+	struct _starpu_mpi_envelope *env;
+	/* tag taken from the envelope; used as the hash key */
+	int mpi_tag;
+	/* uthash bookkeeping */
+	UT_hash_handle hh;
+};
+
+ /********************************************************/
+ /*                                                      */
+ /*  Hashmap's requests functionalities                  */
+ /*                                                      */
+ /********************************************************/
+
+static struct _starpu_mpi_req *_starpu_mpi_req_hashmap = NULL;
+static struct _starpu_mpi_copy_handle *_starpu_mpi_copy_handle_hashmap = NULL;
+
+static struct _starpu_mpi_req* find_req(int mpi_tag)
+{
+	/* Look up the receive request registered under mpi_tag in
+	 * _starpu_mpi_req_hashmap; NULL is returned when there is none. */
+	struct _starpu_mpi_req *match = NULL;
+
+	HASH_FIND_INT(_starpu_mpi_req_hashmap, &mpi_tag, match);
+	return match;
+}
+
+static void add_req(struct _starpu_mpi_req *req)
+{
+	/* Register req in _starpu_mpi_req_hashmap, keyed by req->mpi_tag.
+	 * Registering a second request under an already used tag is reported
+	 * through a failing assertion. */
+	struct _starpu_mpi_req *test_req = find_req(req->mpi_tag);
+
+	if (test_req == NULL)
+	{
+		HASH_ADD_INT(_starpu_mpi_req_hashmap, mpi_tag, req);
+		_STARPU_MPI_DEBUG(3, "Adding request %p with tag %d in the hashmap. \n", req, req->mpi_tag);
+		return;
+	}
+
+	_STARPU_MPI_DEBUG(3, "Error add_req : request %p with tag %d already in the hashmap. \n", req, req->mpi_tag);
+
+	/* test_req is non-NULL from here on, so either assertion below fails;
+	 * only the explanation given to the user differs. */
+	if (starpu_data_get_sequential_consistency_flag(req->data_handle))
+	{
+		STARPU_ASSERT_MSG(!test_req, "Error add_req : request %p with tag %d wanted to be added to the hashmap, while another request %p with the same tag is already in it. \n Sequential consistency is activated : this is not supported by StarPU.", req, req->mpi_tag, test_req);
+	}
+	else
+	{
+		STARPU_ASSERT_MSG(!test_req, "Error add_req : request %p with tag %d wanted to be added to the hashmap, while another request %p with the same tag is already in it. \n Sequential consistency isn't activated for this handle : you should want to add dependencies between requests for which the sequential consistency is deactivated.", req, req->mpi_tag, test_req);
+	}
+}
+
+static void delete_req(struct _starpu_mpi_req *req)
+{
+	/* Unlink req from _starpu_mpi_req_hashmap when a request is registered
+	 * under its tag; otherwise only emit a debug warning. The lookup is by
+	 * tag, but it is req itself that gets unlinked. */
+	if (find_req(req->mpi_tag) == NULL)
+	{
+		_STARPU_MPI_DEBUG(3, "Warning delete_req : request %p with tag %d isn't in the hashmap. \n", req, req->mpi_tag);
+		return;
+	}
+
+	HASH_DEL(_starpu_mpi_req_hashmap, req);
+	_STARPU_MPI_DEBUG(3, "Deleting request %p with tag %d from the hashmap. \n", req, req->mpi_tag);
+}
+
+static struct _starpu_mpi_copy_handle* find_chandle(int mpi_tag)
+{
+	/* Look up the copy handle registered under mpi_tag in
+	 * _starpu_mpi_copy_handle_hashmap; NULL is returned when there is
+	 * none. */
+	struct _starpu_mpi_copy_handle *match = NULL;
+
+	HASH_FIND_INT(_starpu_mpi_copy_handle_hashmap, &mpi_tag, match);
+	return match;
+}
+
+static void add_chandle(struct _starpu_mpi_copy_handle *chandle)
+{
+	/* Register chandle in _starpu_mpi_copy_handle_hashmap, keyed by
+	 * chandle->mpi_tag. Registering a second copy handle under an already
+	 * used tag is reported through a failing assertion, like add_req does
+	 * for requests. */
+	struct _starpu_mpi_copy_handle *test_chandle;
+
+	test_chandle = find_chandle(chandle->mpi_tag);
+
+	if (test_chandle == NULL)
+	{
+		HASH_ADD_INT(_starpu_mpi_copy_handle_hashmap, mpi_tag, chandle);
+		_STARPU_MPI_DEBUG(3, "Adding copied handle %p with tag %d in the hashmap. \n", chandle, chandle->mpi_tag);
+	}
+	else
+	{
+		_STARPU_MPI_DEBUG(3, "Error add_chandle : copied handle %p with tag %d already in the hashmap. \n", chandle, chandle->mpi_tag);
+		/* Assert on the condition that must hold (no entry yet for this
+		 * tag): asserting "test_chandle != NULL" in this branch can never
+		 * fail and would let the duplicate go unnoticed. */
+		STARPU_ASSERT_MSG(test_chandle == NULL, "Error add_chandle : copied handle %p with tag %d wanted to be added to the hashmap, while another copied handle %p with the same tag is already in it.", chandle, chandle->mpi_tag, test_chandle);
+	}
+}
+
+static void delete_chandle(struct _starpu_mpi_copy_handle *chandle)
+{
+	/* Unlink chandle from _starpu_mpi_copy_handle_hashmap when a copy
+	 * handle is registered under its tag; otherwise only emit a debug
+	 * warning. The caller remains responsible for freeing chandle. */
+	if (find_chandle(chandle->mpi_tag) == NULL)
+	{
+		_STARPU_MPI_DEBUG(3, "Warning delete_chandle : copied handle %p with tag %d isn't in the hashmap. \n", chandle, chandle->mpi_tag);
+		return;
+	}
+
+	HASH_DEL(_starpu_mpi_copy_handle_hashmap, chandle);
+	_STARPU_MPI_DEBUG(3, "Deleting copied handle %p with tag %d from the hashmap. \n", chandle, chandle->mpi_tag);
+}
 
 /********************************************************/
 /*                                                      */
@@ -68,7 +186,7 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_irecv_common(starpu_data_handle
 							      int srcdst, int mpi_tag, MPI_Comm comm,
 							      unsigned detached, void (*callback)(void *), void *arg,
 							      enum _starpu_mpi_request_type request_type, void (*func)(struct _starpu_mpi_req *),
-							      enum starpu_access_mode mode)
+							      enum starpu_data_access_mode mode)
 {
 
 	_STARPU_MPI_LOG_IN();
@@ -118,13 +236,13 @@ static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
 
 	STARPU_ASSERT_MSG(req->ptr, "Pointer containing data to send is invalid");
 
-	_STARPU_MPI_DEBUG(2, "post MPI isend request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
+	_STARPU_MPI_DEBUG(2, "post MPI isend request %p type %s tag %d src %d data %p datasize %ld ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, starpu_data_get_size(req->data_handle), req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
 
 	_starpu_mpi_comm_amounts_inc(req->comm, req->srcdst, req->datatype, req->count);
 
 	TRACE_MPI_ISEND_SUBMIT_BEGIN(req->srcdst, req->mpi_tag, 0);
 
-	req->ret = MPI_Isend(req->ptr, req->count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
+	req->ret = MPI_Isend(req->ptr, req->count, req->datatype, req->srcdst, _starpu_mpi_tag, req->comm, &req->request);
 	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Isend returning %d", req->ret);
 
 	TRACE_MPI_ISEND_SUBMIT_END(req->srcdst, req->mpi_tag, 0);
@@ -143,43 +261,51 @@ static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
 static void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
 {
 	_starpu_mpi_handle_allocate_datatype(req->data_handle, &req->datatype, &req->user_datatype);
+
+	struct _starpu_mpi_envelope* env = calloc(1,sizeof(struct _starpu_mpi_envelope));
+
+	env->mpi_tag = req->mpi_tag;
+
 	if (req->user_datatype == 0)
 	{
 		req->count = 1;
-		req->ptr = starpu_handle_get_local_ptr(req->data_handle);
+		req->ptr = starpu_data_get_local_ptr(req->data_handle);
+
+		env->psize = (ssize_t)req->count;
+
+		_STARPU_MPI_DEBUG(1, "Post MPI isend count (%ld) datatype_size %ld request to %d with tag %d\n",req->count,starpu_data_get_size(req->data_handle),req->srcdst, _starpu_mpi_tag);
+		MPI_Isend(env, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->srcdst, _starpu_mpi_tag, req->comm, &req->size_req);
 	}
 	else
 	{
-		ssize_t psize = -1;
 		int ret;
 
-		// Do not pack the data, just try to find out the size
-		starpu_handle_pack_data(req->data_handle, NULL, &psize);
+ 		// Do not pack the data, just try to find out the size
+		starpu_data_pack(req->data_handle, NULL, &(env->psize));
 
-		if (psize != -1)
-		{
-			// We already know the size of the data, let's send it to overlap with the packing of the data
-			_STARPU_MPI_DEBUG(1, "Sending size %ld (%ld %s) with tag %d to node %d (first call to pack)\n", psize, sizeof(req->count), _starpu_mpi_datatype(MPI_BYTE), req->mpi_tag, req->srcdst);
-			req->count = psize;
-			ret = MPI_Isend(&req->count, sizeof(req->count), MPI_BYTE, req->srcdst, req->mpi_tag, req->comm, &req->size_req);
+		if (env->psize != -1)
+ 		{
+ 			// We already know the size of the data, let's send it to overlap with the packing of the data
+			_STARPU_MPI_DEBUG(1, "Sending size %ld (%ld %s) with tag %d to node %d (first call to pack)\n", env->psize, sizeof(req->count), _starpu_mpi_datatype(MPI_BYTE), _starpu_mpi_tag, req->srcdst);
+			req->count = env->psize;
+			ret = MPI_Isend(env, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->srcdst, _starpu_mpi_tag, req->comm, &req->size_req);
 			STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "when sending size, MPI_Isend returning %d", ret);
-		}
-
-		// Pack the data
-		starpu_handle_pack_data(req->data_handle, &req->ptr, &req->count);
-		if (psize == -1)
-		{
-			// We know the size now, let's send it
-			_STARPU_MPI_DEBUG(1, "Sending size %ld (%ld %s) with tag %d to node %d (second call to pack)\n", req->count, sizeof(req->count), _starpu_mpi_datatype(MPI_BYTE), req->mpi_tag, req->srcdst);
-			ret = MPI_Isend(&req->count, sizeof(req->count), MPI_BYTE, req->srcdst, req->mpi_tag, req->comm, &req->size_req);
+ 		}
+
+ 		// Pack the data
+ 		starpu_data_pack(req->data_handle, &req->ptr, &req->count);
+		if (env->psize == -1)
+ 		{
+ 			// We know the size now, let's send it
+			_STARPU_MPI_DEBUG(1, "Sending size %ld (%ld %s) with tag %d to node %d (second call to pack)\n", env->psize, sizeof(req->count), _starpu_mpi_datatype(MPI_BYTE), _starpu_mpi_tag, req->srcdst);
+			ret = MPI_Isend(env, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->srcdst, _starpu_mpi_tag, req->comm, &req->size_req);
 			STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "when sending size, MPI_Isend returning %d", ret);
-		}
-		else
-		{
-			// We check the size returned with the 2 calls to pack is the same
-			STARPU_ASSERT_MSG(req->count == psize, "Calls to pack_data returned different sizes %ld != %ld", req->count, psize);
-		}
-
+ 		}
+ 		else
+ 		{
+ 			// We check the size returned with the 2 calls to pack is the same
+			STARPU_ASSERT_MSG(req->count == env->psize, "Calls to pack_data returned different sizes %ld != %ld", req->count, env->psize);
+ 		}
 		// We can send the data now
 	}
 	_starpu_mpi_isend_data_func(req);
@@ -234,7 +360,7 @@ int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI
 
 /********************************************************/
 /*                                                      */
-/*  Receive functionalities                             */
+/*  receive functionalities                             */
 /*                                                      */
 /********************************************************/
 
@@ -248,7 +374,7 @@ static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 
 	TRACE_MPI_IRECV_SUBMIT_BEGIN(req->srcdst, req->mpi_tag);
 
-	req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
+	req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->srcdst, _starpu_mpi_tag, req->comm, &req->request);
 	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_IRecv returning %d", req->ret);
 
 	TRACE_MPI_IRECV_SUBMIT_END(req->srcdst, req->mpi_tag);
@@ -264,48 +390,9 @@ static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 	_STARPU_MPI_LOG_OUT();
 }
 
-struct _starpu_mpi_irecv_size_callback
-{
-	starpu_data_handle_t handle;
-	struct _starpu_mpi_req *req;
-};
-
-static void _starpu_mpi_irecv_size_callback(void *arg)
-{
-	struct _starpu_mpi_irecv_size_callback *callback = (struct _starpu_mpi_irecv_size_callback *)arg;
-
-	starpu_data_unregister(callback->handle);
-	callback->req->ptr = malloc(callback->req->count);
-	STARPU_ASSERT_MSG(callback->req->ptr, "cannot allocate message of size %ld", callback->req->count);
-	_starpu_mpi_irecv_data_func(callback->req);
-	free(callback);
-}
-
-static void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req)
-{
-	_STARPU_MPI_LOG_IN();
-
-	_starpu_mpi_handle_allocate_datatype(req->data_handle, &req->datatype, &req->user_datatype);
-	if (req->user_datatype == 0)
-	{
-		req->count = 1;
-		req->ptr = starpu_handle_get_local_ptr(req->data_handle);
-		_starpu_mpi_irecv_data_func(req);
-	}
-	else
-	{
-		struct _starpu_mpi_irecv_size_callback *callback = malloc(sizeof(struct _starpu_mpi_irecv_size_callback));
-		callback->req = req;
-		starpu_variable_data_register(&callback->handle, 0, (uintptr_t)&(callback->req->count), sizeof(callback->req->count));
-		_STARPU_MPI_DEBUG(4, "Receiving size with tag %d from node %d\n", req->mpi_tag, req->srcdst);
-		_starpu_mpi_irecv_common(callback->handle, req->srcdst, req->mpi_tag, req->comm, 1, _starpu_mpi_irecv_size_callback, callback);
-	}
-
-}
-
 static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, unsigned detached, void (*callback)(void *), void *arg)
 {
+	/* Post a RECV_REQ request with write access (STARPU_W) on data_handle;
+	 * the request is handled by _starpu_mpi_irecv_data_func. */
-	return _starpu_mpi_isend_irecv_common(data_handle, source, mpi_tag, comm, detached, callback, arg, RECV_REQ, _starpu_mpi_irecv_size_func, STARPU_W);
+	return _starpu_mpi_isend_irecv_common(data_handle, source, mpi_tag, comm, detached, callback, arg, RECV_REQ, _starpu_mpi_irecv_data_func, STARPU_W);
 }
 
 int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int source, int mpi_tag, MPI_Comm comm)
@@ -622,14 +709,24 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 				STARPU_ASSERT_MSG(flag, "MPI_Test returning flag %d", flag);
 			}
 			if (req->request_type == RECV_REQ)
-				// req->ptr is freed by starpu_handle_unpack_data
-				starpu_handle_unpack_data(req->data_handle, req->ptr, req->count);
+				// req->ptr is freed by starpu_data_unpack
+				starpu_data_unpack(req->data_handle, req->ptr, req->count);
 			else
 				free(req->ptr);
 		}
 		else
 		{
-			_starpu_mpi_handle_free_datatype(req->data_handle, &req->datatype);
+			struct _starpu_mpi_copy_handle *chandle = find_chandle(starpu_data_get_tag(req->data_handle));
+			if (chandle && (req->data_handle != chandle->handle))
+			{
+				_STARPU_MPI_DEBUG(3, "Handling deleting of copy_handle structure from the hashmap..\n");
+				delete_chandle(chandle);
+				free(chandle);
+			}
+			else
+			{
+				_starpu_mpi_handle_free_datatype(req->data_handle, &req->datatype);
+			}
 		}
 		starpu_data_release(req->data_handle);
 	}
@@ -647,6 +744,44 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 	_STARPU_MPI_LOG_OUT();
 }
 
+/* Argument bundle handed to _starpu_mpi_copy_cb through
+ * starpu_data_acquire_cb(); freed by the callback. */
+struct _starpu_mpi_copy_cb_args
+{
+	/* handle of the application's receive request (copy destination) */
+	starpu_data_handle_t data_handle;
+	/* temporary handle holding the data received early (copy source) */
+	starpu_data_handle_t copy_handle;
+	/* receive request to terminate once the copy is done */
+	struct _starpu_mpi_req *req;
+};
+
+static void _starpu_mpi_copy_cb(void* arg)
+{
+	/* Callback run once the temporary copy handle has been acquired: copy
+	 * its content (interface on node 0) into the handle of the application's
+	 * request, release and unregister the temporary handle, then terminate
+	 * the request. arg is a struct _starpu_mpi_copy_cb_args and is freed
+	 * here. */
+	struct _starpu_mpi_copy_cb_args *cb = arg;
+	starpu_data_handle_t tmp_handle = cb->copy_handle;
+
+	struct starpu_data_interface_ops *ops = starpu_data_get_interface_ops(tmp_handle);
+	void *src_itf = starpu_data_get_interface_on_node(tmp_handle, 0);
+	void *dst_itf = starpu_data_get_interface_on_node(cb->data_handle, 0);
+
+	/* prefer the ram_to_ram method, fall back to any_to_any when the
+	 * interface does not provide it */
+	if (ops->copy_methods->ram_to_ram)
+	{
+		_STARPU_MPI_DEBUG(3, "Initiating ram_to_ram copy..\n");
+		ops->copy_methods->ram_to_ram(src_itf, 0, dst_itf, 0);
+	}
+	else
+	{
+		_STARPU_MPI_DEBUG(3, "Initiating any_to_any copy..\n");
+		ops->copy_methods->any_to_any(src_itf, 0, dst_itf, 0, NULL);
+	}
+
+	_STARPU_MPI_DEBUG(3, "Done, handling release of copy_handle..\n");
+	starpu_data_release(tmp_handle);
+
+	_STARPU_MPI_DEBUG(3, "Done, handling unregister of copy_handle..\n");
+	starpu_data_unregister_submit(tmp_handle);
+
+	_STARPU_MPI_DEBUG(3, "Done, handling request %p termination of the already received request\n",cb->req);
+	_starpu_mpi_handle_request_termination(cb->req);
+
+	free(cb);
+}
+
 static void _starpu_mpi_submit_new_mpi_request(void *arg)
 {
 	_STARPU_MPI_LOG_IN();
@@ -655,11 +790,76 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 	_STARPU_MPI_INC_POSTED_REQUESTS(-1);
 
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
-	_starpu_mpi_req_list_push_front(new_requests, req);
-	newer_requests = 1;
-	_STARPU_MPI_DEBUG(3, "Pushing new request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
-			  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
-	STARPU_PTHREAD_COND_BROADCAST(&cond_progression);
+
+	if (req->request_type == RECV_REQ)
+	{
+		/* test whether the receive request has already been submitted internally by StarPU-MPI*/
+		struct _starpu_mpi_copy_handle *chandle = find_chandle(req->mpi_tag);
+
+		/* Case : the request has already been submitted internally by StarPU.
+		 * We'll asynchronously ask a Read permission over the temporary handle, so as when
+		 * the internal receive will be over, the _starpu_mpi_copy_cb function will be called to
+		 * bring the data back to the original data handle associated to the request.*/
+		if (chandle && (req->data_handle != chandle->handle))
+		{
+			_STARPU_MPI_DEBUG(3, "The RECV request %p with tag %d has already been received, copying previously received data into handle's pointer..\n", req, req->mpi_tag);
+
+			struct _starpu_mpi_copy_cb_args *cb_args = malloc(sizeof(struct _starpu_mpi_copy_cb_args));
+			cb_args->data_handle = req->data_handle;
+			cb_args->copy_handle = chandle->handle;
+			cb_args->req = req;
+
+			_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
+			starpu_data_acquire_cb(chandle->handle,STARPU_R,_starpu_mpi_copy_cb,(void*) cb_args);
+		}
+		else
+		{
+			/* Case : the request is the internal receive request submitted by StarPU-MPI to receive
+			 * incoming data without a matching pending receive already submitted by the application.
+			 * We immediately allocate the pointer associated to the data_handle, and pushing it into
+			 * the list of new_requests, so as the real MPI request can be submitted before the next
+			 * submission of the envelope-catching request. */
+			if (chandle && (req->data_handle == chandle->handle))
+			{
+				_starpu_mpi_handle_allocate_datatype(req->data_handle, &req->datatype, &req->user_datatype);
+				if (req->user_datatype == 0)
+				{
+					req->count = 1;
+					req->ptr = starpu_data_get_local_ptr(req->data_handle);
+				}
+				else
+				{
+					req->count = chandle->env->psize;
+					req->ptr = malloc(req->count);
+
+					STARPU_ASSERT_MSG(req->ptr, "cannot allocate message of size %ld\n", req->count);
+				}
+
+				_starpu_mpi_req_list_push_front(new_requests, req);
+
+				_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
+			}
+			/* Case : a classic receive request with no send received earlier than expected.
+			 * We just add the pending receive request to the requests' hashmap. */
+			else
+			{
+				add_req(req);
+			}
+
+			newer_requests = 1;
+			STARPU_PTHREAD_COND_BROADCAST(&cond_progression);
+		}
+	}
+	else
+	{
+		_starpu_mpi_req_list_push_front(new_requests, req);
+
+		newer_requests = 1;
+		_STARPU_MPI_DEBUG(3, "Pushing new request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
+				  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
+		STARPU_PTHREAD_COND_BROADCAST(&cond_progression);
+	}
+
 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	_STARPU_MPI_LOG_OUT();
 }
@@ -700,6 +900,7 @@ static void _starpu_mpi_test_detached_requests(void)
 
 		//_STARPU_MPI_DEBUG(3, "Test detached request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, _starpu_mpi_request_type(req->request_type), req->srcdst);
 		req->ret = MPI_Test(&req->request, &flag, &status);
+
 		STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Test returning %d", req->ret);
 
 		if (flag)
@@ -826,11 +1027,10 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	     MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
 	     TRACE_MPI_START(rank, worldsize);
 #ifdef STARPU_USE_FXT
-	     starpu_set_profiling_id(rank);
+	     starpu_profiling_set_id(rank);
 #endif //STARPU_USE_FXT
 	}
 
-
 	/* notify the main thread that the progression thread is ready */
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	running = 1;
@@ -838,10 +1038,17 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+
+ 	struct _starpu_mpi_envelope *recv_env = calloc(1,sizeof(struct _starpu_mpi_envelope));
+
+ 	MPI_Request header_req;
+ 	int header_req_submitted = 0;
+
 	while (running || posted_requests || !(_starpu_mpi_req_list_empty(new_requests)) || !(_starpu_mpi_req_list_empty(detached_requests)))
 	{
 		/* shall we block ? */
-		unsigned block = _starpu_mpi_req_list_empty(new_requests);
+		_STARPU_MPI_DEBUG(3, "HASH_COUNT(_starpu_mpi_req_hashmap) = %d\n",HASH_COUNT(_starpu_mpi_req_hashmap));
+		unsigned block = _starpu_mpi_req_list_empty(new_requests) && (HASH_COUNT(_starpu_mpi_req_hashmap) == 0);
 
 #ifndef STARPU_MPI_ACTIVITY
 		block = block && _starpu_mpi_req_list_empty(detached_requests);
@@ -861,11 +1068,6 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 			TRACE_MPI_SLEEP_END();
 		}
 
-		/* test whether there are some terminated "detached request" */
-		STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-		_starpu_mpi_test_detached_requests();
-		STARPU_PTHREAD_MUTEX_LOCK(&mutex);
-
 		/* get one request */
 		struct _starpu_mpi_req *req;
 		while (!_starpu_mpi_req_list_empty(new_requests))
@@ -880,11 +1082,114 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 			_starpu_mpi_handle_new_request(req);
 			STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 		}
+
+		/* If there is no currently submitted header_req submitted to catch envelopes from senders, and there is some pending receive
+		 * requests in our side, we resubmit a header request. */
+		if ((HASH_COUNT(_starpu_mpi_req_hashmap) > 0) && (header_req_submitted == 0) && (HASH_COUNT(_starpu_mpi_copy_handle_hashmap) == 0))
+		{
+			MPI_Irecv(recv_env, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, MPI_ANY_SOURCE, _starpu_mpi_tag, MPI_COMM_WORLD, &header_req);
+
+			_STARPU_MPI_DEBUG(3, "Submit of header_req OK!\n");
+			header_req_submitted = 1;
+		}
+
+		/* test whether there are some terminated "detached request" */
+		STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+		_starpu_mpi_test_detached_requests();
+		STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+
+		if (header_req_submitted == 1)
+		{
+			int flag,res;
+			MPI_Status status;
+			_STARPU_MPI_DEBUG(3, "Test of header_req\n");
+
+			/* test whether an envelope has arrived. */
+			res = MPI_Test(&header_req, &flag, &status);
+			STARPU_ASSERT(res == MPI_SUCCESS);
+
+			if (flag)
+			{
+				_STARPU_MPI_DEBUG(3, "header_req received !\n");
+
+				_STARPU_MPI_DEBUG(3, "Searching for request with tag %d, size %ld ..\n",recv_env->mpi_tag, recv_env->psize);
+
+				struct _starpu_mpi_req *found_req = find_req(recv_env->mpi_tag);
+
+				/* Case : a data will arrive before the matching receive has been submitted in our side of the application.
+				 * We will allow a temporary handle to store the incoming data, by submitting a starpu_mpi_irecv_detached
+				 * on this handle, and register this so as the StarPU-MPI layer can remember it.*/
+				if (!found_req)
+				{
+					_STARPU_MPI_DEBUG(3, "Request with tag %d not found, creating a copy_handle to receive incoming data..\n",recv_env->mpi_tag);
+
+					starpu_data_handle_t data_handle = NULL;
+
+					while(!(data_handle))
+					{
+						data_handle = starpu_data_get_data_handle_from_tag(recv_env->mpi_tag);
+					}
+					STARPU_ASSERT(data_handle);
+
+					struct _starpu_mpi_copy_handle* chandle = malloc(sizeof(struct _starpu_mpi_copy_handle));
+					STARPU_ASSERT(chandle);
+
+					chandle->mpi_tag = recv_env->mpi_tag;
+					chandle->env = recv_env;
+					starpu_data_register_same(&chandle->handle, data_handle);
+					add_chandle(chandle);
+
+					_STARPU_MPI_DEBUG(3, "Posting internal starpu_irecv_detached on copy_handle with tag %d from src %d ..\n", chandle->mpi_tag, status.MPI_SOURCE);
+
+					res = starpu_mpi_irecv_detached(chandle->handle,status.MPI_SOURCE,chandle->mpi_tag,MPI_COMM_WORLD,NULL,NULL);
+					STARPU_ASSERT(res == MPI_SUCCESS);
+
+					_STARPU_MPI_DEBUG(3, "Success of starpu_irecv_detached on copy_handle with tag %d from src %d ..\n", chandle->mpi_tag, status.MPI_SOURCE);
+				}
+				/* Case : a matching receive has been found for the incoming data, we handle the correct allocation of the pointer associated to
+				 * the data handle, then submit the corresponding receive with _starpu_mpi_handle_new_request. */
+				else
+				{
+					_STARPU_MPI_DEBUG(3, "Found !\n");
+
+					delete_req(found_req);
+
+					_starpu_mpi_handle_allocate_datatype(found_req->data_handle, &found_req->datatype, &found_req->user_datatype);
+					if (found_req->user_datatype == 0)
+					{
+						found_req->count = 1;
+						found_req->ptr = starpu_data_get_local_ptr(found_req->data_handle);
+					}
+					else
+					{
+						found_req->count = recv_env->psize;
+						found_req->ptr = malloc(found_req->count);
+
+						STARPU_ASSERT_MSG(found_req->ptr, "cannot allocate message of size %ld\n", found_req->count);
+					}
+
+					_STARPU_MPI_DEBUG(3, "Handling new request... \n");
+					/* handling a request is likely to block for a while
+					 * (on a sync_data_with_mem call), we want to let the
+					 * application submit requests in the meantime, so we
+					 * release the lock. */
+					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+					_starpu_mpi_handle_new_request(found_req);
+					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+				}
+				header_req_submitted = 0;
+			}
+			else
+			{
+				_STARPU_MPI_DEBUG(3, "Nothing received, continue ..\n");
+			}
+		}
 	}
 
 	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(detached_requests), "List of detached requests not empty");
 	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(new_requests), "List of new requests not empty");
 	STARPU_ASSERT_MSG(posted_requests == 0, "Number of posted request is not zero");
+	STARPU_ASSERT_MSG(HASH_COUNT(_starpu_mpi_req_hashmap) == 0, "Number of receive requests left is not zero");
 
 	if (argc_argv->initialize_mpi)
 	{
@@ -895,6 +1200,8 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
 	free(argc_argv);
+	free(recv_env);
+
 	return NULL;
 }
 

+ 2 - 2
mpi/src/starpu_mpi_datatype.c

@@ -122,7 +122,7 @@ static handle_to_datatype_func handle_to_datatype_funcs[STARPU_MAX_INTERFACE_ID]
 
 void _starpu_mpi_handle_allocate_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype, int *user_datatype)
 {
-	enum starpu_data_interface_id id = starpu_handle_get_interface_id(data_handle);
+	enum starpu_data_interface_id id = starpu_data_get_interface_id(data_handle);
 
 	if (id < STARPU_MAX_INTERFACE_ID)
 	{
@@ -183,7 +183,7 @@ static handle_free_datatype_func handle_free_datatype_funcs[STARPU_MAX_INTERFACE
 
 void _starpu_mpi_handle_free_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
 {
-	enum starpu_data_interface_id id = starpu_handle_get_interface_id(data_handle);
+	enum starpu_data_interface_id id = starpu_data_get_interface_id(data_handle);
 
 	if (id < STARPU_MAX_INTERFACE_ID)
 	{

+ 16 - 9
mpi/src/starpu_mpi_insert_task.c

@@ -176,7 +176,7 @@ void *_starpu_mpi_already_sent(starpu_data_handle_t data, int dest)
 }
 
 static
-int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int *do_execute, int *inconsistent_execute, int *dest, size_t *size_on_nodes)
+int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int *do_execute, int *inconsistent_execute, int *dest, size_t *size_on_nodes)
 {
 	if (data && mode & STARPU_R)
 	{
@@ -235,7 +235,7 @@ int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_access
 }
 
 static
-void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int dest, int do_execute, MPI_Comm comm)
+void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int dest, int do_execute, MPI_Comm comm)
 {
 	if (data && mode & STARPU_R)
 	{
@@ -276,7 +276,7 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 }
 
 static
-void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int xrank, int dest, int do_execute, MPI_Comm comm)
+void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int xrank, int dest, int do_execute, MPI_Comm comm)
 {
 	if (mode & STARPU_W)
 	{
@@ -308,7 +308,7 @@ void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum s
 	}
 }
 
-void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int do_execute, MPI_Comm comm)
+void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int do_execute, MPI_Comm comm)
 {
 	if (_cache_enabled)
 	{
@@ -405,7 +405,7 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 		else if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type==STARPU_SCRATCH || arg_type==STARPU_REDUX)
 		{
 			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
-			enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
+			enum starpu_data_access_mode mode = (enum starpu_data_access_mode) arg_type;
 			int ret = _starpu_mpi_find_executee_node(data, mode, me, &do_execute, &inconsistent_execute, &dest, size_on_nodes);
 			if (ret == -EINVAL)
 			{
@@ -421,7 +421,7 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 			int i;
 			for(i=0 ; i<nb_handles ; i++)
 			{
-				enum starpu_access_mode mode = STARPU_CODELET_GET_MODE(codelet, current_data);
+				enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(codelet, current_data);
 				int ret = _starpu_mpi_find_executee_node(datas[i], mode, me, &do_execute, &inconsistent_execute, &dest, size_on_nodes);
 				if (ret == -EINVAL)
 				{
@@ -518,7 +518,7 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 		if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type==STARPU_SCRATCH || arg_type==STARPU_REDUX)
 		{
 			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
-			enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
+			enum starpu_data_access_mode mode = (enum starpu_data_access_mode) arg_type;
 
 			_starpu_mpi_exchange_data_before_execution(data, mode, me, dest, do_execute, comm);
 			current_data ++;
@@ -614,7 +614,7 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 			if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type==STARPU_SCRATCH || arg_type==STARPU_REDUX)
 			{
 				starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
-				enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
+				enum starpu_data_access_mode mode = (enum starpu_data_access_mode) arg_type;
 
 				_starpu_mpi_exchange_data_after_execution(data, mode, me, xrank, dest, do_execute, comm);
 				current_data++;
@@ -684,7 +684,7 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 		if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type == STARPU_SCRATCH || arg_type == STARPU_REDUX)
 		{
 			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
-			enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
+			enum starpu_data_access_mode mode = (enum starpu_data_access_mode) arg_type;
 
 			_starpu_mpi_clear_data_after_execution(data, mode, me, do_execute, comm);
 			current_data++;
@@ -877,4 +877,11 @@ void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
 		starpu_mpi_isend_detached(data_handle, rank, tag, comm, NULL, NULL);
 		starpu_insert_task(data_handle->init_cl, STARPU_W, data_handle, 0);
 	}
+	/* FIXME: In order to prevent simultaneous receive submissions
+	 * on the same handle, we need to wait until all the starpu_mpi
+	 * tasks are done before submitting the next tasks. The current
+	 * version of the implementation does not support multiple
+	 * simultaneous receive requests on the same handle. */
+	starpu_task_wait_for_all();
+
 }

+ 12 - 0
mpi/src/starpu_mpi_private.c

@@ -15,11 +15,23 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <starpu_mpi_private.h>
+
 int _debug_rank=-1;
 int _debug_level=0;
+int _starpu_mpi_tag = 42;
 
 void _starpu_mpi_set_debug_level(int level)
 {
 	_debug_level = level;
 }
 
+int starpu_mpi_get_communication_tag(void)
+{
+	return _starpu_mpi_tag;
+}
+
+void starpu_mpi_set_communication_tag(int tag)
+{
+	_starpu_mpi_tag = tag;
+}

+ 5 - 0
mpi/src/starpu_mpi_private.h

@@ -20,6 +20,7 @@
 
 #include <starpu.h>
 #include <common/config.h>
+#include <common/uthash.h>
 #include "starpu_mpi.h"
 #include "starpu_mpi_fxt.h"
 #include <common/list.h>
@@ -68,6 +69,8 @@ void _starpu_mpi_set_debug_level(int level);
 #  define _STARPU_MPI_LOG_OUT()
 #endif
 
+extern int _starpu_mpi_tag;
+
 enum _starpu_mpi_request_type
 {
 	SEND_REQ=0,
@@ -108,6 +111,8 @@ LIST_TYPE(_starpu_mpi_req,
 	unsigned submitted;
 	unsigned completed;
 
+	UT_hash_handle hh;
+
 	/* In the case of a Wait/Test request, we are going to post a request
 	 * to test the completion of another request */
 	struct _starpu_mpi_req *other_request;

+ 1 - 1
mpi/tests/insert_task_owner2.c

@@ -108,7 +108,7 @@ int main(int argc, char **argv)
 		if (rank == 0)
 		{
 			starpu_data_acquire(data_handles[i], STARPU_R);
-			values[i] = *((int *)starpu_handle_get_local_ptr(data_handles[i]));
+			values[i] = *((int *)starpu_data_get_local_ptr(data_handles[i]));
 			starpu_data_release(data_handles[i]);
 		}
 		starpu_data_unregister(data_handles[i]);

+ 1 - 1
mpi/tests/insert_task_owner_data.c

@@ -85,7 +85,7 @@ int main(int argc, char **argv)
 		if (rank == 0)
 		{
 			starpu_data_acquire(data_handles[i], STARPU_R);
-			values[i] = *((int *)starpu_handle_get_local_ptr(data_handles[i]));
+			values[i] = *((int *)starpu_data_get_local_ptr(data_handles[i]));
 			starpu_data_release(data_handles[i]);
 		}
 	}

+ 1 - 1
mpi/tests/user_defined_datatype_value.h

@@ -74,7 +74,7 @@ static size_t value_get_size(starpu_data_handle_t handle)
 static uint32_t value_footprint(starpu_data_handle_t handle)
 {
 	int *x = starpu_value_get(handle);
-	return starpu_crc32_be(*x, 0);
+	return starpu_hash_crc32c_be(*x, 0);
 }
 
 static void *value_handle_to_pointer(starpu_data_handle_t handle, unsigned node)

+ 6 - 6
sc_hypervisor/examples/cholesky/cholesky_models.c

@@ -36,7 +36,7 @@
 #define PERTURBATE(a)	(a)
 #endif
 
-static double cpu_chol_task_11_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+static double cpu_chol_task_11_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -51,7 +51,7 @@ static double cpu_chol_task_11_cost(struct starpu_task *task, enum starpu_perf_a
 	return PERTURBATE(cost);
 }
 
-static double cuda_chol_task_11_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+static double cuda_chol_task_11_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -66,7 +66,7 @@ static double cuda_chol_task_11_cost(struct starpu_task *task, enum starpu_perf_
 	return PERTURBATE(cost);
 }
 
-static double cpu_chol_task_21_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+static double cpu_chol_task_21_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -81,7 +81,7 @@ static double cpu_chol_task_21_cost(struct starpu_task *task, enum starpu_perf_a
 	return PERTURBATE(cost);
 }
 
-static double cuda_chol_task_21_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+static double cuda_chol_task_21_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -96,7 +96,7 @@ static double cuda_chol_task_21_cost(struct starpu_task *task, enum starpu_perf_
 	return PERTURBATE(cost);
 }
 
-static double cpu_chol_task_22_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+static double cpu_chol_task_22_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -111,7 +111,7 @@ static double cpu_chol_task_22_cost(struct starpu_task *task, enum starpu_perf_a
 	return PERTURBATE(cost);
 }
 
-static double cuda_chol_task_22_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+static double cuda_chol_task_22_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 

+ 3 - 3
sc_hypervisor/include/sc_hypervisor_monitoring.h

@@ -107,7 +107,7 @@ int *sc_hypervisor_get_sched_ctxs();
 int sc_hypervisor_get_nsched_ctxs();
 
 /* get the number of workers of a certain architecture in a context */
-int sc_hypervisor_get_nworkers_ctx(unsigned sched_ctx, enum starpu_archtype arch);
+int sc_hypervisor_get_nworkers_ctx(unsigned sched_ctx, enum starpu_worker_archtype arch);
 
 /* get the number of flops executed by a context since last resizing (reset to 0 when a resizing is done)*/
 double sc_hypervisor_get_elapsed_flops_per_sched_ctx(struct sc_hypervisor_wrapper *sc_w);
@@ -116,10 +116,10 @@ double sc_hypervisor_get_elapsed_flops_per_sched_ctx(struct sc_hypervisor_wrappe
 double sc_hypervisor_get_total_elapsed_flops_per_sched_ctx(struct sc_hypervisor_wrapper* sc_w);
 
 /* compute an average value of the cpu/cuda velocity */
-double sc_hypervisorsc_hypervisor_get_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_archtype arch);
+double sc_hypervisorsc_hypervisor_get_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch);
 
 /* compute the actual velocity of all workers of a specific type of worker */
-double sc_hypervisor_get_velocity(struct sc_hypervisor_wrapper *sc_w, enum starpu_archtype arch);
+double sc_hypervisor_get_velocity(struct sc_hypervisor_wrapper *sc_w, enum starpu_worker_archtype arch);
 
 #ifdef __cplusplus
 }

+ 5 - 5
sc_hypervisor/include/sc_hypervisor_policy.h

@@ -59,13 +59,13 @@ void sc_hypervisor_get_tasks_times(int nw, int nt, double times[nw][nt], int *wo
 unsigned sc_hypervisor_find_lowest_prio_sched_ctx(unsigned req_sched_ctx, int nworkers_to_move);
 
 /* find the first most idle workers of a context*/
-int* sc_hypervisor_get_idlest_workers(unsigned sched_ctx, int *nworkers, enum starpu_archtype arch);
+int* sc_hypervisor_get_idlest_workers(unsigned sched_ctx, int *nworkers, enum starpu_worker_archtype arch);
 
 /* find the first most idle workers in a list */
-int* sc_hypervisor_get_idlest_workers_in_list(int *start, int *workers, int nall_workers,  int *nworkers, enum starpu_archtype arch);
+int* sc_hypervisor_get_idlest_workers_in_list(int *start, int *workers, int nall_workers,  int *nworkers, enum starpu_worker_archtype arch);
 
 /* find workers that can be moved from a context (if the constraints of min, max, etc allow this) */
-unsigned sc_hypervisor_get_movable_nworkers(struct sc_hypervisor_policy_config *config, unsigned sched_ctx, enum starpu_archtype arch);
+unsigned sc_hypervisor_get_movable_nworkers(struct sc_hypervisor_policy_config *config, unsigned sched_ctx, enum starpu_worker_archtype arch);
 
 /* compute how many workers should be moved from this context */
 int sc_hypervisor_compute_nworkers_to_move(unsigned req_sched_ctx);
@@ -89,10 +89,10 @@ double sc_hypervisor_get_fastest_ctx_exec_time(void);
 double sc_hypervisor_get_velocity_per_worker(struct sc_hypervisor_wrapper *sc_w, unsigned worker); 
 
 /* compute the velocity of a type of worker in a context */
-double sc_hypervisor_get_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_archtype arch);
+double sc_hypervisor_get_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch);
 
 /* compute the velocity of a type of worker in a context depending on its history */ 
-double sc_hypervisor_get_ref_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_archtype arch);
+double sc_hypervisor_get_ref_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch);
 
 /* check if there are contexts a lot more delayed than others */
 int sc_hypervisor_has_velocity_gap_btw_ctxs(void);

+ 2 - 2
sc_hypervisor/src/hypervisor_policies/debit_lp_policy.c

@@ -40,7 +40,7 @@ static unsigned _compute_max_velocity(int ns, int nw, double w_in_s[ns][nw], int
 			w_in_s[s][w] = 0.0;
 			int worker = workers == NULL ? w : workers[w];
 
-			enum starpu_archtype arch = starpu_worker_get_type(worker);
+			enum starpu_worker_archtype arch = starpu_worker_get_type(worker);
 			velocity[s][w] = sc_hypervisor_get_velocity(sc_w, arch);
 		}
 	}
@@ -258,7 +258,7 @@ static void debit_lp_handle_poped_task(unsigned sched_ctx, int worker, struct st
 				{
 					for(w = 0; w < nw; w++)
 					{
-						enum starpu_archtype arch = starpu_worker_get_type(w);
+						enum starpu_worker_archtype arch = starpu_worker_get_type(w);
 
 						if(arch == STARPU_CUDA_WORKER)
 						{

+ 2 - 2
sc_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c

@@ -281,7 +281,7 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 			velocity[s][w] = sc_hypervisor_get_velocity_per_worker(sc_w, worker);
 			if(velocity[s][w] == -1.0)
 			{
-				enum starpu_archtype arch = starpu_worker_get_type(worker);
+				enum starpu_worker_archtype arch = starpu_worker_get_type(worker);
 				velocity[s][w] = sc_hypervisor_get_velocity(sc_w, arch);
 				if(arch == STARPU_CUDA_WORKER)
 				{
@@ -367,7 +367,7 @@ static void ispeed_lp_handle_poped_task(unsigned sched_ctx, int worker, struct s
 				{
 					for(w = 0; w < nw; w++)
 					{
-						enum starpu_archtype arch = starpu_worker_get_type(w);
+						enum starpu_worker_archtype arch = starpu_worker_get_type(w);
 
 						if(arch == STARPU_CUDA_WORKER)
 						{

+ 2 - 2
sc_hypervisor/src/hypervisor_policies/ispeed_policy.c

@@ -62,7 +62,7 @@ static unsigned _get_slowest_sched_ctx(void)
 
 
 /* get first nworkers with the highest idle time in the context */
-static int* _get_slowest_workers(unsigned sched_ctx, int *nworkers, enum starpu_archtype arch)
+static int* _get_slowest_workers(unsigned sched_ctx, int *nworkers, enum starpu_worker_archtype arch)
 {
 	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx);
 	struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctx);
@@ -87,7 +87,7 @@ static int* _get_slowest_workers(unsigned sched_ctx, int *nworkers, enum starpu_
 		{
 			considered = 0;
 			worker = workers->get_next(workers, &it);
-			enum starpu_archtype curr_arch = starpu_worker_get_type(worker);
+			enum starpu_worker_archtype curr_arch = starpu_worker_get_type(worker);
 			if(arch == STARPU_ANY_WORKER || curr_arch == arch)
 			{
 

+ 4 - 4
sc_hypervisor/src/policies_utils/lp_tools.c

@@ -136,7 +136,7 @@ void _lp_find_workers_to_give_away(int nw, int ns, unsigned sched_ctx, int sched
 	int w;
 	for(w = 0; w < nw; w++)
 	{
-		enum starpu_archtype arch = STARPU_ANY_WORKER;
+		enum starpu_worker_archtype arch = STARPU_ANY_WORKER;
 		if(w == 0) arch = STARPU_CUDA_WORKER;
 		if(w == 1) arch = STARPU_CPU_WORKER;
 		
@@ -210,7 +210,7 @@ void _lp_find_workers_to_accept(int nw, int ns, unsigned sched_ctx, int sched_ct
 	int j = 0, k = 0;
 	for(w = 0; w < nw; w++)
 	{
-		enum starpu_archtype arch = STARPU_ANY_WORKER;
+		enum starpu_worker_archtype arch = STARPU_ANY_WORKER;
 		if(w == 0) arch = STARPU_CUDA_WORKER;
 		if(w == 1) arch = STARPU_CPU_WORKER;
 		
@@ -373,7 +373,7 @@ void sc_hypervisor_lp_distribute_resources_in_ctxs(int* sched_ctxs, int ns, int
 		
 		for(w = 0; w < nw; w++)
 		{
-			enum starpu_archtype arch;
+			enum starpu_worker_archtype arch;
 
 #ifdef STARPU_USE_CUDA
 			int ncuda = starpu_worker_get_count_by_type(STARPU_CUDA_WORKER);
@@ -456,7 +456,7 @@ void sc_hypervisor_lp_place_resources_in_ctx(int ns, int nw, double w_in_s[ns][n
 	{
 		for(w = 0; w < nw; w++)
 		{
-			enum starpu_archtype arch = starpu_worker_get_type(w);
+			enum starpu_worker_archtype arch = starpu_worker_get_type(w);
 			
 			if(arch == STARPU_CUDA_WORKER)
 			{

+ 16 - 16
sc_hypervisor/src/policies_utils/policy_tools.c

@@ -73,7 +73,7 @@ unsigned sc_hypervisor_find_lowest_prio_sched_ctx(unsigned req_sched_ctx, int nw
 	return sched_ctx;
 }
 
-int* sc_hypervisor_get_idlest_workers_in_list(int *start, int *workers, int nall_workers,  int *nworkers, enum starpu_archtype arch)
+int* sc_hypervisor_get_idlest_workers_in_list(int *start, int *workers, int nall_workers,  int *nworkers, enum starpu_worker_archtype arch)
 {
 	int *curr_workers = (int*)malloc((*nworkers)*sizeof(int));
 
@@ -85,7 +85,7 @@ int* sc_hypervisor_get_idlest_workers_in_list(int *start, int *workers, int nall
 			break;
 
 		worker = workers == NULL ? w : workers[w];
-		enum starpu_archtype curr_arch = starpu_worker_get_type(worker);
+		enum starpu_worker_archtype curr_arch = starpu_worker_get_type(worker);
 		if(arch == STARPU_ANY_WORKER || curr_arch == arch)
 		{
 			if(w >= *start)
@@ -101,7 +101,7 @@ int* sc_hypervisor_get_idlest_workers_in_list(int *start, int *workers, int nall
 }
 
 /* get first nworkers with the highest idle time in the context */
-int* sc_hypervisor_get_idlest_workers(unsigned sched_ctx, int *nworkers, enum starpu_archtype arch)
+int* sc_hypervisor_get_idlest_workers(unsigned sched_ctx, int *nworkers, enum starpu_worker_archtype arch)
 {
 	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx);
 	struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctx);
@@ -126,7 +126,7 @@ int* sc_hypervisor_get_idlest_workers(unsigned sched_ctx, int *nworkers, enum st
 		{
 			considered = 0;
 			worker = workers->get_next(workers, &it);
-			enum starpu_archtype curr_arch = starpu_worker_get_type(worker);
+			enum starpu_worker_archtype curr_arch = starpu_worker_get_type(worker);
 			if(arch == STARPU_ANY_WORKER || curr_arch == arch)
 			{
 
@@ -176,7 +176,7 @@ int* sc_hypervisor_get_idlest_workers(unsigned sched_ctx, int *nworkers, enum st
 }
 
 /* get the number of workers in the context that are allowed to be moved (that are not fixed) */
-unsigned sc_hypervisor_get_movable_nworkers(struct sc_hypervisor_policy_config *config, unsigned sched_ctx, enum starpu_archtype arch)
+unsigned sc_hypervisor_get_movable_nworkers(struct sc_hypervisor_policy_config *config, unsigned sched_ctx, enum starpu_worker_archtype arch)
 {
 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx);
 
@@ -189,7 +189,7 @@ unsigned sc_hypervisor_get_movable_nworkers(struct sc_hypervisor_policy_config *
 	while(workers->has_next(workers, &it))
 	{
 		worker = workers->get_next(workers, &it);
-		enum starpu_archtype curr_arch = starpu_worker_get_type(worker);
+		enum starpu_worker_archtype curr_arch = starpu_worker_get_type(worker);
                 if(arch == STARPU_ANY_WORKER || curr_arch == arch)
                 {
 			if(!config->fixed_workers[worker])
@@ -300,7 +300,7 @@ unsigned sc_hypervisor_policy_resize_to_unknown_receiver(unsigned sender_sched_c
 	return sc_hypervisor_policy_resize(sender_sched_ctx, STARPU_NMAX_SCHED_CTXS, 0, now);
 }
 
-static double _get_ispeed_sample_for_type_of_worker(struct sc_hypervisor_wrapper* sc_w, enum starpu_archtype req_arch)
+static double _get_ispeed_sample_for_type_of_worker(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype req_arch)
 {
 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sc_w->sched_ctx);
         int worker;
@@ -314,7 +314,7 @@ static double _get_ispeed_sample_for_type_of_worker(struct sc_hypervisor_wrapper
         while(workers->has_next(workers, &it))
 	{
                 worker = workers->get_next(workers, &it);
-                enum starpu_archtype arch = starpu_worker_get_type(worker);
+                enum starpu_worker_archtype arch = starpu_worker_get_type(worker);
                 if(arch == req_arch)
                 {
 			struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sc_w->sched_ctx);
@@ -454,7 +454,7 @@ double sc_hypervisor_get_velocity_per_worker(struct sc_hypervisor_wrapper *sc_w,
                 double curr_time = starpu_timing_now();
 		size_t elapsed_data_used = sc_w->elapsed_data[worker];
                 double elapsed_time = (curr_time - sc_w->start_time) / 1000000.0; /* in seconds */
- 		enum starpu_archtype arch = starpu_worker_get_type(worker);
+ 		enum starpu_worker_archtype arch = starpu_worker_get_type(worker);
 		if(arch == STARPU_CUDA_WORKER)
 		{
 /* 			unsigned worker_in_ctx = starpu_sched_ctx_contains_worker(worker, sc_w->sched_ctx); */
@@ -480,7 +480,7 @@ double sc_hypervisor_get_velocity_per_worker(struct sc_hypervisor_wrapper *sc_w,
 
 }
 
-static double _get_best_elapsed_flops(struct sc_hypervisor_wrapper* sc_w, int *npus, enum starpu_archtype req_arch)
+static double _get_best_elapsed_flops(struct sc_hypervisor_wrapper* sc_w, int *npus, enum starpu_worker_archtype req_arch)
 {
 	double ret_val = 0.0;
 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sc_w->sched_ctx);
@@ -493,7 +493,7 @@ static double _get_best_elapsed_flops(struct sc_hypervisor_wrapper* sc_w, int *n
         while(workers->has_next(workers, &it))
 	{
                 worker = workers->get_next(workers, &it);
-                enum starpu_archtype arch = starpu_worker_get_type(worker);
+                enum starpu_worker_archtype arch = starpu_worker_get_type(worker);
                 if(arch == req_arch)
                 {
 			if(sc_w->elapsed_flops[worker] > ret_val)
@@ -506,7 +506,7 @@ static double _get_best_elapsed_flops(struct sc_hypervisor_wrapper* sc_w, int *n
 }
 
 /* compute an average value of the cpu/cuda velocity */
-double sc_hypervisor_get_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_archtype arch)
+double sc_hypervisor_get_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch)
 {
         int npus = 0;
         double elapsed_flops = _get_best_elapsed_flops(sc_w, &npus, arch) / 1000000000.0 ; /* in gflops */
@@ -575,7 +575,7 @@ void sc_hypervisor_group_workers_by_type(int *workers, int nworkers, int ntypes_
 
 	for(w = 0; w < current_nworkers; w++)
 	{
- 		enum starpu_archtype arch = workers == NULL ? starpu_worker_get_type(w) :
+ 		enum starpu_worker_archtype arch = workers == NULL ? starpu_worker_get_type(w) :
 			starpu_worker_get_type(workers[w]);
 		if(ntypes_of_workers == 2)
 		{
@@ -598,8 +598,8 @@ void sc_hypervisor_get_tasks_times(int nw, int nt, double times[nw][nt], int *wo
                 for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
                 {
 			int worker = workers == NULL ? w : workers[w];
-                        enum starpu_perf_archtype arch = starpu_worker_get_perf_archtype(worker);
-                        double length = starpu_history_based_expected_perf(tp->cl->model, arch, tp->footprint);
+                        enum starpu_perfmodel_archtype arch = starpu_worker_get_perf_archtype(worker);
+                        double length = starpu_permodel_history_based_expected_perf(tp->cl->model, arch, tp->footprint);
 
                         if (isnan(length))
                                 times[w][t] = NAN;
@@ -608,7 +608,7 @@ void sc_hypervisor_get_tasks_times(int nw, int nt, double times[nw][nt], int *wo
                                 times[w][t] = length / 1000.;
 
 				double transfer_time = 0.0;
-				enum starpu_archtype arch = starpu_worker_get_type(worker);
+				enum starpu_worker_archtype arch = starpu_worker_get_type(worker);
 				if(arch == STARPU_CUDA_WORKER)
 				{
 					unsigned worker_in_ctx = starpu_sched_ctx_contains_worker(worker, tp->sched_ctx_id);

+ 8 - 8
sc_hypervisor/src/sc_hypervisor.c

@@ -339,7 +339,7 @@ void sc_hypervisor_unregister_ctx(unsigned sched_ctx)
 	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 }
 
-static double _get_best_total_elapsed_flops(struct sc_hypervisor_wrapper* sc_w, int *npus, enum starpu_archtype req_arch)
+static double _get_best_total_elapsed_flops(struct sc_hypervisor_wrapper* sc_w, int *npus, enum starpu_worker_archtype req_arch)
 {
 	double ret_val = 0.0;
 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sc_w->sched_ctx);
@@ -352,7 +352,7 @@ static double _get_best_total_elapsed_flops(struct sc_hypervisor_wrapper* sc_w,
         while(workers->has_next(workers, &it))
 	{
                 worker = workers->get_next(workers, &it);
-                enum starpu_archtype arch = starpu_worker_get_type(worker);
+                enum starpu_worker_archtype arch = starpu_worker_get_type(worker);
                 if(arch == req_arch)
                 {
 			if(sc_w->total_elapsed_flops[worker] > ret_val)
@@ -365,7 +365,7 @@ static double _get_best_total_elapsed_flops(struct sc_hypervisor_wrapper* sc_w,
 }
 
 /* compute an average value of the cpu/cuda velocity */
-double sc_hypervisorsc_hypervisor_get_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_archtype arch)
+double sc_hypervisorsc_hypervisor_get_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch)
 {
         int npus = 0;
         double elapsed_flops = _get_best_total_elapsed_flops(sc_w, &npus, arch) / 1000000000.0 ; /* in gflops */
@@ -384,7 +384,7 @@ double sc_hypervisorsc_hypervisor_get_velocity_per_worker_type(struct sc_hypervi
 }
 
 /* compute an average value of the cpu/cuda old velocity */
-double sc_hypervisor_get_ref_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_archtype arch)
+double sc_hypervisor_get_ref_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch)
 {
 	double ref_velocity = 0.0;
 	unsigned nw = 0;
@@ -431,13 +431,13 @@ static void _get_cpus(int *workers, int nworkers, int *cpus, int *ncpus)
 	for(i = 0; i < nworkers; i++)
 	{
 		worker = workers[i];
-		enum starpu_archtype arch = starpu_worker_get_type(worker);
+		enum starpu_worker_archtype arch = starpu_worker_get_type(worker);
 		if(arch == STARPU_CPU_WORKER)
 			cpus[(*ncpus)++] = worker;
 	}
 }
 
-int sc_hypervisor_get_nworkers_ctx(unsigned sched_ctx, enum starpu_archtype arch)
+int sc_hypervisor_get_nworkers_ctx(unsigned sched_ctx, enum starpu_worker_archtype arch)
 {
 	int nworkers_ctx = 0;
 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx);
@@ -450,7 +450,7 @@ int sc_hypervisor_get_nworkers_ctx(unsigned sched_ctx, enum starpu_archtype arch
 	while(workers->has_next(workers, &it))
 	{
 		worker = workers->get_next(workers, &it);
-		enum starpu_archtype curr_arch = starpu_worker_get_type(worker);
+		enum starpu_worker_archtype curr_arch = starpu_worker_get_type(worker);
 		if(curr_arch == arch || arch == STARPU_ANY_WORKER)
 			nworkers_ctx++;
 	}
@@ -975,7 +975,7 @@ void sc_hypervisor_free_size_req(void)
 	}
 }
 
-double sc_hypervisor_get_velocity(struct sc_hypervisor_wrapper *sc_w, enum starpu_archtype arch)
+double sc_hypervisor_get_velocity(struct sc_hypervisor_wrapper *sc_w, enum starpu_worker_archtype arch)
 {
 
 	double velocity = sc_hypervisorsc_hypervisor_get_velocity_per_worker_type(sc_w, arch);

+ 1 - 1
socl/src/cl_enqueuemapbuffer.c

@@ -23,7 +23,7 @@ static void mapbuffer_task(void *args) {
   ev->prof_start = _socl_nanotime();
   gc_entity_release(ev);
 
-	enum starpu_access_mode mode = (cmd->map_flags == CL_MAP_READ ? STARPU_R : STARPU_RW);
+	enum starpu_data_access_mode mode = (cmd->map_flags == CL_MAP_READ ? STARPU_R : STARPU_RW);
 
 	starpu_data_acquire_cb(cmd->buffer->handle, mode, command_completed_task_callback, cmd);
 }

+ 1 - 1
src/common/fxt.c

@@ -92,7 +92,7 @@ static void _starpu_profile_set_tracefile(void *last, ...)
 	strcat(_STARPU_PROF_FILE_USER, suffix);
 }
 
-void starpu_set_profiling_id(int new_id)
+void starpu_profiling_set_id(int new_id)
 {
 	_STARPU_DEBUG("Set id to <%d>\n", new_id);
 	_starpu_id = new_id;

+ 11 - 11
src/common/hash.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,7 +21,7 @@
 
 #define _STARPU_CRC32C_POLY_BE 0x1EDC6F41
 
-static inline uint32_t __attribute__ ((pure)) starpu_crc32_be_8(uint8_t inputbyte, uint32_t inputcrc)
+static inline uint32_t __attribute__ ((pure)) starpu_crc32c_be_8(uint8_t inputbyte, uint32_t inputcrc)
 {
 	unsigned i;
 	uint32_t crc;
@@ -33,7 +33,7 @@ static inline uint32_t __attribute__ ((pure)) starpu_crc32_be_8(uint8_t inputbyt
 	return crc;
 }
 
-uint32_t starpu_crc32_be_n(void *input, size_t n, uint32_t inputcrc)
+uint32_t starpu_hash_crc32c_be_n(void *input, size_t n, uint32_t inputcrc)
 {
 	uint8_t *p = (uint8_t *)input;
 	size_t i;
@@ -41,26 +41,26 @@ uint32_t starpu_crc32_be_n(void *input, size_t n, uint32_t inputcrc)
 	uint32_t crc = inputcrc;
 
 	for (i = 0; i < n; i++)
-		crc = starpu_crc32_be_8(p[i], crc);
+		crc = starpu_crc32c_be_8(p[i], crc);
 
 	return crc;
 }
 
-uint32_t starpu_crc32_be(uint32_t input, uint32_t inputcrc)
+uint32_t starpu_hash_crc32c_be(uint32_t input, uint32_t inputcrc)
 {
 	uint8_t *p = (uint8_t *)&input;
 
 	uint32_t crc = inputcrc;
 
-	crc = starpu_crc32_be_8(p[0], crc);
-	crc = starpu_crc32_be_8(p[1], crc);
-	crc = starpu_crc32_be_8(p[2], crc);
-	crc = starpu_crc32_be_8(p[3], crc);
+	crc = starpu_crc32c_be_8(p[0], crc);
+	crc = starpu_crc32c_be_8(p[1], crc);
+	crc = starpu_crc32c_be_8(p[2], crc);
+	crc = starpu_crc32c_be_8(p[3], crc);
 
 	return crc;
 }
 
-uint32_t starpu_crc32_string(char *str, uint32_t inputcrc)
+uint32_t starpu_hash_crc32c_string(char *str, uint32_t inputcrc)
 {
 	uint32_t hash = inputcrc;
 
@@ -69,7 +69,7 @@ uint32_t starpu_crc32_string(char *str, uint32_t inputcrc)
 	unsigned i;
 	for (i = 0; i < len; i++)
 	{
-		hash = starpu_crc32_be_8((uint8_t)str[i], hash);
+		hash = starpu_crc32c_be_8((uint8_t)str[i], hash);
 	}
 
 	return hash;

+ 1 - 1
src/core/combined_workers.c

@@ -95,7 +95,7 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 		&config->combined_workers[combined_worker_id];
 
 	combined_worker->worker_size = nworkers;
-	combined_worker->perf_arch = (enum starpu_perf_archtype) (STARPU_CPU_DEFAULT + nworkers - 1);
+	combined_worker->perf_arch = (enum starpu_perfmodel_archtype) (STARPU_CPU_DEFAULT + nworkers - 1);
 	combined_worker->worker_mask = STARPU_CPU;
 
 	/* We assume that the memory node should either be that of the first

+ 8 - 8
src/core/dependencies/data_concurrency.c

@@ -61,7 +61,7 @@ static struct _starpu_data_requester *may_unlock_data_req_list_head(starpu_data_
 	/* data->current_mode == STARPU_R, so we can process more readers */
 	struct _starpu_data_requester *r = _starpu_data_requester_list_front(req_list);
 
-	enum starpu_access_mode r_mode = r->mode;
+	enum starpu_data_access_mode r_mode = r->mode;
 	if (r_mode == STARPU_RW)
 		r_mode = STARPU_W;
 
@@ -79,7 +79,7 @@ static struct _starpu_data_requester *may_unlock_data_req_list_head(starpu_data_
  * with the current mode, the request is put in the per-handle list of
  * "requesters", and this function returns 1. */
 static unsigned _starpu_attempt_to_submit_data_request(unsigned request_from_codelet,
-						       starpu_data_handle_t handle, enum starpu_access_mode mode,
+						       starpu_data_handle_t handle, enum starpu_data_access_mode mode,
 						       void (*callback)(void *), void *argcb,
 						       struct _starpu_job *j, unsigned buffer_index)
 {
@@ -117,7 +117,7 @@ static unsigned _starpu_attempt_to_submit_data_request(unsigned request_from_cod
 	 * current one, we can proceed. */
 	unsigned put_in_list = 1;
 
-	enum starpu_access_mode previous_mode = handle->current_mode;
+	enum starpu_data_access_mode previous_mode = handle->current_mode;
 
 	if (!frozen && ((handle->refcnt == 0) || (!(mode == STARPU_W) && (handle->current_mode == mode))))
 	{
@@ -182,7 +182,7 @@ static unsigned _starpu_attempt_to_submit_data_request(unsigned request_from_cod
 
 }
 
-unsigned _starpu_attempt_to_submit_data_request_from_apps(starpu_data_handle_t handle, enum starpu_access_mode mode,
+unsigned _starpu_attempt_to_submit_data_request_from_apps(starpu_data_handle_t handle, enum starpu_data_access_mode mode,
 							  void (*callback)(void *), void *argcb)
 {
 	return _starpu_attempt_to_submit_data_request(0, handle, mode, callback, argcb, NULL, 0);
@@ -193,7 +193,7 @@ static unsigned attempt_to_submit_data_request_from_job(struct _starpu_job *j, u
 	/* Note that we do not access j->task->handles, but j->ordered_buffers
 	 * which is a sorted copy of it. */
 	starpu_data_handle_t handle = _STARPU_JOB_GET_ORDERED_BUFFER_HANDLE(j, buffer_index);
-	enum starpu_access_mode mode = _STARPU_JOB_GET_ORDERED_BUFFER_MODE(j, buffer_index);
+	enum starpu_data_access_mode mode = _STARPU_JOB_GET_ORDERED_BUFFER_MODE(j, buffer_index);
 
 	return _starpu_attempt_to_submit_data_request(1, handle, mode, NULL, NULL, j, buffer_index);
 }
@@ -245,7 +245,7 @@ unsigned _starpu_submit_job_enforce_data_deps(struct _starpu_job *j)
 	{
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(j->task, i);
 		_STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(j, handle, i);
-		enum starpu_access_mode mode = STARPU_CODELET_GET_MODE(j->task->cl, i);
+		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(j->task->cl, i);
 		_STARPU_JOB_SET_ORDERED_BUFFER_MODE(j, mode, i);
 	}
 
@@ -297,7 +297,7 @@ int _starpu_notify_data_dependencies(starpu_data_handle_t handle)
 	while ((r = may_unlock_data_req_list_head(handle)))
 	{
 		/* STARPU_RW accesses are treated as STARPU_W */
-		enum starpu_access_mode r_mode = r->mode;
+		enum starpu_data_access_mode r_mode = r->mode;
 		if (r_mode == STARPU_RW)
 			r_mode = STARPU_W;
 
@@ -328,7 +328,7 @@ int _starpu_notify_data_dependencies(starpu_data_handle_t handle)
 			handle->refcnt++;
 			handle->busy_count++;
 
-			enum starpu_access_mode previous_mode = handle->current_mode;
+			enum starpu_data_access_mode previous_mode = handle->current_mode;
 			handle->current_mode = r_mode;
 
 			/* In case we enter in a reduction mode, we invalidate all per

+ 1 - 1
src/core/dependencies/data_concurrency.h

@@ -25,7 +25,7 @@ unsigned _starpu_submit_job_enforce_data_deps(struct _starpu_job *j);
 int _starpu_notify_data_dependencies(starpu_data_handle_t handle);
 
 unsigned _starpu_attempt_to_submit_data_request_from_apps(starpu_data_handle_t handle,
-							  enum starpu_access_mode mode,
+							  enum starpu_data_access_mode mode,
 							  void (*callback)(void *), void *argcb);
 
 #endif // __DATA_CONCURRENCY_H__

+ 5 - 5
src/core/dependencies/implicit_data_deps.c

@@ -228,7 +228,7 @@ static void _starpu_add_writer_after_writer(starpu_data_handle_t handle, struct
 /* NB : handle->sequential_consistency_mutex must be hold by the caller;
  * returns a task, to be submitted after releasing that mutex. */
 struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task,
-						   starpu_data_handle_t handle, enum starpu_access_mode mode)
+						   starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 {
 	struct starpu_task *task = NULL;
 
@@ -259,7 +259,7 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 			_starpu_bound_task_dep(post_sync_job, pre_sync_job);
 		}
 
-		enum starpu_access_mode previous_mode = handle->last_submitted_mode;
+		enum starpu_data_access_mode previous_mode = handle->last_submitted_mode;
 
 		if (mode & STARPU_W)
 		{
@@ -337,7 +337,7 @@ void _starpu_detect_implicit_data_deps(struct starpu_task *task)
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, buffer);
-		enum starpu_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, buffer);
+		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, buffer);
 		struct starpu_task *new_task;
 
 		/* Scratch memory does not introduce any deps */
@@ -457,7 +457,7 @@ void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *tas
 void _starpu_release_task_enforce_sequential_consistency(struct _starpu_job *j)
 {
 	struct starpu_task *task = j->task;
-        struct starpu_buffer_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
+        struct starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
 
 	if (!task->cl)
 		return;
@@ -548,7 +548,7 @@ void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle)
 
 /* If sequential consistency mode is enabled, this function blocks until the
  * handle is available in the requested access mode. */
-int _starpu_data_wait_until_available(starpu_data_handle_t handle, enum starpu_access_mode mode)
+int _starpu_data_wait_until_available(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 {
 	/* If sequential consistency is enabled, wait until data is available */
 	STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);

+ 2 - 2
src/core/dependencies/implicit_data_deps.h

@@ -22,7 +22,7 @@
 #include <common/config.h>
 
 struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task,
-						   starpu_data_handle_t handle, enum starpu_access_mode mode);
+						   starpu_data_handle_t handle, enum starpu_data_access_mode mode);
 void _starpu_detect_implicit_data_deps(struct starpu_task *task);
 void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *task, starpu_data_handle_t handle);
 void _starpu_release_task_enforce_sequential_consistency(struct _starpu_job *j);
@@ -31,7 +31,7 @@ void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data
 void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle);
 
 /* This function blocks until the handle is available in the requested mode */
-int _starpu_data_wait_until_available(starpu_data_handle_t handle, enum starpu_access_mode mode);
+int _starpu_data_wait_until_available(starpu_data_handle_t handle, enum starpu_data_access_mode mode);
 
 #endif // __IMPLICIT_DATA_DEPS_H__
 

+ 1 - 1
src/core/jobs.c

@@ -53,7 +53,7 @@ struct _starpu_job* __attribute__((malloc)) _starpu_job_create(struct starpu_tas
 	memset(job, 0, sizeof(*job));
 
 	if (task->dyn_handles)
-	     job->dyn_ordered_buffers = malloc(task->cl->nbuffers * sizeof(struct starpu_buffer_descr));
+	     job->dyn_ordered_buffers = malloc(task->cl->nbuffers * sizeof(struct starpu_data_descr));
 
 	job->task = task;
 

+ 3 - 3
src/core/jobs.h

@@ -69,8 +69,8 @@ LIST_TYPE(_starpu_job,
 	/* To avoid deadlocks, we reorder the different buffers accessed to by
 	 * the task so that we always grab the rw-lock associated to the
 	 * handles in the same order. */
-	struct starpu_buffer_descr ordered_buffers[STARPU_NMAXBUFS];
-	struct starpu_buffer_descr *dyn_ordered_buffers;
+	struct starpu_data_descr ordered_buffers[STARPU_NMAXBUFS];
+	struct starpu_data_descr *dyn_ordered_buffers;
 
 	/* If a tag is associated to the job, this points to the internal data
 	 * structure that describes the tag status. */
@@ -161,7 +161,7 @@ unsigned _starpu_enforce_deps_starting_from_task(struct _starpu_job *j);
 void _starpu_handle_job_termination(struct _starpu_job *j);
 
 /* Get the sum of the size of the data accessed by the job. */
-size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, struct _starpu_job *j);
+size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, unsigned nimpl, struct _starpu_job *j);
 
 /* Get a task from the local pool of tasks that were explicitly attributed to
  * that worker. */

+ 15 - 15
src/core/perfmodel/perfmodel.c

@@ -48,7 +48,7 @@ unsigned _starpu_get_calibrate_flag(void)
 	return calibrate_flag;
 }
 
-enum starpu_perf_archtype starpu_worker_get_perf_archtype(int workerid)
+enum starpu_perfmodel_archtype starpu_worker_get_perf_archtype(int workerid)
 {
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
 
@@ -68,11 +68,11 @@ enum starpu_perf_archtype starpu_worker_get_perf_archtype(int workerid)
  * PER ARCH model
  */
 
-static double per_arch_task_expected_perf(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, struct starpu_task *task, unsigned nimpl)
+static double per_arch_task_expected_perf(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, struct starpu_task *task, unsigned nimpl)
 {
 	double exp = NAN;
-	double (*per_arch_cost_function)(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
-	double (*per_arch_cost_model)(struct starpu_buffer_descr *);
+	double (*per_arch_cost_function)(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
+	double (*per_arch_cost_model)(struct starpu_data_descr *);
 
 	per_arch_cost_function = model->per_arch[arch][nimpl].cost_function;
 	per_arch_cost_model = model->per_arch[arch][nimpl].cost_model;
@@ -89,7 +89,7 @@ static double per_arch_task_expected_perf(struct starpu_perfmodel *model, enum s
  * Common model
  */
 
-double starpu_worker_get_relative_speedup(enum starpu_perf_archtype perf_archtype)
+double starpu_worker_get_relative_speedup(enum starpu_perfmodel_archtype perf_archtype)
 {
 	if (perf_archtype < STARPU_CUDA_DEFAULT)
 	{
@@ -110,7 +110,7 @@ double starpu_worker_get_relative_speedup(enum starpu_perf_archtype perf_archtyp
 	return NAN;
 }
 
-static double common_task_expected_perf(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, struct starpu_task *task, unsigned nimpl)
+static double common_task_expected_perf(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, struct starpu_task *task, unsigned nimpl)
 {
 	double exp;
 	double alpha;
@@ -170,7 +170,7 @@ void _starpu_load_perfmodel(struct starpu_perfmodel *model)
 	model->is_loaded = 1;
 }
 
-static double starpu_model_expected_perf(struct starpu_task *task, struct starpu_perfmodel *model, enum starpu_perf_archtype arch,  unsigned nimpl)
+static double starpu_model_expected_perf(struct starpu_task *task, struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch,  unsigned nimpl)
 {
 	if (model)
 	{
@@ -203,19 +203,19 @@ static double starpu_model_expected_perf(struct starpu_task *task, struct starpu
 	return 0.0;
 }
 
-double starpu_task_expected_length(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+double starpu_task_expected_length(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 {
 
 	return starpu_model_expected_perf(task, task->cl->model, arch, nimpl);
 }
 
-double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+double starpu_task_expected_power(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 {
 	return starpu_model_expected_perf(task, task->cl->power_model, arch, nimpl);
 }
 
 double starpu_task_expected_conversion_time(struct starpu_task *task,
-					    enum starpu_perf_archtype arch,
+					    enum starpu_perfmodel_archtype arch,
 					    unsigned nimpl)
 {
 	unsigned i;
@@ -255,7 +255,7 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 }
 
 /* Predict the transfer time (in µs) to move a handle to a memory node */
-double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned memory_node, enum starpu_access_mode mode)
+double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned memory_node, enum starpu_data_access_mode mode)
 {
 	/* If we don't need to read the content of the handle */
 	if (!(mode & STARPU_R))
@@ -288,7 +288,7 @@ double starpu_task_expected_data_transfer_time(unsigned memory_node, struct star
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, buffer);
-		enum starpu_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, buffer);
+		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, buffer);
 
 		penalty += starpu_data_expected_transfer_time(handle, memory_node, mode);
 	}
@@ -297,7 +297,7 @@ double starpu_task_expected_data_transfer_time(unsigned memory_node, struct star
 }
 
 /* Return the expected duration of the entire task bundle in µs */
-double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum starpu_perf_archtype arch, unsigned nimpl)
+double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 {
 	double expected_length = 0.0;
 
@@ -328,7 +328,7 @@ double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum star
 }
 
 /* Return the expected power consumption of the entire task bundle in J */
-double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, enum starpu_perf_archtype arch, unsigned nimpl)
+double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 {
 	double expected_power = 0.0;
 
@@ -376,7 +376,7 @@ double starpu_task_bundle_expected_data_transfer_time(starpu_task_bundle_t bundl
 			for (b = 0; b < task->cl->nbuffers; b++)
 			{
 				starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, b);
-				enum starpu_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, b);
+				enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, b);
 
 				if (!(mode & STARPU_R))
 					continue;

+ 6 - 6
src/core/perfmodel/perfmodel.h

@@ -30,16 +30,16 @@ struct _starpu_perfmodel_list
 	struct starpu_perfmodel *model;
 };
 
-struct starpu_buffer_descr;
+struct starpu_data_descr;
 struct _starpu_job;
-enum starpu_perf_archtype;
+enum starpu_perfmodel_archtype;
 
 void _starpu_get_perf_model_dir(char *path, size_t maxlen);
 void _starpu_get_perf_model_dir_codelets(char *path, size_t maxlen);
 void _starpu_get_perf_model_dir_bus(char *path, size_t maxlen);
 void _starpu_get_perf_model_dir_debug(char *path, size_t maxlen);
 
-double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, struct _starpu_job *j, unsigned nimpl);
+double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, struct _starpu_job *j, unsigned nimpl);
 int _starpu_register_model(struct starpu_perfmodel *model);
 void _starpu_load_per_arch_based_model(struct starpu_perfmodel *model);
 void _starpu_load_common_based_model(struct starpu_perfmodel *model);
@@ -49,10 +49,10 @@ void _starpu_initialize_registered_performance_models(void);
 void _starpu_deinitialize_registered_performance_models(void);
 
 double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model,
-					enum starpu_perf_archtype arch, struct _starpu_job *j, unsigned nimpl);
+					enum starpu_perfmodel_archtype arch, struct _starpu_job *j, unsigned nimpl);
 double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfmodel *model,
-					enum starpu_perf_archtype arch, struct _starpu_job *j, unsigned nimpl);
-void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, enum starpu_perf_archtype arch,
+					enum starpu_perfmodel_archtype arch, struct _starpu_job *j, unsigned nimpl);
+void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch,
 				unsigned cpuid, double measured, unsigned nimpl);
 
 void _starpu_create_sampling_directory_if_needed(void);

+ 10 - 10
src/core/perfmodel/perfmodel_history.c

@@ -52,7 +52,7 @@ struct starpu_perfmodel_history_table
 static starpu_pthread_rwlock_t registered_models_rwlock;
 static struct _starpu_perfmodel_list *registered_models = NULL;
 
-size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, struct _starpu_job *j)
+size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, unsigned nimpl, struct _starpu_job *j)
 {
 	struct starpu_task *task = j->task;
 
@@ -391,7 +391,7 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, un
 
 	/* header */
 	char archname[32];
-	starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) arch, archname, 32, nimpl);
+	starpu_perfmodel_get_arch_name((enum starpu_perfmodel_archtype) arch, archname, 32, nimpl);
 	fprintf(f, "# Model for %s\n", archname);
 	fprintf(f, "# number of entries\n%u\n", nentries);
 
@@ -996,7 +996,7 @@ int starpu_perfmodel_unload_model(struct starpu_perfmodel *model)
 	return 0;
 }
 
-void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch, char *archname, size_t maxlen,unsigned nimpl)
+void starpu_perfmodel_get_arch_name(enum starpu_perfmodel_archtype arch, char *archname, size_t maxlen,unsigned nimpl)
 {
 	if (arch < STARPU_CUDA_DEFAULT)
 	{
@@ -1031,7 +1031,7 @@ void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch, char *archna
 }
 
 void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model,
-				    enum starpu_perf_archtype arch, char *path, size_t maxlen, unsigned nimpl)
+				    enum starpu_perfmodel_archtype arch, char *path, size_t maxlen, unsigned nimpl)
 {
 	char archname[32];
 	starpu_perfmodel_get_arch_name(arch, archname, 32, nimpl);
@@ -1041,7 +1041,7 @@ void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model,
 	get_model_debug_path(model, archname, path, maxlen);
 }
 
-double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, struct _starpu_job *j, unsigned nimpl)
+double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, struct _starpu_job *j, unsigned nimpl)
 {
 	double exp = NAN;
 	size_t size = _starpu_job_get_data_size(model, arch, nimpl, j);
@@ -1055,7 +1055,7 @@ double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model
 	return exp;
 }
 
-double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, struct _starpu_job *j,unsigned nimpl)
+double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, struct _starpu_job *j,unsigned nimpl)
 {
 	double exp = NAN;
 	size_t size = _starpu_job_get_data_size(model, arch, nimpl, j);
@@ -1098,7 +1098,7 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 	return exp;
 }
 
-double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, struct _starpu_job *j,unsigned nimpl)
+double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, struct _starpu_job *j,unsigned nimpl)
 {
 	double exp;
 	struct starpu_perfmodel_per_arch *per_arch_model;
@@ -1142,7 +1142,7 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, e
 	return exp;
 }
 
-double starpu_history_based_expected_perf(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, uint32_t footprint)
+double starpu_permodel_history_based_expected_perf(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, uint32_t footprint)
 {
 	struct _starpu_job j =
 		{
@@ -1152,7 +1152,7 @@ double starpu_history_based_expected_perf(struct starpu_perfmodel *model, enum s
 	return _starpu_history_based_job_expected_perf(model, arch, &j, j.nimpl);
 }
 
-void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned cpuid STARPU_ATTRIBUTE_UNUSED, double measured, unsigned nimpl)
+void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, unsigned cpuid STARPU_ATTRIBUTE_UNUSED, double measured, unsigned nimpl)
 {
 	if (model)
 	{
@@ -1280,7 +1280,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 	}
 }
 
-void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perf_archtype arch, unsigned cpuid, unsigned nimpl, double measured)
+void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned cpuid, unsigned nimpl, double measured)
 {
 	struct _starpu_job *job = _starpu_get_job_associated_to_task(task);
 

+ 6 - 6
src/core/perfmodel/perfmodel_print.c

@@ -61,7 +61,7 @@ void _starpu_perfmodel_print_history_based(struct starpu_perfmodel_per_arch *per
 	}
 }
 
-void starpu_perfmodel_print(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output)
+void starpu_perfmodel_print(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output)
 {
 	struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch][nimpl];
 	char archname[32];
@@ -177,7 +177,7 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 		{
 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
 			{ /* Display all codelets on each arch */
-				starpu_perfmodel_print(model, (enum starpu_perf_archtype) archid, implid, parameter, footprint, output);
+				starpu_perfmodel_print(model, (enum starpu_perfmodel_archtype) archid, implid, parameter, footprint, output);
 			}
 		}
 	}
@@ -203,7 +203,7 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 
 			unsigned implid;
 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
-				starpu_perfmodel_print(model, (enum starpu_perf_archtype) (STARPU_CPU_DEFAULT + k - 1), implid, parameter, footprint, output);
+				starpu_perfmodel_print(model, (enum starpu_perfmodel_archtype) (STARPU_CPU_DEFAULT + k - 1), implid, parameter, footprint, output);
 			return 0;
 		}
 
@@ -216,9 +216,9 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 				for (implid = 0; implid <STARPU_MAXIMPLEMENTATIONS; implid ++)
 				{
 					char archname[32];
-					starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) archid, archname, 32, implid);
+					starpu_perfmodel_get_arch_name((enum starpu_perfmodel_archtype) archid, archname, 32, implid);
 					fprintf(output, "performance model for %s\n", archname);
-					starpu_perfmodel_print(model, (enum starpu_perf_archtype) archid, implid, parameter, footprint, output);
+					starpu_perfmodel_print(model, (enum starpu_perfmodel_archtype) archid, implid, parameter, footprint, output);
 				}
 			}
 			return 0;
@@ -233,7 +233,7 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 			int archid = STARPU_CUDA_DEFAULT+ gpuid;
 			unsigned implid;
 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
-				starpu_perfmodel_print(model, (enum starpu_perf_archtype) archid, implid, parameter, footprint, output);
+				starpu_perfmodel_print(model, (enum starpu_perfmodel_archtype) archid, implid, parameter, footprint, output);
 			return 0;
 		}
 

+ 202 - 3
src/core/sched_ctx.c

@@ -215,6 +215,33 @@ static void _starpu_sched_ctx_free_scheduling_data(struct _starpu_sched_ctx *sch
 
 }
 
+#ifdef STARPU_HAVE_HWLOC
+/* Build the hwloc cpu set of a scheduling context: allocate
+ * sched_ctx->hwloc_workers_set and OR into it the initial_hwloc_cpu_set of
+ * every worker of the context that is not a combined worker. */
+static void _starpu_sched_ctx_create_hwloc_tree(struct _starpu_sched_ctx *sched_ctx)
+{
+	struct _starpu_machine_config *machine = _starpu_get_machine_config();
+	struct starpu_worker_collection *collection = sched_ctx->workers;
+	struct starpu_sched_ctx_iterator iter;
+
+	sched_ctx->hwloc_workers_set = hwloc_bitmap_alloc();
+
+	if(collection->init_iterator)
+		collection->init_iterator(collection, &iter);
+
+	while(collection->has_next(collection, &iter))
+	{
+		int workerid = collection->get_next(collection, &iter);
+
+		/* combined workers do not contribute to the set */
+		if(starpu_worker_is_combined_worker(workerid))
+			continue;
+
+		hwloc_bitmap_or(sched_ctx->hwloc_workers_set,
+				sched_ctx->hwloc_workers_set,
+				machine->workers[workerid].initial_hwloc_cpu_set);
+	}
+}
+#endif
+
 struct _starpu_sched_ctx*  _starpu_create_sched_ctx(const char *policy_name, int *workerids,
 				  int nworkers_ctx, unsigned is_initial_sched,
 				  const char *sched_name)
@@ -247,6 +274,7 @@ struct _starpu_sched_ctx*  _starpu_create_sched_ctx(const char *policy_name, int
 	sched_ctx->finished_submit = 0;
 	sched_ctx->min_priority = 0;
 	sched_ctx->max_priority = 1;
+	sem_init(&sched_ctx->parallel_code_sem, 0, 0);
 
 	_starpu_barrier_counter_init(&sched_ctx->tasks_barrier, 0);
 
@@ -259,6 +287,10 @@ struct _starpu_sched_ctx*  _starpu_create_sched_ctx(const char *policy_name, int
 	/* after having a worker_collection on the resources, add them */
 	_starpu_add_workers_to_sched_ctx(sched_ctx, workerids, nworkers_ctx, NULL, NULL);
 
+#ifdef STARPU_HAVE_HWLOC
+	/* build hwloc tree of the context */
+	_starpu_sched_ctx_create_hwloc_tree(sched_ctx);
+#endif //STARPU_HAVE_HWLOC
 
 	/* if we create the initial big sched ctx we can update workers' status here
 	   because they haven't been launched yet */
@@ -285,7 +317,7 @@ struct _starpu_sched_ctx*  _starpu_create_sched_ctx(const char *policy_name, int
 	return sched_ctx;
 }
 
-static void _get_workers(int min, int max, int *workers, int *nw, enum starpu_archtype arch, unsigned allow_overlap)
+static void _get_workers(int min, int max, int *workers, int *nw, enum starpu_worker_archtype arch, unsigned allow_overlap)
 {
 	int pus[max];
 	int npus = 0;
@@ -461,6 +493,7 @@ static void _starpu_delete_sched_ctx(struct _starpu_sched_ctx *sched_ctx)
 	sched_ctx->sched_policy = NULL;
 
 	STARPU_PTHREAD_MUTEX_DESTROY(&sched_ctx->empty_ctx_mutex);
+	sem_destroy(&sched_ctx->parallel_code_sem);
 	sched_ctx->id = STARPU_NMAX_SCHED_CTXS;
 
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
@@ -862,7 +895,7 @@ struct starpu_worker_collection* starpu_sched_ctx_get_worker_collection(unsigned
 	return sched_ctx->workers;
 }
 
-int starpu_get_workers_of_sched_ctx(unsigned sched_ctx_id, int *pus, enum starpu_archtype arch)
+int starpu_get_workers_of_sched_ctx(unsigned sched_ctx_id, int *pus, enum starpu_worker_archtype arch)
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 
@@ -877,7 +910,7 @@ int starpu_get_workers_of_sched_ctx(unsigned sched_ctx_id, int *pus, enum starpu
 	while(workers->has_next(workers, &it))
 	{
 		worker = workers->get_next(workers, &it);
-		enum starpu_archtype curr_arch = starpu_worker_get_type(worker);
+		enum starpu_worker_archtype curr_arch = starpu_worker_get_type(worker);
 		if(curr_arch == arch)
 			pus[npus++] = worker;
 	}
@@ -1102,3 +1135,169 @@ int starpu_sched_ctx_set_max_priority(unsigned sched_ctx_id, int max_prio)
 	sched_ctx->max_priority = max_prio;
 	return 0;
 }
+
+/* Bind the calling thread to the hwloc cpu set of the given scheduling
+ * context (sched_ctx->hwloc_workers_set). Nothing is done when hwloc reports
+ * no support for binding the current thread; a failed binding aborts.
+ * Without hwloc this function only emits a compile-time warning. */
+static void _starpu_sched_ctx_bind_thread_to_ctx_cpus(unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+
+#ifdef STARPU_HAVE_HWLOC
+	const struct hwloc_topology_support *support = hwloc_topology_get_support(config->topology.hwtopology);
+
+	if (!support->cpubind->set_thisthread_cpubind)
+		return;
+
+	if (hwloc_set_cpubind(config->topology.hwtopology,
+			      sched_ctx->hwloc_workers_set,
+			      HWLOC_CPUBIND_THREAD))
+	{
+		perror("binding thread");
+		STARPU_ABORT();
+	}
+#else
+#warning no sched ctx CPU binding support
+#endif
+}
+
+/* Bind the calling thread back to the single cpu `cpuid'.
+ *
+ * Does nothing under SimGrid or when the STARPU_WORKERS_NOBIND environment
+ * variable is set to a positive value. Depending on what is available, the
+ * binding is done through hwloc, pthread_setaffinity_np() or the Windows
+ * SetThreadAffinityMask(); a failed binding aborts (or reports an error on
+ * Windows). */
+void _starpu_sched_ctx_rebind_thread_to_its_cpu(unsigned cpuid)
+{
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+
+#ifdef STARPU_SIMGRID
+	return;
+#endif
+	if (starpu_get_env_number("STARPU_WORKERS_NOBIND") > 0)
+		return;
+
+#ifdef STARPU_HAVE_HWLOC
+	const struct hwloc_topology_support *support = hwloc_topology_get_support (config->topology.hwtopology);
+	if (support->cpubind->set_thisthread_cpubind)
+	{
+		hwloc_obj_t obj = hwloc_get_obj_by_depth (config->topology.hwtopology,
+							  config->cpu_depth, cpuid);
+		/* obj->cpuset belongs to the topology and must not be
+		 * modified: singlify a private copy instead, otherwise the
+		 * topology object is permanently reduced to one PU. */
+		hwloc_bitmap_t set = hwloc_bitmap_dup(obj->cpuset);
+		int ret;
+
+		hwloc_bitmap_singlify(set);
+		ret = hwloc_set_cpubind (config->topology.hwtopology, set,
+					 HWLOC_CPUBIND_THREAD);
+		if (ret)
+		{
+			perror("hwloc_set_cpubind");
+			STARPU_ABORT();
+		}
+		/* released only after the check so that errno is intact for perror */
+		hwloc_bitmap_free(set);
+	}
+
+#elif defined(HAVE_PTHREAD_SETAFFINITY_NP) && defined(__linux__)
+	int ret;
+	/* fix the thread on the correct cpu */
+	cpu_set_t aff_mask;
+	CPU_ZERO(&aff_mask);
+	CPU_SET(cpuid, &aff_mask);
+
+	starpu_pthread_t self = pthread_self();
+
+	ret = pthread_setaffinity_np(self, sizeof(aff_mask), &aff_mask);
+	if (ret)
+	{
+		perror("binding thread");
+		STARPU_ABORT();
+	}
+
+#elif defined(__MINGW32__) || defined(__CYGWIN__)
+	/* shift an unsigned DWORD: `1 << cpuid' is a signed int shift, which
+	 * is undefined for cpuid == 31 */
+	DWORD mask = (DWORD)1 << cpuid;
+	if (!SetThreadAffinityMask(GetCurrentThread(), mask))
+	{
+		_STARPU_ERROR("SetThreadMaskAffinity(%lx) failed\n", mask);
+	}
+#else
+#warning no CPU binding support
+#endif
+
+}
+
+/* Ask every worker of the context to enter its parallel section, then wait
+ * until all of them have acknowledged.
+ *
+ * Each worker gets parallel_sect set to 1 under its sched_mutex. The caller
+ * then waits on sched_ctx->parallel_code_sem once per flagged worker; the
+ * semaphore is posted by _starpu_sched_ctx_signal_worker_blocked()
+ * (presumably called by each worker once it is blocked -- the call site is
+ * not in this file section). */
+static void _starpu_sched_ctx_get_workers_to_sleep(unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+
+	struct starpu_worker_collection *workers = sched_ctx->workers;
+	struct starpu_sched_ctx_iterator it;
+	struct _starpu_worker *worker = NULL;
+	unsigned nflagged = 0;
+	unsigned i;
+	if(workers->init_iterator)
+		workers->init_iterator(workers, &it);
+
+	while(workers->has_next(workers, &it))
+	{
+		worker = _starpu_get_worker_struct(workers->get_next(workers, &it));
+		STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
+		worker->parallel_sect = 1;
+		STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
+		nflagged++;
+	}
+
+	/* The iterator is exhausted at this point, so walking it a second
+	 * time would never wait at all: wait once per worker flagged above. */
+	for(i = 0; i < nflagged; i++)
+		sem_wait(&sched_ctx->parallel_code_sem);
+	return;
+}
+
+/* Post parallel_code_sem of every scheduling context the worker `workerid'
+ * is attached to, skipping empty slots, contexts whose id is
+ * STARPU_NMAX_SCHED_CTXS and the context with id 0. */
+void _starpu_sched_ctx_signal_worker_blocked(int workerid)
+{
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+	unsigned slot;
+
+	for(slot = 0; slot < STARPU_NMAX_SCHED_CTXS; slot++)
+	{
+		struct _starpu_sched_ctx *ctx = worker->sched_ctx[slot];
+
+		if(ctx == NULL)
+			continue;
+		if(ctx->id == STARPU_NMAX_SCHED_CTXS || ctx->id == 0)
+			continue;
+
+		sem_post(&ctx->parallel_code_sem);
+	}
+}
+
+/* Signal parallel_sect_cond of every worker of the context, each time while
+ * holding that worker's parallel_sect_mutex. */
+static void _starpu_sched_ctx_wake_up_workers(unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	struct starpu_worker_collection *collection = ctx->workers;
+	struct starpu_sched_ctx_iterator iter;
+
+	if(collection->init_iterator)
+		collection->init_iterator(collection, &iter);
+
+	while(collection->has_next(collection, &iter))
+	{
+		int workerid = collection->get_next(collection, &iter);
+		struct _starpu_worker *w = _starpu_get_worker_struct(workerid);
+
+		STARPU_PTHREAD_MUTEX_LOCK(&w->parallel_sect_mutex);
+		STARPU_PTHREAD_COND_SIGNAL(&w->parallel_sect_cond);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&w->parallel_sect_mutex);
+	}
+}
+
+/* Run func(param) in the calling thread on the cpus of the context
+ * `sched_ctx_id' and return whatever func returned.
+ * The steps are: put the context's workers to sleep, bind the calling
+ * thread to the context's cpu set, call func, then wake the workers up. */
+void* starpu_sched_ctx_exec_parallel_code(void* (*func)(void*), void* param, unsigned sched_ctx_id)
+{
+	void *result;
+
+	/* 1. the StarPU workers of the context go to sleep */
+	_starpu_sched_ctx_get_workers_to_sleep(sched_ctx_id);
+
+	/* 2. the calling thread may now run on any cpu of the context */
+	_starpu_sched_ctx_bind_thread_to_ctx_cpus(sched_ctx_id);
+
+	/* 3. user-provided parallel code */
+	result = func(param);
+
+	/* 4. the StarPU workers resume */
+	_starpu_sched_ctx_wake_up_workers(sched_ctx_id);
+
+	return result;
+}
+
+

+ 21 - 1
src/core/sched_ctx.h

@@ -23,6 +23,11 @@
 #include <common/config.h>
 #include <common/barrier_counter.h>
 #include <profiling/profiling.h>
+#include <semaphore.h>
+
+#ifdef STARPU_HAVE_HWLOC
+#include <hwloc.h>
+#endif
 
 #define NO_RESIZE -1
 #define REQ_RESIZE 0
@@ -92,6 +97,15 @@ struct _starpu_sched_ctx
      	int min_priority;
 	int max_priority;
 
+	/* semaphore that blocks the application thread until the threads are
+	   ready to execute the parallel code */
+	sem_t parallel_code_sem;
+
+	/* hwloc tree structure of workers */
+#ifdef STARPU_HAVE_HWLOC
+	hwloc_bitmap_t hwloc_workers_set;
+#endif
+
 #ifdef STARPU_USE_SC_HYPERVISOR
 	/* a structure containing a series of performance counters determining the resize procedure */
 	struct starpu_sched_ctx_performance_counters *perf_counters;
@@ -135,7 +149,7 @@ starpu_pthread_mutex_t *_starpu_get_sched_mutex(struct _starpu_sched_ctx *sched_
 
 /* Get the workers belonging to a certain context; returns the number of workers.
  Take care: no mutex is taken, so the list of workers might not be up to date */
-int starpu_get_workers_of_sched_ctx(unsigned sched_ctx_id, int *pus, enum starpu_archtype arch);
+int starpu_get_workers_of_sched_ctx(unsigned sched_ctx_id, int *pus, enum starpu_worker_archtype arch);
 
 /* Let the worker know it does not belong to the context and that
    it should stop popping from it */
@@ -147,6 +161,12 @@ unsigned _starpu_worker_belongs_to_a_sched_ctx(int workerid, unsigned sched_ctx_
 /* mutex synchronising several simultaneous modifications of a context */
 starpu_pthread_mutex_t* _starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id);
 
+/* rebind each thread to its cpu after the parallel code has finished */
+void _starpu_sched_ctx_rebind_thread_to_its_cpu(unsigned cpuid);
+
+/* let the application know that the worker is blocked, so that the parallel code can be executed */
+void _starpu_sched_ctx_signal_worker_blocked(int workerid);
+
 #ifdef STARPU_USE_SC_HYPERVISOR
 /* Notifies the hypervisor that a task was popped from the workers' list */
 void _starpu_sched_ctx_call_poped_task_cb(int workerid, struct starpu_task *task, size_t data_size, uint32_t footprint);

+ 1 - 1
src/core/sched_policy.c

@@ -690,7 +690,7 @@ pick:
 profiling:
 	if (profiling)
 	{
-		struct starpu_task_profiling_info *profiling_info;
+		struct starpu_profiling_task_info *profiling_info;
 		profiling_info = task->profiling_info;
 
 		/* The task may have been created before profiling was enabled,

+ 1 - 1
src/core/simgrid.c

@@ -97,7 +97,7 @@ int main(int argc, char **argv)
 }
 
 /* Task execution submitted by StarPU */
-void _starpu_simgrid_execute_job(struct _starpu_job *j, enum starpu_perf_archtype perf_arch, double length)
+void _starpu_simgrid_execute_job(struct _starpu_job *j, enum starpu_perfmodel_archtype perf_arch, double length)
 {
 	struct starpu_task *task = j->task;
 	msg_task_t simgrid_task;

+ 1 - 1
src/core/simgrid.h

@@ -30,7 +30,7 @@ struct _starpu_pthread_args
 
 #define MAX_TSD 16
 
-void _starpu_simgrid_execute_job(struct _starpu_job *job, enum starpu_perf_archtype perf_arch, double length);
+void _starpu_simgrid_execute_job(struct _starpu_job *job, enum starpu_perfmodel_archtype perf_arch, double length);
 int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node, struct _starpu_data_request *req);
 /* Return the number of hosts prefixed by PREFIX */
 int _starpu_simgrid_get_nbhosts(const char *prefix);

+ 3 - 3
src/core/task.c

@@ -477,7 +477,7 @@ int starpu_task_submit(struct starpu_task *task)
 
 	/* If profiling is activated, we allocate a structure to store the
 	 * appropriate info. */
-	struct starpu_task_profiling_info *info;
+	struct starpu_profiling_task_info *info;
 	int profiling = starpu_profiling_status_get();
 	info = _starpu_allocate_profiling_info_if_needed(task);
 	task->profiling_info = info;
@@ -548,7 +548,7 @@ int _starpu_task_submit_nodeps(struct starpu_task *task)
 		{
 			starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(j->task, i);
 			_STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(j, handle, i);
-			enum starpu_access_mode mode = STARPU_CODELET_GET_MODE(j->task->cl, i);
+			enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(j->task->cl, i);
 			_STARPU_JOB_SET_ORDERED_BUFFER_MODE(j, mode, i);
 		}
 	}
@@ -598,7 +598,7 @@ int _starpu_task_submit_conversion_task(struct starpu_task *task,
 	{
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(j->task, i);
 		_STARPU_JOB_SET_ORDERED_BUFFER_HANDLE(j, handle, i);
-		enum starpu_access_mode mode = STARPU_CODELET_GET_MODE(j->task->cl, i);
+		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(j->task->cl, i);
 		_STARPU_JOB_SET_ORDERED_BUFFER_MODE(j, mode, i);
 	}
 

+ 2 - 2
src/core/task_bundle.c

@@ -188,7 +188,7 @@ void _starpu_task_bundle_destroy(starpu_task_bundle_t bundle)
 	free(bundle);
 }
 
-void _insertion_handle_sorted(struct _starpu_handle_list **listp, starpu_data_handle_t handle, enum starpu_access_mode mode)
+void _insertion_handle_sorted(struct _starpu_handle_list **listp, starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 {
 	STARPU_ASSERT(listp);
 
@@ -220,7 +220,7 @@ void _insertion_handle_sorted(struct _starpu_handle_list **listp, starpu_data_ha
 	if (prev->handle == handle)
 	{
 		/* The handle is already in the list, the merge both the access modes */
-		prev->mode = (enum starpu_access_mode) ((int) prev->mode | (int) mode);
+		prev->mode = (enum starpu_data_access_mode) ((int) prev->mode | (int) mode);
 	}
 	else
 	{

+ 2 - 2
src/core/task_bundle.h

@@ -90,7 +90,7 @@ struct _starpu_task_bundle
 struct _starpu_handle_list
 {
 	starpu_data_handle_t handle;
-	enum starpu_access_mode mode;
+	enum starpu_data_access_mode mode;
 	struct _starpu_handle_list *next;
 };
 
@@ -131,6 +131,6 @@ void _starpu_task_bundle_destroy(starpu_task_bundle_t bundle);
  * mode			(input)
  * 			Access mode of the handle.
  */
-void _insertion_handle_sorted(struct _starpu_handle_list **listp, starpu_data_handle_t handle, enum starpu_access_mode mode);
+void _insertion_handle_sorted(struct _starpu_handle_list **listp, starpu_data_handle_t handle, enum starpu_data_access_mode mode);
 
 #endif // __CORE_TASK_BUNDLE_H__

+ 4 - 4
src/core/topology.c

@@ -496,8 +496,8 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 		int worker_idx = topology->nworkers + cudagpu;
 		config->workers[worker_idx].arch = STARPU_CUDA_WORKER;
 		int devid = _starpu_get_next_cuda_gpuid(config);
-		enum starpu_perf_archtype arch =
-			(enum starpu_perf_archtype)((int)STARPU_CUDA_DEFAULT + devid);
+		enum starpu_perfmodel_archtype arch =
+			(enum starpu_perfmodel_archtype)((int)STARPU_CUDA_DEFAULT + devid);
 		config->workers[worker_idx].devid = devid;
 		config->workers[worker_idx].perf_arch = arch;
 		config->workers[worker_idx].worker_mask = STARPU_CUDA;
@@ -570,8 +570,8 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 			break;
 		}
 		config->workers[worker_idx].arch = STARPU_OPENCL_WORKER;
-		enum starpu_perf_archtype arch =
-			(enum starpu_perf_archtype)((int)STARPU_OPENCL_DEFAULT + devid);
+		enum starpu_perfmodel_archtype arch =
+			(enum starpu_perfmodel_archtype)((int)STARPU_OPENCL_DEFAULT + devid);
 		config->workers[worker_idx].devid = devid;
 		config->workers[worker_idx].perf_arch = arch;
 		config->workers[worker_idx].worker_mask = STARPU_OPENCL;

+ 18 - 15
src/core/workers.c

@@ -64,7 +64,7 @@ struct _starpu_machine_config *_starpu_get_machine_config(void)
 /* Makes sure that at least one of the workers of type <arch> can execute
  * <task>, for at least one of its implementations. */
 static uint32_t _starpu_worker_exists_and_can_execute(struct starpu_task *task,
-						      enum starpu_archtype arch)
+						      enum starpu_worker_archtype arch)
 {
 	int i;
 	int nworkers = starpu_worker_get_count();
@@ -158,7 +158,7 @@ uint32_t _starpu_can_submit_opencl_task(void)
 	return (STARPU_OPENCL & config.worker_mask);
 }
 
-static int _starpu_can_use_nth_implementation(enum starpu_archtype arch, struct starpu_codelet *cl, unsigned nimpl)
+static int _starpu_can_use_nth_implementation(enum starpu_worker_archtype arch, struct starpu_codelet *cl, unsigned nimpl)
 {
 	switch(arch)
 	{
@@ -400,6 +400,9 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 
 		STARPU_PTHREAD_MUTEX_INIT(&workerarg->sched_mutex, NULL);
 		STARPU_PTHREAD_COND_INIT(&workerarg->sched_cond, NULL);
+		STARPU_PTHREAD_MUTEX_INIT(&workerarg->parallel_sect_mutex, NULL);
+		STARPU_PTHREAD_COND_INIT(&workerarg->parallel_sect_cond, NULL);
+		workerarg->parallel_sect = 0;
 
 		/* if some codelet's termination cannot be handled directly :
 		 * for instance in the Gordon driver, Gordon tasks' callbacks
@@ -824,7 +827,7 @@ void starpu_profiling_init()
 
 static void _starpu_terminate_workers(struct _starpu_machine_config *pconfig)
 {
-	int status STARPU_ATTRIBUTE_UNUSED;
+	int status = 0;
 	unsigned workerid;
 
 	for (workerid = 0; workerid < pconfig->topology.nworkers; workerid++)
@@ -929,8 +932,8 @@ static void _starpu_kill_all_workers(struct _starpu_machine_config *pconfig)
 
 void starpu_display_stats()
 {
-	starpu_bus_profiling_helper_display_summary();
-	starpu_worker_profiling_helper_display_summary();
+	starpu_profiling_bus_helper_display_summary();
+	starpu_profiling_worker_helper_display_summary();
 }
 
 void starpu_shutdown(void)
@@ -963,8 +966,8 @@ void starpu_shutdown(void)
 	     }
 	}
 
-	starpu_bus_profiling_helper_display_summary();
-	starpu_worker_profiling_helper_display_summary();
+	starpu_profiling_bus_helper_display_summary();
+	starpu_profiling_worker_helper_display_summary();
 
 	_starpu_deinitialize_registered_performance_models();
 
@@ -976,7 +979,7 @@ void starpu_shutdown(void)
 	     if (stats != 0)
 	     {
 		  // Display statistics on data which have not been unregistered
-		  starpu_memory_display_stats();
+		  starpu_data_display_memory_stats();
 	     }
 	}
 
@@ -1017,7 +1020,7 @@ unsigned starpu_worker_get_count(void)
 	return config.topology.nworkers;
 }
 
-int starpu_worker_get_count_by_type(enum starpu_archtype type)
+int starpu_worker_get_count_by_type(enum starpu_worker_archtype type)
 {
 	switch (type)
 	{
@@ -1172,12 +1175,12 @@ struct _starpu_combined_worker *_starpu_get_combined_worker_struct(unsigned id)
 	return &config.combined_workers[id - basic_worker_count];
 }
 
-enum starpu_archtype starpu_worker_get_type(int id)
+enum starpu_worker_archtype starpu_worker_get_type(int id)
 {
 	return config.workers[id].arch;
 }
 
-int starpu_worker_get_ids_by_type(enum starpu_archtype type, int *workerids, int maxsize)
+int starpu_worker_get_ids_by_type(enum starpu_worker_archtype type, int *workerids, int maxsize)
 {
 	unsigned nworkers = starpu_worker_get_count();
 
@@ -1199,7 +1202,7 @@ int starpu_worker_get_ids_by_type(enum starpu_archtype type, int *workerids, int
 	return cnt;
 }
 
-int starpu_worker_get_by_type(enum starpu_archtype type, int num)
+int starpu_worker_get_by_type(enum starpu_worker_archtype type, int num)
 {
 	unsigned nworkers = starpu_worker_get_count();
 
@@ -1220,7 +1223,7 @@ int starpu_worker_get_by_type(enum starpu_archtype type, int num)
 	return -1;
 }
 
-int starpu_worker_get_by_devid(enum starpu_archtype type, int devid)
+int starpu_worker_get_by_devid(enum starpu_worker_archtype type, int devid)
 {
 	unsigned nworkers = starpu_worker_get_count();
 
@@ -1259,7 +1262,7 @@ void starpu_worker_get_sched_condition(int workerid, starpu_pthread_mutex_t **sc
 	*sched_mutex = &config.workers[workerid].sched_mutex;
 }
 
-int starpu_worker_get_nids_by_type(enum starpu_archtype type, int *workerids, int maxsize)
+int starpu_worker_get_nids_by_type(enum starpu_worker_archtype type, int *workerids, int maxsize)
 {
 	unsigned nworkers = starpu_worker_get_count();
 
@@ -1281,7 +1284,7 @@ int starpu_worker_get_nids_by_type(enum starpu_archtype type, int *workerids, in
 	return cnt;
 }
 
-int starpu_worker_get_nids_ctx_free_by_type(enum starpu_archtype type, int *workerids, int maxsize)
+int starpu_worker_get_nids_ctx_free_by_type(enum starpu_worker_archtype type, int *workerids, int maxsize)
 {
 	unsigned nworkers = starpu_worker_get_count();
 

+ 14 - 5
src/core/workers.h

@@ -47,9 +47,9 @@ struct _starpu_worker
 {
 	struct _starpu_machine_config *config;
         starpu_pthread_mutex_t mutex;
-	enum starpu_archtype arch; /* what is the type of worker ? */
+	enum starpu_worker_archtype arch; /* what is the type of worker ? */
 	uint32_t worker_mask; /* what is the type of worker ? */
-	enum starpu_perf_archtype perf_arch; /* in case there are different models of the same arch */
+	enum starpu_perfmodel_archtype perf_arch; /* in case there are different models of the same arch */
 	starpu_pthread_t worker_thread; /* the thread which runs the worker */
 	unsigned devid; /* which cpu/gpu/etc is controlled by the worker ? */
 	int bindid; /* which cpu is the driver bound to ? (logical index) */
@@ -85,6 +85,15 @@ struct _starpu_worker
 	unsigned active_ctx;
 
 	unsigned removed_from_ctx[STARPU_NMAX_SCHED_CTXS];
+
+	/* condition variable and mutex used when parallel sections are executed in contexts */
+	starpu_pthread_cond_t parallel_sect_cond;
+	starpu_pthread_mutex_t parallel_sect_mutex;
+
+	/* boolean indicating that workers should block in order to allow
+	   parallel sections to be executed on their allocated resources */
+	unsigned parallel_sect;
+
 #ifdef __GLIBC__
 	cpu_set_t initial_cpu_set;
 	cpu_set_t current_cpu_set;
@@ -97,7 +106,7 @@ struct _starpu_worker
 
 struct _starpu_combined_worker
 {
-	enum starpu_perf_archtype perf_arch; /* in case there are different models of the same arch */
+	enum starpu_perfmodel_archtype perf_arch; /* in case there are different models of the same arch */
 	uint32_t worker_mask; /* what is the type of workers ? */
 	int worker_size;
 	unsigned memory_node; /* which memory node is associated that worker to ? */
@@ -237,10 +246,10 @@ void _starpu_worker_set_status(int workerid, enum _starpu_worker_status status);
 /* We keep an initial sched ctx which might be used in case no other ctx is available */
 struct _starpu_sched_ctx* _starpu_get_initial_sched_ctx(void);
 
-int starpu_worker_get_nids_by_type(enum starpu_archtype type, int *workerids, int maxsize);
+int starpu_worker_get_nids_by_type(enum starpu_worker_archtype type, int *workerids, int maxsize);
 
 /* returns workers not belonging to any context, be careful no mutex is used, 
    the list might not be updated */
-int starpu_worker_get_nids_ctx_free_by_type(enum starpu_archtype type, int *workerids, int maxsize);
+int starpu_worker_get_nids_ctx_free_by_type(enum starpu_worker_archtype type, int *workerids, int maxsize);
 
 #endif // __WORKERS_H__

+ 16 - 16
src/datawizard/coherency.c

@@ -115,7 +115,7 @@ unsigned _starpu_select_src_node(starpu_data_handle_t handle, unsigned destinati
 /* this may be called once the data is fetched with header and STARPU_RW-lock hold */
 void _starpu_update_data_state(starpu_data_handle_t handle,
 			       struct _starpu_data_replicate *requesting_replicate,
-			       enum starpu_access_mode mode)
+			       enum starpu_data_access_mode mode)
 {
 	/* There is nothing to do for relaxed coherency modes (scratch or
 	 * reductions) */
@@ -223,7 +223,7 @@ static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned
  * src_nodes, dst_nodes and handling_nodes arrays. */
 static int determine_request_path(starpu_data_handle_t handle,
 				  unsigned src_node, unsigned dst_node,
-				  enum starpu_access_mode mode, int max_len,
+				  enum starpu_data_access_mode mode, int max_len,
 				  unsigned *src_nodes, unsigned *dst_nodes,
 				  unsigned *handling_nodes)
 {
@@ -279,7 +279,7 @@ static int determine_request_path(starpu_data_handle_t handle,
 /* handle->lock should be taken. r is returned locked. The node parameter
  * indicate either the source of the request, or the destination for a
  * write-only request. */
-static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_access_mode mode, unsigned is_prefetch)
+static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_data_access_mode mode, unsigned is_prefetch)
 {
 	struct _starpu_data_request *r;
 
@@ -308,11 +308,11 @@ static struct _starpu_data_request *_starpu_search_existing_data_request(struct
 				replicate->handle->busy_count++;
 			}
 
-			r->mode = (enum starpu_access_mode) ((int) r->mode | (int) STARPU_R);
+			r->mode = (enum starpu_data_access_mode) ((int) r->mode | (int) STARPU_R);
 		}
 
 		if (mode & STARPU_W)
-			r->mode = (enum starpu_access_mode) ((int) r->mode | (int)  STARPU_W);
+			r->mode = (enum starpu_data_access_mode) ((int) r->mode | (int)  STARPU_W);
 	}
 
 	return r;
@@ -342,7 +342,7 @@ static struct _starpu_data_request *_starpu_search_existing_data_request(struct
 
 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
 								  struct _starpu_data_replicate *dst_replicate,
-								  enum starpu_access_mode mode, unsigned is_prefetch,
+								  enum starpu_data_access_mode mode, unsigned is_prefetch,
 								  unsigned async,
 								  void (*callback_func)(void *), void *callback_arg)
 {
@@ -480,7 +480,7 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 }
 
 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *dst_replicate,
-			       enum starpu_access_mode mode, unsigned detached, unsigned async,
+			       enum starpu_data_access_mode mode, unsigned detached, unsigned async,
 			       void (*callback_func)(void *), void *callback_arg)
 {
 	unsigned local_node = _starpu_memory_node_get_local_key();
@@ -513,12 +513,12 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, struct _starpu_data_
         return ret;
 }
 
-static int prefetch_data_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_access_mode mode)
+static int prefetch_data_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode)
 {
 	return _starpu_fetch_data_on_node(handle, replicate, mode, 1, 1, NULL, NULL);
 }
 
-static int fetch_data(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_access_mode mode)
+static int fetch_data(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode)
 {
 	return _starpu_fetch_data_on_node(handle, replicate, mode, 0, 0, NULL, NULL);
 }
@@ -593,7 +593,7 @@ int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
 	for (index = 0; index < nbuffers; index++)
 	{
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, index);
-		enum starpu_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, index);
+		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, index);
 
 		if (mode & (STARPU_SCRATCH|STARPU_REDUX))
 			continue;
@@ -607,7 +607,7 @@ int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
 	return 0;
 }
 
-static struct _starpu_data_replicate *get_replicate(starpu_data_handle_t handle, enum starpu_access_mode mode, int workerid, unsigned local_memory_node)
+static struct _starpu_data_replicate *get_replicate(starpu_data_handle_t handle, enum starpu_data_access_mode mode, int workerid, unsigned local_memory_node)
 {
 	if (mode & (STARPU_SCRATCH|STARPU_REDUX))
 		return &handle->per_worker[workerid];
@@ -625,7 +625,7 @@ int _starpu_fetch_task_input(struct _starpu_job *j, uint32_t mask)
 	if (profiling && task->profiling_info)
 		_starpu_clock_gettime(&task->profiling_info->acquire_data_start_time);
 
-	struct starpu_buffer_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
+	struct starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
 	unsigned nbuffers = task->cl->nbuffers;
 
 	unsigned local_memory_node = _starpu_memory_node_get_local_key();
@@ -637,7 +637,7 @@ int _starpu_fetch_task_input(struct _starpu_job *j, uint32_t mask)
 	{
 		int ret;
 		starpu_data_handle_t handle = descrs[index].handle;
-		enum starpu_access_mode mode = descrs[index].mode;
+		enum starpu_data_access_mode mode = descrs[index].mode;
 
 		struct _starpu_data_replicate *local_replicate;
 
@@ -658,7 +658,7 @@ int _starpu_fetch_task_input(struct _starpu_job *j, uint32_t mask)
 	for (index = 0; index < nbuffers; index++)
 	{
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, index);
-		enum starpu_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, index);
+		enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(task->cl, index);
 
 		struct _starpu_data_replicate *local_replicate;
 
@@ -700,7 +700,7 @@ void _starpu_push_task_output(struct _starpu_job *j, uint32_t mask)
 	if (profiling && task->profiling_info)
 		_starpu_clock_gettime(&task->profiling_info->release_data_start_time);
 
-        struct starpu_buffer_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
+        struct starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
         unsigned nbuffers = task->cl->nbuffers;
 
 	int workerid = starpu_worker_get_id();
@@ -710,7 +710,7 @@ void _starpu_push_task_output(struct _starpu_job *j, uint32_t mask)
 	for (index = 0; index < nbuffers; index++)
 	{
 		starpu_data_handle_t handle = descrs[index].handle;
-		enum starpu_access_mode mode = descrs[index].mode;
+		enum starpu_data_access_mode mode = descrs[index].mode;
 
 		struct _starpu_data_replicate *local_replicate;
 

+ 5 - 5
src/datawizard/coherency.h

@@ -106,7 +106,7 @@ struct _starpu_data_state
 	 * the req_list anymore), i.e. the number of holders of the
 	 * current_mode rwlock */
 	unsigned refcnt;
-	enum starpu_access_mode current_mode;
+	enum starpu_data_access_mode current_mode;
 	/* protect meta data */
 	struct _starpu_spinlock header_lock;
 
@@ -162,7 +162,7 @@ struct _starpu_data_state
 	 * it would modify the piece of data ? Any task accessing the data in a
 	 * read-only mode should depend on that task implicitely if the
 	 * sequential_consistency flag is enabled. */
-	enum starpu_access_mode last_submitted_mode;
+	enum starpu_data_access_mode last_submitted_mode;
 	struct starpu_task *last_submitted_writer;
 	struct _starpu_task_wrapper_list *last_submitted_readers;
 
@@ -222,7 +222,7 @@ void _starpu_display_msi_stats(void);
  * async means that _starpu_fetch_data_on_node will wait for completion of the request
  */
 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate,
-			       enum starpu_access_mode mode, unsigned detached, unsigned async,
+			       enum starpu_data_access_mode mode, unsigned detached, unsigned async,
 			       void (*callback_func)(void *), void *callback_arg);
 /* This releases a reference on the handle */
 void _starpu_release_data_on_node(struct _starpu_data_state *state, uint32_t default_wt_mask,
@@ -230,7 +230,7 @@ void _starpu_release_data_on_node(struct _starpu_data_state *state, uint32_t def
 
 void _starpu_update_data_state(starpu_data_handle_t handle,
 			       struct _starpu_data_replicate *requesting_replicate,
-			       enum starpu_access_mode mode);
+			       enum starpu_data_access_mode mode);
 
 uint32_t _starpu_get_data_refcnt(struct _starpu_data_state *state, unsigned node);
 
@@ -253,7 +253,7 @@ unsigned _starpu_select_src_node(struct _starpu_data_state *state, unsigned dest
  */
 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
 								  struct _starpu_data_replicate *dst_replicate,
-								  enum starpu_access_mode mode, unsigned is_prefetch,
+								  enum starpu_data_access_mode mode, unsigned is_prefetch,
 								  unsigned async,
 								  void (*callback_func)(void *), void *callback_arg);
 

+ 3 - 3
src/datawizard/data_request.c

@@ -85,7 +85,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 							 struct _starpu_data_replicate *src_replicate,
 							 struct _starpu_data_replicate *dst_replicate,
 							 unsigned handling_node,
-							 enum starpu_access_mode mode,
+							 enum starpu_data_access_mode mode,
 							 unsigned ndeps,
 							 unsigned is_prefetch)
 {
@@ -229,7 +229,7 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 {
 	unsigned do_delete = 0;
 	starpu_data_handle_t handle = r->handle;
-	enum starpu_access_mode mode = r->mode;
+	enum starpu_data_access_mode mode = r->mode;
 
 	struct _starpu_data_replicate *src_replicate = r->src_replicate;
 	struct _starpu_data_replicate *dst_replicate = r->dst_replicate;
@@ -341,7 +341,7 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 	struct _starpu_data_replicate *src_replicate = r->src_replicate;
 	struct _starpu_data_replicate *dst_replicate = r->dst_replicate;
 
-	enum starpu_access_mode r_mode = r->mode;
+	enum starpu_data_access_mode r_mode = r->mode;
 
 	STARPU_ASSERT(!(r_mode & STARPU_R) || src_replicate);
 	STARPU_ASSERT(!(r_mode & STARPU_R) || src_replicate->allocated);

+ 3 - 3
src/datawizard/data_request.h

@@ -54,7 +54,7 @@ LIST_TYPE(_starpu_data_request,
 	 * With mapped buffers, an additional case is mode = 0, which means
 	 * unmapping the buffer.
 	 */
-	enum starpu_access_mode mode;
+	enum starpu_data_access_mode mode;
 
 	/* Elements needed to make the transfer asynchronous */
 	struct _starpu_async_channel async_channel;
@@ -89,7 +89,7 @@ LIST_TYPE(_starpu_data_request,
  * Not only StarPU internals, but also the application may put such requests */
 LIST_TYPE(_starpu_data_requester,
 	/* what kind of access is requested ? */
-	enum starpu_access_mode mode;
+	enum starpu_data_access_mode mode;
 
 	/* applications may also directly manipulate data */
 	unsigned is_requested_by_codelet;
@@ -120,7 +120,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 							 struct _starpu_data_replicate *src_replicate,
 							 struct _starpu_data_replicate *dst_replicate,
 							 unsigned handling_node,
-							 enum starpu_access_mode mode,
+							 enum starpu_data_access_mode mode,
 							 unsigned ndeps,
 							 unsigned is_prefetch);
 

+ 1 - 1
src/datawizard/datastats.c

@@ -129,7 +129,7 @@ void _starpu_comm_amounts_inc(unsigned src  __attribute__ ((unused)), unsigned d
 void _starpu_display_comm_amounts(void)
 {
 #ifdef STARPU_DEVEL
-#  warning TODO. The information displayed here seems to be similar to the one displayed by starpu_bus_profiling_helper_display_summary()
+#  warning TODO. The information displayed here seems to be similar to the one displayed by starpu_profiling_bus_helper_display_summary()
 #endif
 
 #ifdef STARPU_ENABLE_STATS

+ 1 - 1
src/datawizard/filters.c

@@ -252,7 +252,7 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 		child->footprint = _starpu_compute_data_footprint(child);
 
 		void *ptr;
-		ptr = starpu_handle_to_pointer(child, 0);
+		ptr = starpu_data_handle_to_pointer(child, 0);
 		if (ptr != NULL)
 		{
 			_starpu_data_register_ram_pointer(child, ptr);

+ 7 - 7
src/datawizard/footprint.c

@@ -19,7 +19,7 @@
 #include <starpu_hash.h>
 #include <core/task.h>
 
-uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, struct _starpu_job *j)
+uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, unsigned nimpl, struct _starpu_job *j)
 {
 	if (j->footprint_is_computed)
 		return j->footprint;
@@ -32,12 +32,12 @@ uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, enum
 	if (model && model->per_arch[arch][nimpl].size_base)
 	{
 		size_t size = model->per_arch[arch][nimpl].size_base(task, arch, nimpl);
-		footprint = starpu_crc32_be_n(&size, sizeof(size), footprint);
+		footprint = starpu_hash_crc32c_be_n(&size, sizeof(size), footprint);
 	}
 	else if (model && model->size_base)
 	{
 		size_t size = model->size_base(task, nimpl);
-		footprint = starpu_crc32_be_n(&size, sizeof(size), footprint);
+		footprint = starpu_hash_crc32c_be_n(&size, sizeof(size), footprint);
 	}
 	else
 	{
@@ -47,7 +47,7 @@ uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, enum
 
 			uint32_t handle_footprint = _starpu_data_get_footprint(handle);
 
-			footprint = starpu_crc32_be(handle_footprint, footprint);
+			footprint = starpu_hash_crc32c_be(handle_footprint, footprint);
 		}
 	}
 
@@ -59,16 +59,16 @@ uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, enum
 
 uint32_t _starpu_compute_data_footprint(starpu_data_handle_t handle)
 {
-	uint32_t interfaceid = (uint32_t)starpu_handle_get_interface_id(handle);
+	uint32_t interfaceid = (uint32_t)starpu_data_get_interface_id(handle);
 
 	STARPU_ASSERT(handle->ops->footprint);
 
 	uint32_t handle_footprint = handle->ops->footprint(handle);
 
-	return starpu_crc32_be(handle_footprint, interfaceid);
+	return starpu_hash_crc32c_be(handle_footprint, interfaceid);
 }
 
-uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 {
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 	return _starpu_compute_buffers_footprint(model, arch, nimpl, j);

+ 1 - 1
src/datawizard/footprint.h

@@ -24,7 +24,7 @@
 
 /* Compute the footprint that characterizes the job and cache it into the job
  * structure. */
-uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, struct _starpu_job *j);
+uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, unsigned nimpl, struct _starpu_job *j);
 
 /* Compute the footprint that characterizes the layout of the data handle. */
 uint32_t _starpu_compute_data_footprint(starpu_data_handle_t handle);

+ 3 - 3
src/datawizard/interfaces/bcsr_interface.c

@@ -116,9 +116,9 @@ static uint32_t footprint_bcsr_interface_crc32(starpu_data_handle_t handle)
 {
 	uint32_t hash;
 
-	hash = starpu_crc32_be(starpu_bcsr_get_nnz(handle), 0);
-	hash = starpu_crc32_be(starpu_bcsr_get_c(handle), hash);
-	hash = starpu_crc32_be(starpu_bcsr_get_r(handle), hash);
+	hash = starpu_hash_crc32c_be(starpu_bcsr_get_nnz(handle), 0);
+	hash = starpu_hash_crc32c_be(starpu_bcsr_get_c(handle), hash);
+	hash = starpu_hash_crc32c_be(starpu_bcsr_get_r(handle), hash);
 
 	return hash;
 }

+ 3 - 3
src/datawizard/interfaces/block_interface.c

@@ -158,9 +158,9 @@ static uint32_t footprint_block_interface_crc32(starpu_data_handle_t handle)
 {
 	uint32_t hash;
 
-	hash = starpu_crc32_be(starpu_block_get_nx(handle), 0);
-	hash = starpu_crc32_be(starpu_block_get_ny(handle), hash);
-	hash = starpu_crc32_be(starpu_block_get_nz(handle), hash);
+	hash = starpu_hash_crc32c_be(starpu_block_get_nx(handle), 0);
+	hash = starpu_hash_crc32c_be(starpu_block_get_ny(handle), hash);
+	hash = starpu_hash_crc32c_be(starpu_block_get_nz(handle), hash);
 
 	return hash;
 }

+ 1 - 1
src/datawizard/interfaces/coo_interface.c

@@ -162,7 +162,7 @@ coo_interface_footprint(starpu_data_handle_t handle)
 	coo_interface = (struct starpu_coo_interface *)
 		starpu_data_get_interface_on_node(handle, 0);
 
-	return starpu_crc32_be(coo_interface->nx * coo_interface->ny, 0);
+	return starpu_hash_crc32c_be(coo_interface->nx * coo_interface->ny, 0);
 }
 
 static int

+ 1 - 1
src/datawizard/interfaces/csr_interface.c

@@ -105,7 +105,7 @@ void starpu_csr_data_register(starpu_data_handle_t *handleptr, unsigned home_nod
 
 static uint32_t footprint_csr_interface_crc32(starpu_data_handle_t handle)
 {
-	return starpu_crc32_be(starpu_csr_get_nnz(handle), 0);
+	return starpu_hash_crc32c_be(starpu_csr_get_nnz(handle), 0);
 }
 
 static int csr_compare(void *data_interface_a, void *data_interface_b)

+ 94 - 13
src/datawizard/interfaces/data_interface.c

@@ -38,9 +38,22 @@ static struct handle_entry *registered_handles;
 static struct _starpu_spinlock    registered_handles_lock;
 static int _data_interface_number = STARPU_MAX_INTERFACE_ID;
 
+/* Entry in the `registered_tag_handles' hash table.  */
+struct handle_tag_entry
+{
+	UT_hash_handle hh; /* uthash handle; `hh' is the field name given to HASH_ITER */
+	int tag; /* hash key: entries are added with HASH_ADD_INT on this field */
+	starpu_data_handle_t handle; /* data handle registered under `tag' */
+};
+
+/* Hash table mapping host tags to data handles.  */
+static struct handle_tag_entry *registered_tag_handles;
+static struct _starpu_spinlock    registered_tag_handles_lock;
+
 void _starpu_data_interface_init(void)
 {
 	_starpu_spin_init(&registered_handles_lock);
+	_starpu_spin_init(&registered_tag_handles_lock); /* lock taken around every lookup/insert/removal in `registered_tag_handles' */
 }
 
 void _starpu_data_interface_shutdown()
@@ -56,6 +69,18 @@ void _starpu_data_interface_shutdown()
 	}
 
 	registered_handles = NULL;
+
+	struct handle_tag_entry *tag_entry, *tag_tmp;
+
+	_starpu_spin_destroy(&registered_tag_handles_lock);
+
+	HASH_ITER(hh, registered_tag_handles, tag_entry, tag_tmp)
+	{
+		HASH_DEL(registered_tag_handles, tag_entry);
+		free(tag_entry);
+	}
+
+	registered_tag_handles = NULL;
 }
 
 /* Register the mapping from PTR to HANDLE.  If PTR is already mapped to
@@ -221,7 +246,7 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 	/* now the data is available ! */
 	_starpu_spin_unlock(&handle->header_lock);
 
-	ptr = starpu_handle_to_pointer(handle, 0);
+	ptr = starpu_data_handle_to_pointer(handle, 0);
 	if (ptr != NULL)
 	{
 		_starpu_data_register_ram_pointer(handle, ptr);
@@ -303,7 +328,7 @@ void starpu_data_register_same(starpu_data_handle_t *handledst, starpu_data_hand
 	starpu_data_register(handledst, -1, local_interface, handlesrc->ops);
 }
 
-void *starpu_handle_to_pointer(starpu_data_handle_t handle, unsigned node)
+void *starpu_data_handle_to_pointer(starpu_data_handle_t handle, unsigned node)
 {
 	/* Check whether the operation is supported and the node has actually
 	 * been allocated.  */
@@ -316,9 +341,9 @@ void *starpu_handle_to_pointer(starpu_data_handle_t handle, unsigned node)
 	return NULL;
 }
 
-void *starpu_handle_get_local_ptr(starpu_data_handle_t handle)
+void *starpu_data_get_local_ptr(starpu_data_handle_t handle)
 {
-	return starpu_handle_to_pointer(handle,
+	return starpu_data_handle_to_pointer(handle,
 					_starpu_memory_node_get_local_key());
 }
 
@@ -329,8 +354,8 @@ int starpu_data_get_rank(starpu_data_handle_t handle)
 
 int starpu_data_set_rank(starpu_data_handle_t handle, int rank)
 {
-        handle->rank = rank;
-        return 0;
+	handle->rank = rank;
+	return 0;
 }
 
 int starpu_data_get_tag(starpu_data_handle_t handle)
@@ -338,10 +363,64 @@ int starpu_data_get_tag(starpu_data_handle_t handle)
 	return handle->tag;
 }
 
+/* Look up TAG in the tag -> handle table filled by starpu_data_set_tag().
+ * Returns the handle registered under TAG, or NULL when there is none.  */
+starpu_data_handle_t starpu_data_get_data_handle_from_tag(int tag)
+{
+	struct handle_tag_entry *entry;
+	starpu_data_handle_t handle = NULL;
+
+	_starpu_spin_lock(&registered_tag_handles_lock);
+	HASH_FIND_INT(registered_tag_handles, &tag, entry);
+	/* Read the entry while the lock is still held: once it is dropped,
+	 * starpu_data_release_tag() is free to remove and free the entry.  */
+	if (entry != NULL)
+	{
+		handle = entry->handle;
+	}
+	_starpu_spin_unlock(&registered_tag_handles_lock);
+
+	return handle;
+}
+
 int starpu_data_set_tag(starpu_data_handle_t handle, int tag)
 {
-        handle->tag = tag;
-        return 0;
+	struct handle_tag_entry *entry;
+	entry = (struct handle_tag_entry *) malloc(sizeof(*entry));
+	STARPU_ASSERT(entry != NULL);
+
+	STARPU_ASSERT_MSG(!(starpu_data_get_data_handle_from_tag(tag)),"A data handle with tag %d had already been registered.\n",tag);
+
+	entry->tag = tag;
+	entry->handle = handle;
+
+	_starpu_spin_lock(&registered_tag_handles_lock);
+	HASH_ADD_INT(registered_tag_handles, tag, entry);
+	_starpu_spin_unlock(&registered_tag_handles_lock);
+
+	handle->tag = tag;
+	return 0;
+}
+
+/* Remove the entry registered for HANDLE's tag from the tag -> handle table
+ * and free it.  Does nothing when the handle's tag is -1.  Asserts that the
+ * entry exists otherwise.  Always returns 0.  */
+int starpu_data_release_tag(starpu_data_handle_t handle)
+{
+	struct handle_tag_entry *tag_entry;
+
+	if (handle->tag != -1)
+	{
+		_starpu_spin_lock(&registered_tag_handles_lock);
+		HASH_FIND_INT(registered_tag_handles, &handle->tag, tag_entry);
+		STARPU_ASSERT_MSG((tag_entry != NULL),"Handle %p with tag %d isn't in the hashmap !",handle,handle->tag);
+
+		HASH_DEL(registered_tag_handles, tag_entry);
+		free(tag_entry);
+
+		_starpu_spin_unlock(&registered_tag_handles_lock);
+
+		/* The handle no longer owns a table entry: reset its tag so that
+		 * a later call (e.g. the one made when the handle is
+		 * unregistered) skips the lookup instead of tripping the
+		 * assertion above.  */
+		handle->tag = -1;
+	}
+	return 0;
+}
+
+/* Return the interface operations (the ops field) attached to HANDLE.  */
+struct starpu_data_interface_ops* starpu_data_get_interface_ops(starpu_data_handle_t handle)
+{
+	return handle->ops;
 }
 
 /*
@@ -355,7 +434,7 @@ void _starpu_data_free_interfaces(starpu_data_handle_t handle)
 	unsigned worker;
 	unsigned nworkers = starpu_worker_get_count();
 
-	ram_ptr = starpu_handle_to_pointer(handle, 0);
+	ram_ptr = starpu_data_handle_to_pointer(handle, 0);
 
 	for (node = 0; node < STARPU_MAXNODES; node++)
 		free(handle->per_node[node].data_interface);
@@ -602,6 +681,8 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 	STARPU_PTHREAD_COND_DESTROY(&handle->busy_cond);
 	STARPU_PTHREAD_MUTEX_DESTROY(&handle->sequential_consistency_mutex);
 
+	starpu_data_release_tag(handle);
+
 	free(handle);
 }
 
@@ -677,7 +758,7 @@ void starpu_data_invalidate_submit(starpu_data_handle_t handle)
 	starpu_data_acquire_cb(handle, STARPU_W, _starpu_data_invalidate, handle);
 }
 
-enum starpu_data_interface_id starpu_handle_get_interface_id(starpu_data_handle_t handle)
+/* Return the interfaceid field of the interface operations of HANDLE.  */
+enum starpu_data_interface_id starpu_data_get_interface_id(starpu_data_handle_t handle)
 {
 	return handle->ops->interfaceid;
 }
@@ -693,13 +774,13 @@ int starpu_data_interface_get_next_id(void)
 	return _data_interface_number-1;
 }
 
-int starpu_handle_pack_data(starpu_data_handle_t handle, void **ptr, starpu_ssize_t *count)
+/* Call the pack_data method of HANDLE's interface for the memory node given
+ * by _starpu_memory_node_get_local_key(), forwarding PTR and COUNT, and return
+ * its result.  Asserts that the interface provides pack_data.
+ * NOTE(review): PTR and COUNT are presumably outputs (packed buffer and its
+ * size) -- confirm against the pack_data implementations.  */
+int starpu_data_pack(starpu_data_handle_t handle, void **ptr, starpu_ssize_t *count)
 {
 	STARPU_ASSERT(handle->ops->pack_data);
 	return handle->ops->pack_data(handle, _starpu_memory_node_get_local_key(), ptr, count);
 }
 
-int starpu_handle_unpack_data(starpu_data_handle_t handle, void *ptr, size_t count)
+int starpu_data_unpack(starpu_data_handle_t handle, void *ptr, size_t count)
 {
 	STARPU_ASSERT(handle->ops->unpack_data);
 	int ret;
@@ -708,7 +789,7 @@ int starpu_handle_unpack_data(starpu_data_handle_t handle, void *ptr, size_t cou
 	return ret;
 }
 
-size_t starpu_handle_get_size(starpu_data_handle_t handle)
+/* Return the value computed by the get_size method of HANDLE's interface.
+ * The method pointer is called without being checked first.  */
+size_t starpu_data_get_size(starpu_data_handle_t handle)
 {
 	return handle->ops->get_size(handle);
 }

+ 1 - 1
src/datawizard/interfaces/matrix_interface.c

@@ -165,7 +165,7 @@ void starpu_matrix_data_register(starpu_data_handle_t *handleptr, unsigned home_
 
+/* Footprint of a matrix handle: the result of starpu_hash_crc32c_be() applied
+ * to the handle's nx and ny.  */
 static uint32_t footprint_matrix_interface_crc32(starpu_data_handle_t handle)
 {
-	return starpu_crc32_be(starpu_matrix_get_nx(handle), starpu_matrix_get_ny(handle));
+	return starpu_hash_crc32c_be(starpu_matrix_get_nx(handle), starpu_matrix_get_ny(handle));
 }
 
 static int matrix_compare(void *data_interface_a, void *data_interface_b)

+ 1 - 1
src/datawizard/interfaces/multiformat_interface.c

@@ -188,7 +188,7 @@ void starpu_multiformat_data_register(starpu_data_handle_t *handleptr,
 
+/* Footprint of a multiformat handle: the result of starpu_hash_crc32c_be()
+ * applied to the handle's nx, with 0 as second argument.  */
 static uint32_t footprint_multiformat_interface_crc32(starpu_data_handle_t handle)
 {
-	return starpu_crc32_be(starpu_multiformat_get_nx(handle), 0);
+	return starpu_hash_crc32c_be(starpu_multiformat_get_nx(handle), 0);
 }
 
 static int multiformat_compare(void *data_interface_a, void *data_interface_b)

+ 1 - 1
src/datawizard/interfaces/variable_interface.c

@@ -100,7 +100,7 @@ void starpu_variable_data_register(starpu_data_handle_t *handleptr, unsigned hom
 
+/* Footprint of a variable handle: the result of starpu_hash_crc32c_be()
+ * applied to the handle's element size, with 0 as second argument.  */
 static uint32_t footprint_variable_interface_crc32(starpu_data_handle_t handle)
 {
-	return starpu_crc32_be(starpu_variable_get_elemsize(handle), 0);
+	return starpu_hash_crc32c_be(starpu_variable_get_elemsize(handle), 0);
 }
 
 static int variable_compare(void *data_interface_a, void *data_interface_b)

+ 1 - 1
src/datawizard/interfaces/vector_interface.c

@@ -113,7 +113,7 @@ void starpu_vector_data_register(starpu_data_handle_t *handleptr, unsigned home_
 
+/* Footprint of a vector handle: the result of starpu_hash_crc32c_be() applied
+ * to the handle's nx, with 0 as second argument.  */
 static uint32_t footprint_vector_interface_crc32(starpu_data_handle_t handle)
 {
-	return starpu_crc32_be(starpu_vector_get_nx(handle), 0);
+	return starpu_hash_crc32c_be(starpu_vector_get_nx(handle), 0);
 }
 
 static int vector_compare(void *data_interface_a, void *data_interface_b)

+ 2 - 2
src/datawizard/memalloc.c

@@ -864,7 +864,7 @@ int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_
 
 	if (dst_node == 0)
 	{
-		void *ptr = starpu_handle_to_pointer(handle, 0);
+		void *ptr = starpu_data_handle_to_pointer(handle, 0);
 		if (ptr != NULL)
 		{
 			_starpu_data_register_ram_pointer(handle, ptr);
@@ -950,7 +950,7 @@ void _starpu_memory_display_stats_by_node(int node)
 }
 #endif
 
-void starpu_memory_display_stats(void)
+void starpu_data_display_memory_stats(void)
 {
 #ifdef STARPU_MEMORY_STATS
 	unsigned node;

+ 0 - 0
src/datawizard/sort_data_handles.c


Some files were not shown because too many files changed in this diff