Andra Hugo 13 vuotta sitten
vanhempi
commit
927b47d57d
71 muutettua tiedostoa jossa 1431 lisäystä ja 888 poistoa
  1. 15 2
      ChangeLog
  2. 5 1
      Makefile.am
  3. 27 33
      configure.ac
  4. 25 3
      doc/chapters/advanced-examples.texi
  5. 16 8
      doc/chapters/basic-api.texi
  6. 11 9
      doc/chapters/configuration.texi
  7. 36 15
      doc/chapters/mpi-support.texi
  8. 20 3
      doc/starpu.css
  9. 1 0
      include/starpu_config.h.in
  10. 3 7
      include/starpu_data_interfaces.h
  11. 1 2
      libstarpu.pc.in
  12. 0 30
      mpi/examples/Makefile.am
  13. 4 1
      mpi/examples/cholesky/mpi_cholesky.c
  14. 4 1
      mpi/examples/cholesky/mpi_cholesky_distributed.c
  15. 4 1
      mpi/examples/complex/mpi_complex.c
  16. 2 4
      mpi/examples/mpi_lu/plu_example.c
  17. 9 2
      mpi/examples/stencil/stencil5.c
  18. 8 4
      mpi/include/starpu_mpi.h
  19. 239 210
      mpi/src/starpu_mpi.c
  20. 104 7
      mpi/src/starpu_mpi_collective.c
  21. 1 3
      mpi/src/starpu_mpi_datatype.c
  22. 1 1
      mpi/src/starpu_mpi_datatype.h
  23. 5 4
      mpi/src/starpu_mpi_helper.c
  24. 230 132
      mpi/src/starpu_mpi_insert_task.c
  25. 15 6
      mpi/src/starpu_mpi_private.h
  26. 7 5
      mpi/src/starpu_mpi_stats.c
  27. 14 4
      mpi/tests/Makefile.am
  28. 6 13
      mpi/tests/block_interface.c
  29. 6 13
      mpi/tests/block_interface_pinned.c
  30. 4 0
      mpi/tests/helper.h
  31. 5 8
      mpi/tests/insert_task.c
  32. 4 7
      mpi/tests/insert_task_block.c
  33. 84 93
      mpi/tests/insert_task_cache.c
  34. 7 10
      mpi/tests/insert_task_owner.c
  35. 4 2
      mpi/tests/insert_task_owner2.c
  36. 4 2
      mpi/tests/insert_task_owner_data.c
  37. 7 8
      mpi/tests/mpi_detached_tag.c
  38. 7 8
      mpi/tests/mpi_irecv.c
  39. 7 8
      mpi/tests/mpi_irecv_detached.c
  40. 7 8
      mpi/tests/mpi_isend.c
  41. 7 8
      mpi/tests/mpi_isend_detached.c
  42. 30 11
      mpi/examples/reduction/mpi_reduction.c
  43. 10 0
      mpi/examples/reduction/mpi_reduction_kernels.c
  44. 18 3
      mpi/examples/scatter_gather/mpi_scatter_gather.c
  45. 7 8
      mpi/tests/mpi_test.c
  46. 5 3
      mpi/tests/multiple_send.c
  47. 9 7
      mpi/tests/pingpong.c
  48. 7 3
      mpi/tests/ring.c
  49. 7 3
      mpi/tests/ring_async.c
  50. 9 11
      mpi/tests/ring_async_implicit.c
  51. 1 0
      src/Makefile.am
  52. 4 3
      src/common/fxt.h
  53. 192 27
      src/common/utils.h
  54. 0 43
      src/core/jobs.c
  55. 0 3
      src/core/jobs.h
  56. 7 7
      src/core/perfmodel/perfmodel_bus.c
  57. 26 1
      src/core/perfmodel/perfmodel_history.c
  58. 1 2
      src/core/sched_policy.c
  59. 29 4
      src/core/task.c
  60. 7 37
      src/core/topology.c
  61. 9 9
      src/core/workers.c
  62. 1 1
      src/datawizard/interfaces/bcsr_interface.c
  63. 6 0
      src/datawizard/interfaces/coo_interface.c
  64. 2 1
      src/datawizard/interfaces/data_interface.c
  65. 1 1
      src/drivers/driver_common/driver_common.c
  66. 24 10
      src/profiling/bound.c
  67. 48 0
      src/util/misc.c
  68. 1 2
      starpu-1.0.pc.in
  69. 1 1
      starpufft/starpufft-double.h
  70. 1 1
      starpufft/starpufft-float.h
  71. 2 0
      tests/microbenchs/tasks_size_overhead.gp

+ 15 - 2
ChangeLog

@@ -36,8 +36,20 @@ New features:
   * SOCL
   * SOCL
         - Manual mapping of commands on specific devices is now possible
         - Manual mapping of commands on specific devices is now possible
   * New interface: COO matrix.
   * New interface: COO matrix.
-  * Communication statistics for MPI can also be enabled at execution
-    time by defining the environment variable STARPU_COMM_STATS
+  * MPI:
+        - Communication statistics for MPI can only be enabled at
+	  execution time by defining the environment variable
+	  STARPU_COMM_STATS
+        - Communication cache mechanism is enabled by default, and can
+	  only be disabled at execution time by setting the
+	  environment variable STARPU_MPI_CACHE to 0.
+        - Initialisation functions starpu_mpi_initialize_extended()
+  	  and starpu_mpi_initialize() have been made deprecated. One
+	  should now use starpu_mpi_init() which will initialise MPI
+	  by calling MPI_Init_Thread if is not already done.
+        - Collective detached operations have new parameters, a
+	  callback function and a argument. This is to be consistent
+	  with the detached point-to-point communications.
 
 
 Changes:
 Changes:
   * Fix the block filter functions.
   * Fix the block filter functions.
@@ -73,6 +85,7 @@ Small changes:
   * Fix forcing calibration of never-calibrated archs.
   * Fix forcing calibration of never-calibrated archs.
   * CUDA applications are no longer compiled with the "-arch sm_13"
   * CUDA applications are no longer compiled with the "-arch sm_13"
     option. It is specifically added to applications which need it.
     option. It is specifically added to applications which need it.
+  * Documentation is not built if necessary tools are missing
 
 
 StarPU 1.0.3 (svn revision 7379)
 StarPU 1.0.3 (svn revision 7379)
 ==============================================
 ==============================================

+ 5 - 1
Makefile.am

@@ -18,7 +18,11 @@ ACLOCAL_AMFLAGS=-I m4
 CLEANFILES = *.gcno *.gcda *.linkinfo
 CLEANFILES = *.gcno *.gcda *.linkinfo
 
 
 SUBDIRS = src
 SUBDIRS = src
-SUBDIRS += tools tests doc
+SUBDIRS += tools tests
+
+if BUILD_DOC
+SUBDIRS += doc
+endif
 
 
 if USE_MPI
 if USE_MPI
 SUBDIRS += mpi
 SUBDIRS += mpi

+ 27 - 33
configure.ac

@@ -1094,14 +1094,6 @@ else
 fi
 fi
 AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
 AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
 
 
-# Check if user wants to disable mpi cache
-AC_ARG_ENABLE(mpi-cache, AC_HELP_STRING([--enable-mpi-cache],
-			 [Enable MPI communication cache]),
-			 enable_mpi_cache=$enableval, enable_mpi_cache=true)
-if  test x$enable_mpi_cache = xtrue; then
-    	AC_DEFINE(STARPU_MPI_CACHE, [1], [enable MPI communication cache])
-fi
-
 # If the user specifically asks for it, or if we are in a developer checkout, we enable mpi check
 # If the user specifically asks for it, or if we are in a developer checkout, we enable mpi check
 AC_ARG_ENABLE(mpi-check, AC_HELP_STRING([--enable-mpi-check], [Enable execution of MPI testcases]))
 AC_ARG_ENABLE(mpi-check, AC_HELP_STRING([--enable-mpi-check], [Enable execution of MPI testcases]))
 running_mpi_check=no
 running_mpi_check=no
@@ -1156,18 +1148,6 @@ if test x$use_mpi = xyes; then
 	AC_DEFINE(STARPU_USE_MPI,[],[whether the StarPU MPI library is available])
 	AC_DEFINE(STARPU_USE_MPI,[],[whether the StarPU MPI library is available])
 fi
 fi
 
 
-AC_MSG_CHECKING(whether communication statistics should be generated)
-AC_ARG_ENABLE(comm-stats, [AS_HELP_STRING([--enable-comm-stats],
-			[enable communication statistics (only valid with the StarPU MPI library])],
-			enable_comm_stats=$enableval, enable_comm_stats=no)
-AC_MSG_RESULT($enable_comm_stats)
-AC_SUBST(STATS, $enable_comm_stats)
-AC_SUBST(STARPU_COMM_STATS, $enable_comm_stats)
-
-if test x$enable_comm_stats = xyes; then
-        AC_DEFINE(STARPU_COMM_STATS, [1], [enable communication statistics])
-fi
-
 ###############################################################################
 ###############################################################################
 #                                                                             #
 #                                                                             #
 #                               StarPU-Top                                    #
 #                               StarPU-Top                                    #
@@ -1627,7 +1607,6 @@ AC_ARG_WITH([hwloc],
 		fi
 		fi
 	],
 	],
 	[
 	[
-	AC_MSG_NOTICE("MAYBE HWLOC")
 		use_hwloc=maybe
 		use_hwloc=maybe
 		use_hwloc_from_system=yes
 		use_hwloc_from_system=yes
 	])
 	])
@@ -1676,10 +1655,7 @@ AC_MSG_RESULT($have_valid_hwloc)
 AC_SUBST(HWLOC_REQUIRES)
 AC_SUBST(HWLOC_REQUIRES)
 
 
 # is the header file f77.h available ?
 # is the header file f77.h available ?
-have_f77_h=yes
-AC_CHECK_HEADER([f77.h],,[have_f77_h=no])
-AC_MSG_CHECKING(whether header file f77.h is available)
-AC_MSG_RESULT($have_f77_h)
+AC_CHECK_HEADER([f77.h], [have_f77_h=yes], [have_f77_h=no])
 AC_SUBST(STARPU_HAVE_F77_H, $have_f77_h)
 AC_SUBST(STARPU_HAVE_F77_H, $have_f77_h)
 AM_CONDITIONAL(STARPU_HAVE_F77_H, test x$have_f77_h = xyes)
 AM_CONDITIONAL(STARPU_HAVE_F77_H, test x$have_f77_h = xyes)
 if test x$have_f77_h = xyes; then
 if test x$have_f77_h = xyes; then
@@ -1741,6 +1717,22 @@ m4_ifdef([AM_SILENT_RULES],
   AM_CONDITIONAL([STARPU_HAVE_AM111], [true]),
   AM_CONDITIONAL([STARPU_HAVE_AM111], [true]),
   AM_CONDITIONAL([STARPU_HAVE_AM111], [false]))
   AM_CONDITIONAL([STARPU_HAVE_AM111], [false]))
 
 
+##########################################
+# Documentation                          #
+##########################################
+
+enable_build_doc=yes
+AC_CHECK_PROGS([CHECK_TEXI2DVI], [texi2dvi], "no")
+if test "$CHECK_TEXI2DVI" == "no" ; then
+    enable_build_doc=no
+else
+    AC_CHECK_PROGS([CHECK_TEX], [tex], "no")
+    if test "$CHECK_TEX" == "no" ; then
+	enable_build_doc=no
+    fi
+fi
+AM_CONDITIONAL(BUILD_DOC, [test x$enable_build_doc != xno])
+
 ###############################################################################
 ###############################################################################
 #                                                                             #
 #                                                                             #
 #                                Final settings                               #
 #                                Final settings                               #
@@ -1835,22 +1827,24 @@ AC_MSG_NOTICE([
 	GPU-GPU transfers: $have_cuda_memcpy_peer
 	GPU-GPU transfers: $have_cuda_memcpy_peer
 	Allocation cache:  $enable_allocation_cache
 	Allocation cache:  $enable_allocation_cache
 
 
-	Magma enabled: $have_magma
-	BLAS library:  $blas_lib
-	hwloc:         $have_valid_hwloc
+	Magma enabled:     $have_magma
+	BLAS library:      $blas_lib
+	hwloc:             $have_valid_hwloc
 	FxT trace enabled: $use_fxt
 	FxT trace enabled: $use_fxt
 	StarPU-Top:        $build_starpu_top
 	StarPU-Top:        $build_starpu_top
 
 
+        Documentation:     $enable_build_doc
+        Examples:          $enable_build_examples
+
 	StarPU Extensions:
 	StarPU Extensions:
-	       MPI enabled:   $use_mpi
-	       MPI test suite: $running_mpi_check
-	       FFT Support: $fft_support
-	       GCC plug-in: $build_gcc_plugin
+	       MPI enabled:                                 $use_mpi
+	       MPI test suite:                              $running_mpi_check
+	       FFT Support:                                 $fft_support
+	       GCC plug-in:                                 $build_gcc_plugin
 	       GCC plug-in test suite (requires GNU Guile): $run_gcc_plugin_test_suite
 	       GCC plug-in test suite (requires GNU Guile): $run_gcc_plugin_test_suite
 	       SOCL enabled:  $build_socl
 	       SOCL enabled:  $build_socl
                Scheduler Hypervisor: $build_sched_ctx_hypervisor
                Scheduler Hypervisor: $build_sched_ctx_hypervisor
                SOCL test suite: $run_socl_check
                SOCL test suite: $run_socl_check
-
 ])
 ])
 
 
 if test "$build_socl" = "yes" -a "$run_socl_check" = "no" ; then
 if test "$build_socl" = "yes" -a "$run_socl_check" = "no" ; then

+ 25 - 3
doc/chapters/advanced-examples.texi

@@ -381,13 +381,15 @@ tasks with varying size so that the regression can be computed. StarPU will not
 trust the regression unless there is at least 10% difference between the minimum
 trust the regression unless there is at least 10% difference between the minimum
 and maximum observed input size. It can be useful to set the
 and maximum observed input size. It can be useful to set the
 @code{STARPU_CALIBRATE} environment variable to @code{1} and run the application
 @code{STARPU_CALIBRATE} environment variable to @code{1} and run the application
-on varying input sizes, so as to feed the performance model for a variety of
-inputs, or to provide the measurements explictly by using
+on varying input sizes with @code{STARPU_SCHED} set to @code{eager} scheduler,
+so as to feed the performance model for a variety of
+inputs. The application can also provide the measurements explictly by using
 @code{starpu_perfmodel_update_history}. The @code{starpu_perfmodel_display} and
 @code{starpu_perfmodel_update_history}. The @code{starpu_perfmodel_display} and
 @code{starpu_perfmodel_plot}
 @code{starpu_perfmodel_plot}
 tools can be used to observe how much the performance model is calibrated (@pxref{Performance model calibration}); when
 tools can be used to observe how much the performance model is calibrated (@pxref{Performance model calibration}); when
 their output look good, @code{STARPU_CALIBRATE} can be reset to @code{0} to let
 their output look good, @code{STARPU_CALIBRATE} can be reset to @code{0} to let
-StarPU use the resulting performance model without recording new measures. If
+StarPU use the resulting performance model without recording new measures, and
+@code{STARPU_SCHED} can be set to @code{heft} to benefit from the performance models. If
 the data input sizes vary a lot, it is really important to set
 the data input sizes vary a lot, it is really important to set
 @code{STARPU_CALIBRATE} to @code{0}, otherwise StarPU will continue adding the
 @code{STARPU_CALIBRATE} to @code{0}, otherwise StarPU will continue adding the
 measures, and result with a very big performance model, which will take time a
 measures, and result with a very big performance model, which will take time a
@@ -724,6 +726,26 @@ int dots(starpu_data_handle_t v1, starpu_data_handle_t v2,
 The @code{cg} example also uses reduction for the blocked gemv kernel, leading
 The @code{cg} example also uses reduction for the blocked gemv kernel, leading
 to yet more relaxed dependencies and more parallelism.
 to yet more relaxed dependencies and more parallelism.
 
 
+STARPU_REDUX can also be passed to @code{starpu_mpi_insert_task} in the MPI
+case. That will however not produce any MPI communication, but just pass
+STARPU_REDUX to the underlying @code{starpu_insert_task}. It is up to the
+application to call @code{starpu_mpi_redux_data}, which posts tasks that will
+reduce the partial results among MPI nodes into the MPI node which owns the
+data. For instance, some hypothetical application which collects partial results
+into data @code{res}, then uses it for other computation, before looping again
+with a new reduction:
+
+@smallexample
+@{
+    for (i = 0; i < 100; i++) @{
+        starpu_mpi_insert_task(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
+        starpu_mpi_insert_task(MPI_COMM_WORLD, &work, STARPU_RW, A, STARPU_R, B, STARPU_REDUX, res, 0);
+        starpu_mpi_redux_data(MPI_COMM_WORLD, res);
+        starpu_mpi_insert_task(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);
+    @}
+@}
+@end smallexample
+
 @node Temporary buffers
 @node Temporary buffers
 @section Temporary buffers
 @section Temporary buffers
 
 

+ 16 - 8
doc/chapters/basic-api.texi

@@ -417,7 +417,7 @@ extensions at least to define the initial value.  For now, data to be
 used in @code{SCRATCH} mode should be registered with node @code{-1} and
 used in @code{SCRATCH} mode should be registered with node @code{-1} and
 a @code{NULL} pointer, since the value of the provided buffer is simply
 a @code{NULL} pointer, since the value of the provided buffer is simply
 ignored for now.
 ignored for now.
-@item @code{STARPU_REDUX} reduction mode.
+@item @code{STARPU_REDUX}: reduction mode. TODO!
 @end table
 @end table
 @end deftp
 @end deftp
 
 
@@ -465,6 +465,11 @@ This is the same as starpu_data_unregister, except that StarPU does not put back
 a valid copy into the home node, in the buffer that was initially registered.
 a valid copy into the home node, in the buffer that was initially registered.
 @end deftypefun
 @end deftypefun
 
 
+@deftypefun void starpu_data_unregister_submit (starpu_data_handle_t @var{handle})
+Destroy the data handle once it is not needed anymore by any submitted
+task. No coherency is assumed.
+@end deftypefun
+
 @deftypefun void starpu_data_invalidate (starpu_data_handle_t @var{handle})
 @deftypefun void starpu_data_invalidate (starpu_data_handle_t @var{handle})
 Destroy all replicates of the data handle. After data invalidation, the first
 Destroy all replicates of the data handle. After data invalidation, the first
 access to the handle must be performed in write-only mode. Accessing an
 access to the handle must be performed in write-only mode. Accessing an
@@ -752,11 +757,14 @@ Return the unique identifier of the interface associated with the given @var{han
 Return the size of the data associated with @var{handle}
 Return the size of the data associated with @var{handle}
 @end deftypefun
 @end deftypefun
 
 
-@deftypefun int starpu_handle_pack_data (starpu_data_handle_t @var{handle}, {void **}@var{ptr})
-Allocates a buffer large enough at @var{ptr} and copy to the newly
-allocated buffer the data associated to @var{handle}. The interface of
-the data registered at @var{handle} must define a packing operation
-(@pxref{struct starpu_data_interface_ops}).
+@deftypefun int starpu_handle_pack_data (starpu_data_handle_t @var{handle}, {void **}@var{ptr}, {size_t *}@var{count})
+Execute the packing operation of the interface of the data registered
+at @var{handle} (@pxref{struct starpu_data_interface_ops}). This
+packing operation must allocate a buffer large enough at @var{ptr} and
+copy into the newly allocated buffer the data associated to
+@var{handle}.
+The function also sets @var{count} to the size of the data handle by calling
+@code{starpu_handle_get_size()}.
 @end deftypefun
 @end deftypefun
 
 
 @deftypefun int starpu_handle_unpack_data (starpu_data_handle_t @var{handle}, {void *}@var{ptr})
 @deftypefun int starpu_handle_unpack_data (starpu_data_handle_t @var{handle}, {void *}@var{ptr})
@@ -2428,7 +2436,7 @@ This function returns a pointer to device properties for worker @var{workerid}
 (assumed to be a CUDA worker).
 (assumed to be a CUDA worker).
 @end deftypefun
 @end deftypefun
 
 
-@deftypefun size_t starpu_cuda_get_global_mem_size (int @var{devid})
+@deftypefun size_t starpu_cuda_get_global_mem_size (unsigned @var{devid})
 Return the size of the global memory of CUDA device @var{devid}.
 Return the size of the global memory of CUDA device @var{devid}.
 @end deftypefun
 @end deftypefun
 
 
@@ -2452,7 +2460,7 @@ successfull. It returns 0 if the synchronous copy was successful, or
 fails otherwise.
 fails otherwise.
 @end deftypefun
 @end deftypefun
 
 
-@deftypefun void starpu_cuda_set_device (int @var{devid})
+@deftypefun void starpu_cuda_set_device (unsigned @var{devid})
 Calls @code{cudaSetDevice(devid)} or @code{cudaGLSetGLDevice(devid)}, according to
 Calls @code{cudaSetDevice(devid)} or @code{cudaGLSetGLDevice(devid)}, according to
 whether @code{devid} is among the @code{cuda_opengl_interoperability} field of
 whether @code{devid} is among the @code{cuda_opengl_interoperability} field of
 the @code{starpu_conf} structure.
 the @code{starpu_conf} structure.

+ 11 - 9
doc/chapters/configuration.texi

@@ -172,11 +172,6 @@ enabled when the GCC compiler provides a plug-in support.
 Use the @command{mpicc} compiler at @var{path}, for starpumpi
 Use the @command{mpicc} compiler at @var{path}, for starpumpi
 (@pxref{StarPU MPI support}).
 (@pxref{StarPU MPI support}).
 
 
-@item --enable-comm-stats
-@anchor{enable-comm-stats}
-Enable communication statistics for starpumpi (@pxref{StarPU MPI
-support}).
-
 @end table
 @end table
 
 
 @node Advanced configuration
 @node Advanced configuration
@@ -425,11 +420,17 @@ THE SOCL test suite is only run when the environment variable
 of the libOpenCL.so file of the OCL ICD implementation.
 of the libOpenCL.so file of the OCL ICD implementation.
 
 
 @item @code{STARPU_COMM_STATS}
 @item @code{STARPU_COMM_STATS}
+@anchor{STARPU_COMM_STATS}
 Communication statistics for starpumpi (@pxref{StarPU MPI support})
 Communication statistics for starpumpi (@pxref{StarPU MPI support})
 will be enabled when the environment variable @code{STARPU_COMM_STATS}
 will be enabled when the environment variable @code{STARPU_COMM_STATS}
-is defined. The statistics can also be enabled by configuring StarPU
-with the option @code{--enable-comm-stats} (@pxref{enable-comm-stats}).
-
+is defined to an value other than 0.
+
+@item @code{STARPU_MPI_CACHE}
+@anchor{STARPU_MPI_CACHE}
+Communication cache for starpumpi (@pxref{StarPU MPI support}) will be
+disabled when the environment variable @code{STARPU_MPI_CACHE} is set
+to 0. It is enabled by default or for any other values of the variable
+@code{STARPU_MPI_CACHE}.
 @end table
 @end table
 
 
 @node Misc
 @node Misc
@@ -439,7 +440,8 @@ with the option @code{--enable-comm-stats} (@pxref{enable-comm-stats}).
 
 
 @item @code{STARPU_SILENT}
 @item @code{STARPU_SILENT}
 This variable allows to disable verbose mode at runtime when StarPU
 This variable allows to disable verbose mode at runtime when StarPU
-has been configured with the option @code{--enable-verbose}.
+has been configured with the option @code{--enable-verbose}. It also
+disables the display of StarPU information and warning messages.
 
 
 @item @code{STARPU_LOGFILENAME}
 @item @code{STARPU_LOGFILENAME}
 This variable specifies in which file the debugging output should be saved to.
 This variable specifies in which file the debugging output should be saved to.

+ 36 - 15
doc/chapters/mpi-support.texi

@@ -44,25 +44,33 @@ Also pass the @code{--static} option if the application is to be linked statical
 
 
 @subsection Initialisation
 @subsection Initialisation
 
 
+@deftypefun int starpu_mpi_init (int *@var{argc}, char ***@var{argv})
+Initializes the starpumpi library. If MPI is not already initialized,
+it will be by calling @code{MPI_Init_Thread(argc, argv, MPI_THREAD_SERIALIZED, ...)}.
+@end deftypefun
+
 @deftypefun int starpu_mpi_initialize (void)
 @deftypefun int starpu_mpi_initialize (void)
-Initializes the starpumpi library. This must be called between calling
-@code{starpu_init} and other @code{starpu_mpi} functions. This
-function does not call @code{MPI_Init}, it should be called beforehand.
+This function has been made deprecated. One should use instead the
+function @code{starpu_mpi_init()} defined above.
 @end deftypefun
 @end deftypefun
 
 
 @deftypefun int starpu_mpi_initialize_extended (int *@var{rank}, int *@var{world_size})
 @deftypefun int starpu_mpi_initialize_extended (int *@var{rank}, int *@var{world_size})
-Initializes the starpumpi library. This must be called between calling
-@code{starpu_init} and other @code{starpu_mpi} functions.
-This function calls @code{MPI_Init}, and therefore should be prefered
-to the previous one for MPI implementations which are not thread-safe.
-Returns the current MPI node rank and world size.
+This function has been made deprecated. One should use instead the
+function @code{starpu_mpi_init()} defined above.
 @end deftypefun
 @end deftypefun
 
 
 @deftypefun int starpu_mpi_shutdown (void)
 @deftypefun int starpu_mpi_shutdown (void)
 Cleans the starpumpi library. This must be called between calling
 Cleans the starpumpi library. This must be called between calling
-@code{starpu_mpi} functions and @code{starpu_shutdown}.
-@code{MPI_Finalize} will be called if StarPU-MPI has been initialized
-by calling @code{starpu_mpi_initialize_extended}.
+@code{starpu_mpi} functions and @code{starpu_shutdown()}.
+@code{MPI_Finalize()} will be called if StarPU-MPI has been initialized
+by @code{starpu_mpi_init()}.
+@end deftypefun
+
+@deftypefun void starpu_mpi_comm_amounts_retrieve (size_t *@var{comm_amounts})
+Retrieve the current amount of communications from the current node in
+the array @code{comm_amounts} which must have a size greater or equal
+to the world size. Communications statistics must be enabled
+(@pxref{STARPU_COMM_STATS}).
 @end deftypefun
 @end deftypefun
 
 
 @subsection Communication
 @subsection Communication
@@ -375,8 +383,10 @@ latter receives them.
 written data back to their owners.
 written data back to their owners.
 @end enumerate
 @end enumerate
 
 
-The algorithm also includes a cache mechanism that allows not to send
-data twice to the same MPI node, unless the data has been modified.
+The algorithm also includes a communication cache mechanism that
+allows not to send data twice to the same MPI node, unless the data
+has been modified. The cache can be disabled
+(@pxref{STARPU_MPI_CACHE}).
 
 
 @end deftypefun
 @end deftypefun
 
 
@@ -469,21 +479,32 @@ each task, only the MPI node which owns the data being written to (here,
 @code{data_handles[x][y]}) will actually run the task. The other MPI nodes will
 @code{data_handles[x][y]}) will actually run the task. The other MPI nodes will
 automatically send the required data.
 automatically send the required data.
 
 
+This can be a concern with a growing number of nodes. To avoid this, the
+application can prune the task for loops according to the data distribution,
+so as to only submit tasks on nodes which have to care about them (either to
+execute them, or to send the required data).
+
 @node MPI Collective Operations
 @node MPI Collective Operations
 @section MPI Collective Operations
 @section MPI Collective Operations
 
 
-@deftypefun int starpu_mpi_scatter_detached (starpu_data_handle_t *@var{data_handles}, int @var{count}, int @var{root}, MPI_Comm @var{comm})
+@deftypefun int starpu_mpi_scatter_detached (starpu_data_handle_t *@var{data_handles}, int @var{count}, int @var{root}, MPI_Comm @var{comm}, {void (*}@var{scallback})(void *), {void *}@var{sarg}, {void (*}@var{rcallback})(void *), {void *}@var{rarg})
 Scatter data among processes of the communicator based on the ownership of
 Scatter data among processes of the communicator based on the ownership of
 the data. For each data of the array @var{data_handles}, the
 the data. For each data of the array @var{data_handles}, the
 process @var{root} sends the data to the process owning this data.
 process @var{root} sends the data to the process owning this data.
 Processes receiving data must have valid data handles to receive them.
 Processes receiving data must have valid data handles to receive them.
+On completion of the collective communication, the @var{scallback} function is
+called with the argument @var{sarg} on the process @var{root}, the @var{rcallback} function is
+called with the argument @var{rarg} on any other process.
 @end deftypefun
 @end deftypefun
 
 
-@deftypefun int starpu_mpi_gather_detached (starpu_data_handle_t *@var{data_handles}, int @var{count}, int @var{root}, MPI_Comm @var{comm})
+@deftypefun int starpu_mpi_gather_detached (starpu_data_handle_t *@var{data_handles}, int @var{count}, int @var{root}, MPI_Comm @var{comm}, {void (*}@var{scallback})(void *), {void *}@var{sarg}, {void (*}@var{rcallback})(void *), {void *}@var{rarg})
 Gather data from the different processes of the communicator onto the
 Gather data from the different processes of the communicator onto the
 process @var{root}. Each process owning data handle in the array
 process @var{root}. Each process owning data handle in the array
 @var{data_handles} will send them to the process @var{root}. The
 @var{data_handles} will send them to the process @var{root}. The
 process @var{root} must have valid data handles to receive the data.
 process @var{root} must have valid data handles to receive the data.
+On completion of the collective communication, the @var{rcallback} function is
+called with the argument @var{rarg} on the process @var{root}, the @var{scallback} function is
+called with the argument @var{sarg} on any other process.
 @end deftypefun
 @end deftypefun
 
 
 @page
 @page

+ 20 - 3
doc/starpu.css

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009-2011  Université de Bordeaux 1
  * Copyright (C) 2009-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * Permission is granted to copy, distribute and/or modify this document
  * Permission is granted to copy, distribute and/or modify this document
  * under the terms of the GNU Free Documentation License, Version 1.3
  * under the terms of the GNU Free Documentation License, Version 1.3
@@ -11,7 +11,7 @@
  */
  */
 
 
 body {
 body {
-	font-size: 13px;
+	font-family: sans-serif;
 /*	margin-top: 0px; */
 /*	margin-top: 0px; */
 }
 }
 
 
@@ -39,7 +39,7 @@ div.node hr.node {
 h1 {
 h1 {
 	font: bold normal 2.5em sans-serif ;
 	font: bold normal 2.5em sans-serif ;
 	margin: 0px;
 	margin: 0px;
-	color: #0020a0;
+	color: rgb(226,0,38);
 }
 }
 h1.sub {
 h1.sub {
 	font: bold normal 2em sans-serif ;
 	font: bold normal 2em sans-serif ;
@@ -141,3 +141,20 @@ p.updated {
 	font-size: 10px;
 	font-size: 10px;
 	font-style: italic;
 	font-style: italic;
 }
 }
+
+div.contents {
+	margin-top: 12px;
+	margin-bottom: 3px;
+	font-variant: small-caps;
+	padding-left: 1em;
+	padding-right: 1em;
+	padding-top: 1px;
+	padding-bottom: 1px;
+	margin-top: 12px;
+	margin-bottom: 12px;
+	margin-top:0px;
+	margin-left: auto;
+	margin-right: auto;
+	border-top: 4px solid rgb(204,209,222);
+	border-bottom: 4px solid rgb(204,209,222);
+}

+ 1 - 0
include/starpu_config.h.in

@@ -79,6 +79,7 @@
 #undef STARPU_HAVE_LIBNUMA
 #undef STARPU_HAVE_LIBNUMA
 
 
 #undef STARPU_HAVE_WINDOWS
 #undef STARPU_HAVE_WINDOWS
+#undef STARPU_HAVE_UNSETENV
 
 
 #ifdef _MSC_VER
 #ifdef _MSC_VER
 typedef long starpu_ssize_t;
 typedef long starpu_ssize_t;

+ 3 - 7
include/starpu_data_interfaces.h

@@ -132,7 +132,7 @@ struct starpu_data_interface_ops
 	int is_multiformat;
 	int is_multiformat;
 	struct starpu_multiformat_data_interface_ops* (*get_mf_ops)(void *data_interface);
 	struct starpu_multiformat_data_interface_ops* (*get_mf_ops)(void *data_interface);
 
 
-	/* Pack the data handle into a contiguous buffer at the address ptr */
+	/* Pack the data handle into a contiguous buffer at the address ptr and store the size of the buffer in count */
 	int (*pack_data)(starpu_data_handle_t handle, uint32_t node, void **ptr);
 	int (*pack_data)(starpu_data_handle_t handle, uint32_t node, void **ptr);
 	/* Unpack the data handle from the contiguous buffer at the address ptr */
 	/* Unpack the data handle from the contiguous buffer at the address ptr */
 	int (*unpack_data)(starpu_data_handle_t handle, uint32_t node, void *ptr);
 	int (*unpack_data)(starpu_data_handle_t handle, uint32_t node, void *ptr);
@@ -209,11 +209,7 @@ struct starpu_coo_interface
 	size_t    elemsize;
 	size_t    elemsize;
 };
 };
 
 
-void
-starpu_coo_data_register(starpu_data_handle_t *handleptr, uint32_t home_node,
-			 uint32_t nx, uint32_t ny, uint32_t n_values,
-			 uint32_t *columns, uint32_t *rows,
-			 uintptr_t values, size_t elemsize);
+void starpu_coo_data_register(starpu_data_handle_t *handleptr, uint32_t home_node, uint32_t nx, uint32_t ny, uint32_t n_values, uint32_t *columns, uint32_t *rows, uintptr_t values, size_t elemsize);
 
 
 #define STARPU_COO_GET_COLUMNS(interface) \
 #define STARPU_COO_GET_COLUMNS(interface) \
 	(((struct starpu_coo_interface *)(interface))->columns)
 	(((struct starpu_coo_interface *)(interface))->columns)
@@ -438,7 +434,7 @@ void starpu_multiformat_data_register(starpu_data_handle_t *handle, uint32_t hom
 
 
 enum starpu_data_interface_id starpu_handle_get_interface_id(starpu_data_handle_t handle);
 enum starpu_data_interface_id starpu_handle_get_interface_id(starpu_data_handle_t handle);
 
 
-int starpu_handle_pack_data(starpu_data_handle_t handle, void **ptr);
+int starpu_handle_pack_data(starpu_data_handle_t handle, void **ptr, size_t *count);
 int starpu_handle_unpack_data(starpu_data_handle_t handle, void *ptr);
 int starpu_handle_unpack_data(starpu_data_handle_t handle, void *ptr);
 size_t starpu_handle_get_size(starpu_data_handle_t handle);
 size_t starpu_handle_get_size(starpu_data_handle_t handle);
 
 

+ 1 - 2
libstarpu.pc.in

@@ -25,5 +25,4 @@ Version: @PACKAGE_VERSION@
 Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@ -DSTARPU_USE_DEPRECATED_API
 Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@ -DSTARPU_USE_DEPRECATED_API
 Libs: -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_LDFLAGS@ @STARPU_OPENCL_LDFLAGS@ @STARPU_SCHED_CTX_HYPERVISOR@
 Libs: -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_LDFLAGS@ @STARPU_OPENCL_LDFLAGS@ @STARPU_SCHED_CTX_HYPERVISOR@
 Libs.private: @LDFLAGS@ @LIBS@
 Libs.private: @LDFLAGS@ @LIBS@
-Requires: @HWLOC_REQUIRES@
-#Requires.private: @GORDON_REQUIRES@
+Requires: @HWLOC_REQUIRES@

+ 0 - 30
mpi/examples/Makefile.am

@@ -153,36 +153,6 @@ check_PROGRAMS +=					\
 	cholesky/mpi_cholesky_distributed
 	cholesky/mpi_cholesky_distributed
 endif
 endif
 
 
-########################
-# Scatter Gather       #
-########################
-
-examplebin_PROGRAMS +=		\
-	scatter_gather/mpi_scatter_gather
-
-scatter_gather_mpi_scatter_gather_LDADD =	\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-
-check_PROGRAMS +=		\
-	scatter_gather/mpi_scatter_gather
-
-###################
-# Reduction       #
-###################
-
-examplebin_PROGRAMS +=		\
-	reduction/mpi_reduction
-
-reduction_mpi_reduction_SOURCES =		\
-	reduction/mpi_reduction.c		\
-	reduction/mpi_reduction_kernels.c
-
-reduction_mpi_reduction_LDADD =	\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-
-check_PROGRAMS +=		\
-	reduction/mpi_reduction
-
 ###################
 ###################
 # complex example #
 # complex example #
 ###################
 ###################

+ 4 - 1
mpi/examples/cholesky/mpi_cholesky.c

@@ -43,7 +43,10 @@ int main(int argc, char **argv)
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
-	starpu_mpi_initialize_extended(&rank, &nodes);
+	starpu_mpi_init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
+
 	starpu_helper_cublas_init();
 	starpu_helper_cublas_init();
 
 
 	if (dblockx == -1 || dblocky == -1)
 	if (dblockx == -1 || dblocky == -1)

+ 4 - 1
mpi/examples/cholesky/mpi_cholesky_distributed.c

@@ -42,7 +42,10 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	starpu_mpi_initialize_extended(&rank, &nodes);
+	ret = starpu_mpi_init(&argc, &argv);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
 	starpu_helper_cublas_init();
 	starpu_helper_cublas_init();
 
 
 	if (dblockx == -1 || dblocky == -1)
 	if (dblockx == -1 || dblocky == -1)

+ 4 - 1
mpi/examples/complex/mpi_complex.c

@@ -25,7 +25,10 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	starpu_mpi_initialize_extended(&rank, &nodes);
+	ret = starpu_mpi_init(&argc, &argv);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
 
 
 	if (nodes < 2)
 	if (nodes < 2)
 	{
 	{

+ 2 - 4
mpi/examples/mpi_lu/plu_example.c

@@ -402,7 +402,6 @@ int main(int argc, char **argv)
 	int rank;
 	int rank;
 	int world_size;
 	int world_size;
 
 
-#if 0
 	/*
 	/*
 	 *	Initialization
 	 *	Initialization
 	 */
 	 */
@@ -415,10 +414,9 @@ int main(int argc, char **argv)
 		fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
 		fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
 	if (thread_support < MPI_THREAD_FUNNELED)
 	if (thread_support < MPI_THREAD_FUNNELED)
 		fprintf(stderr,"Warning: MPI does not have thread support!\n");
 		fprintf(stderr,"Warning: MPI does not have thread support!\n");
-	
+
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
-#endif
 
 
 	starpu_srand48((long int)time(NULL));
 	starpu_srand48((long int)time(NULL));
 
 
@@ -430,7 +428,7 @@ int main(int argc, char **argv)
 	/* We disable sequential consistency in this example */
 	/* We disable sequential consistency in this example */
 	starpu_data_set_default_sequential_consistency_flag(0);
 	starpu_data_set_default_sequential_consistency_flag(0);
 
 
-	starpu_mpi_initialize_extended(&rank, &world_size);
+	starpu_mpi_init(NULL, NULL);
 
 
 	STARPU_ASSERT(p*q == world_size);
 	STARPU_ASSERT(p*q == world_size);
 
 

+ 9 - 2
mpi/examples/stencil/stencil5.c

@@ -37,7 +37,11 @@ struct starpu_codelet stencil5_cl =
 	.modes = {STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
 	.modes = {STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
 };
 };
 
 
-#define NITER_DEF 500
+#ifdef STARPU_QUICK_CHECK
+#  define NITER_DEF	10
+#else
+#  define NITER_DEF	500
+#endif
 #define X         20
 #define X         20
 #define Y         20
 #define Y         20
 
 
@@ -78,7 +82,10 @@ int main(int argc, char **argv)
 
 
 	int ret = starpu_init(NULL);
 	int ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	starpu_mpi_initialize_extended(&my_rank, &size);
+	starpu_mpi_init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
         parse_args(argc, argv);
         parse_args(argc, argv);
 
 
         for(x = 0; x < X; x++)
         for(x = 0; x < X; x++)

+ 8 - 4
mpi/include/starpu_mpi.h

@@ -39,8 +39,10 @@ int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int
 int starpu_mpi_wait(starpu_mpi_req *req, MPI_Status *status);
 int starpu_mpi_wait(starpu_mpi_req *req, MPI_Status *status);
 int starpu_mpi_test(starpu_mpi_req *req, int *flag, MPI_Status *status);
 int starpu_mpi_test(starpu_mpi_req *req, int *flag, MPI_Status *status);
 int starpu_mpi_barrier(MPI_Comm comm);
 int starpu_mpi_barrier(MPI_Comm comm);
-int starpu_mpi_initialize(void);
-int starpu_mpi_initialize_extended(int *rank, int *world_size);
+int starpu_mpi_init(int *argc, char ***argv);
+
+int starpu_mpi_initialize(void) STARPU_DEPRECATED;
+int starpu_mpi_initialize_extended(int *rank, int *world_size) STARPU_DEPRECATED;
 int starpu_mpi_shutdown(void);
 int starpu_mpi_shutdown(void);
 
 
 int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...);
 int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...);
@@ -48,8 +50,8 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg);
 void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg);
 void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle);
 void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle);
 
 
-int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm);
-int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm);
+int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg);
+int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg);
 
 
 /* Some helper functions */
 /* Some helper functions */
 
 
@@ -62,6 +64,8 @@ int starpu_mpi_irecv_detached_unlock_tag(starpu_data_handle_t data_handle, int s
 int starpu_mpi_isend_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *dest, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag);
 int starpu_mpi_isend_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *dest, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag);
 int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *source, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag);
 int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *source, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag);
 
 
+/* retrieve the current amount of communications from the current node */
+void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts);
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }
 #endif
 #endif

+ 239 - 210
mpi/src/starpu_mpi.c

@@ -28,8 +28,11 @@
  * configuration) */
  * configuration) */
 //#define USE_STARPU_ACTIVITY	1
 //#define USE_STARPU_ACTIVITY	1
 
 
-static void submit_mpi_req(void *arg);
-static void handle_request_termination(struct _starpu_mpi_req *req);
+static void _starpu_mpi_submit_new_mpi_request(void *arg);
+static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req);
+#ifdef STARPU_MPI_VERBOSE
+static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type);
+#endif
 
 
 /* The list of requests that have been newly submitted by the application */
 /* The list of requests that have been newly submitted by the application */
 static struct _starpu_mpi_req_list *new_requests;
 static struct _starpu_mpi_req_list *new_requests;
@@ -50,52 +53,27 @@ static int running = 0;
 static pthread_mutex_t mutex_posted_requests;
 static pthread_mutex_t mutex_posted_requests;
 static int posted_requests = 0, newer_requests, barrier_running = 0;
 static int posted_requests = 0, newer_requests, barrier_running = 0;
 
 
-#define INC_POSTED_REQUESTS(value) { _STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; _STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); }
+#define _STARPU_MPI_INC_POSTED_REQUESTS(value) { _STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; _STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); }
 
 
-/*
- *	Isend
- */
 
 
-static void starpu_mpi_isend_func(struct _starpu_mpi_req *req)
+/********************************************************/
+/*                                                      */
+/*  Send/Receive functionalities                        */
+/*                                                      */
+/********************************************************/
+
+static struct _starpu_mpi_req *_starpu_mpi_isend_irecv_common(starpu_data_handle_t data_handle,
+							      int srcdst, int mpi_tag, MPI_Comm comm,
+							      unsigned detached, void (*callback)(void *), void *arg,
+							      enum _starpu_mpi_request_type request_type, void (*func)(struct _starpu_mpi_req *),
+							      enum starpu_access_mode mode)
 {
 {
-	int count;
 
 
         _STARPU_MPI_LOG_IN();
         _STARPU_MPI_LOG_IN();
-
-	req->needs_unpacking = starpu_mpi_handle_to_datatype(req->data_handle, &req->datatype, &count);
-	if (req->needs_unpacking)
-		starpu_handle_pack_data(req->data_handle, &req->ptr);
-	else
-		req->ptr = starpu_handle_get_local_ptr(req->data_handle);
-	STARPU_ASSERT(req->ptr);
-
-        _STARPU_MPI_DEBUG("post MPI isend tag %d dst %d ptr %p datatype %p count %d req %p\n", req->mpi_tag, req->srcdst, req->ptr, req->datatype, count, &req->request);
-
-	_starpu_mpi_comm_amounts_inc(req->comm, req->srcdst, req->datatype, count);
-
-        req->ret = MPI_Isend(req->ptr, count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
-        STARPU_ASSERT(req->ret == MPI_SUCCESS);
-
-	TRACE_MPI_ISEND(req->srcdst, req->mpi_tag, 0);
-
-	/* somebody is perhaps waiting for the MPI request to be posted */
-	_STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
-	req->submitted = 1;
-	_STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
-	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
-        _STARPU_MPI_LOG_OUT();
-}
-
-static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t data_handle,
-							int dest, int mpi_tag, MPI_Comm comm,
-							unsigned detached, void (*callback)(void *), void *arg)
-{
 	struct _starpu_mpi_req *req = calloc(1, sizeof(struct _starpu_mpi_req));
 	struct _starpu_mpi_req *req = calloc(1, sizeof(struct _starpu_mpi_req));
 	STARPU_ASSERT(req);
 	STARPU_ASSERT(req);
 
 
-        _STARPU_MPI_LOG_IN();
-
-        INC_POSTED_REQUESTS(1);
+        _STARPU_MPI_INC_POSTED_REQUESTS(1);
 
 
 	/* Initialize the request structure */
 	/* Initialize the request structure */
 	req->submitted = 0;
 	req->submitted = 0;
@@ -103,27 +81,74 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t dat
 	_STARPU_PTHREAD_MUTEX_INIT(&req->req_mutex, NULL);
 	_STARPU_PTHREAD_MUTEX_INIT(&req->req_mutex, NULL);
 	_STARPU_PTHREAD_COND_INIT(&req->req_cond, NULL);
 	_STARPU_PTHREAD_COND_INIT(&req->req_cond, NULL);
 
 
-	req->request_type = SEND_REQ;
+	req->request_type = request_type;
 
 
 	req->data_handle = data_handle;
 	req->data_handle = data_handle;
-	req->srcdst = dest;
+	req->srcdst = srcdst;
 	req->mpi_tag = mpi_tag;
 	req->mpi_tag = mpi_tag;
 	req->comm = comm;
 	req->comm = comm;
-	req->func = starpu_mpi_isend_func;
 
 
 	req->detached = detached;
 	req->detached = detached;
 	req->callback = callback;
 	req->callback = callback;
 	req->callback_arg = arg;
 	req->callback_arg = arg;
 
 
+	req->func = func;
+
 	/* Asynchronously request StarPU to fetch the data in main memory: when
 	/* Asynchronously request StarPU to fetch the data in main memory: when
-	 * it is available in main memory, submit_mpi_req(req) is called and
+	 * it is available in main memory, _starpu_mpi_submit_new_mpi_request(req) is called and
 	 * the request is actually submitted  */
 	 * the request is actually submitted  */
-	starpu_data_acquire_cb(data_handle, STARPU_R, submit_mpi_req, (void *)req);
+	starpu_data_acquire_cb(data_handle, mode, _starpu_mpi_submit_new_mpi_request, (void *)req);
 
 
         _STARPU_MPI_LOG_OUT();
         _STARPU_MPI_LOG_OUT();
 	return req;
 	return req;
 }
 }
 
 
+/********************************************************/
+/*                                                      */
+/*  Send functionalities                                */
+/*                                                      */
+/********************************************************/
+
+static void _starpu_mpi_isend_func(struct _starpu_mpi_req *req)
+{
+        _STARPU_MPI_LOG_IN();
+
+	req->needs_unpacking = starpu_mpi_handle_to_datatype(req->data_handle, &req->datatype);
+	if (req->needs_unpacking)
+	{
+		starpu_handle_pack_data(req->data_handle, &req->ptr, &req->count);
+	}
+	else
+	{
+		req->count = 1;
+		req->ptr = starpu_handle_get_local_ptr(req->data_handle);
+	}
+	STARPU_ASSERT(req->ptr);
+
+        _STARPU_MPI_DEBUG("post MPI isend tag %d dst %d ptr %p datatype %p count %d req %p\n", req->mpi_tag, req->srcdst, req->ptr, req->datatype, req->count, &req->request);
+
+	_starpu_mpi_comm_amounts_inc(req->comm, req->srcdst, req->datatype, req->count);
+
+        req->ret = MPI_Isend(req->ptr, req->count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
+        STARPU_ASSERT(req->ret == MPI_SUCCESS);
+
+	TRACE_MPI_ISEND(req->srcdst, req->mpi_tag, 0);
+
+	/* somebody is perhaps waiting for the MPI request to be posted */
+	_STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
+	req->submitted = 1;
+	_STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
+        _STARPU_MPI_LOG_OUT();
+}
+
+static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t data_handle,
+							int dest, int mpi_tag, MPI_Comm comm,
+							unsigned detached, void (*callback)(void *), void *arg)
+{
+	return _starpu_mpi_isend_irecv_common(data_handle, dest, mpi_tag, comm, detached, callback, arg, SEND_REQ, _starpu_mpi_isend_func, STARPU_R);
+}
+
 int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, int mpi_tag, MPI_Comm comm)
 int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, int mpi_tag, MPI_Comm comm)
 {
 {
         _STARPU_MPI_LOG_IN();
         _STARPU_MPI_LOG_IN();
@@ -139,12 +164,8 @@ int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *public_re
 	return 0;
 	return 0;
 }
 }
 
 
-/*
- *	Isend (detached)
- */
-
 int starpu_mpi_isend_detached(starpu_data_handle_t data_handle,
 int starpu_mpi_isend_detached(starpu_data_handle_t data_handle,
-				int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
+			      int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 {
 {
         _STARPU_MPI_LOG_IN();
         _STARPU_MPI_LOG_IN();
 	_starpu_mpi_isend_common(data_handle, dest, mpi_tag, comm, 1, callback, arg);
 	_starpu_mpi_isend_common(data_handle, dest, mpi_tag, comm, 1, callback, arg);
@@ -153,26 +174,47 @@ int starpu_mpi_isend_detached(starpu_data_handle_t data_handle,
 	return 0;
 	return 0;
 }
 }
 
 
-/*
- *	Irecv
- */
-
-static void starpu_mpi_irecv_func(struct _starpu_mpi_req *req)
+int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm)
 {
 {
-	int count;
+	starpu_mpi_req req;
+	MPI_Status status;
 
 
         _STARPU_MPI_LOG_IN();
         _STARPU_MPI_LOG_IN();
+	memset(&status, 0, sizeof(MPI_Status));
 
 
-	req->needs_unpacking = starpu_mpi_handle_to_datatype(req->data_handle, &req->datatype, &count);
+	starpu_mpi_isend(data_handle, &req, dest, mpi_tag, comm);
+	starpu_mpi_wait(&req, &status);
+
+        _STARPU_MPI_LOG_OUT();
+	return 0;
+}
+
+/********************************************************/
+/*                                                      */
+/*  Receive functionalities                             */
+/*                                                      */
+/********************************************************/
+
+static void _starpu_mpi_irecv_func(struct _starpu_mpi_req *req)
+{
+        _STARPU_MPI_LOG_IN();
+
+	req->needs_unpacking = starpu_mpi_handle_to_datatype(req->data_handle, &req->datatype);
 	if (req->needs_unpacking == 1)
 	if (req->needs_unpacking == 1)
-		req->ptr = malloc(count);
+	{
+		req->count = starpu_handle_get_size(req->data_handle);
+		req->ptr = malloc(req->count);
+	}
 	else
 	else
+	{
+		req->count = 1;
 		req->ptr = starpu_handle_get_local_ptr(req->data_handle);
 		req->ptr = starpu_handle_get_local_ptr(req->data_handle);
+	}
 	STARPU_ASSERT(req->ptr);
 	STARPU_ASSERT(req->ptr);
 
 
 	_STARPU_MPI_DEBUG("post MPI irecv tag %d src %d data %p ptr %p req %p datatype %p\n", req->mpi_tag, req->srcdst, req->data_handle, req->ptr, &req->request, req->datatype);
 	_STARPU_MPI_DEBUG("post MPI irecv tag %d src %d data %p ptr %p req %p datatype %p\n", req->mpi_tag, req->srcdst, req->data_handle, req->ptr, &req->request, req->datatype);
 
 
-        req->ret = MPI_Irecv(req->ptr, count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
+        req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
         STARPU_ASSERT(req->ret == MPI_SUCCESS);
         STARPU_ASSERT(req->ret == MPI_SUCCESS);
 
 
 	/* somebody is perhaps waiting for the MPI request to be posted */
 	/* somebody is perhaps waiting for the MPI request to be posted */
@@ -185,37 +227,7 @@ static void starpu_mpi_irecv_func(struct _starpu_mpi_req *req)
 
 
 static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, unsigned detached, void (*callback)(void *), void *arg)
 static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, unsigned detached, void (*callback)(void *), void *arg)
 {
 {
-        _STARPU_MPI_LOG_IN();
-	struct _starpu_mpi_req *req = calloc(1, sizeof(struct _starpu_mpi_req));
-	STARPU_ASSERT(req);
-
-        INC_POSTED_REQUESTS(1);
-
-	/* Initialize the request structure */
-	req->submitted = 0;
-	_STARPU_PTHREAD_MUTEX_INIT(&req->req_mutex, NULL);
-	_STARPU_PTHREAD_COND_INIT(&req->req_cond, NULL);
-
-	req->request_type = RECV_REQ;
-
-	req->data_handle = data_handle;
-	req->srcdst = source;
-	req->mpi_tag = mpi_tag;
-	req->comm = comm;
-
-	req->detached = detached;
-	req->callback = callback;
-	req->callback_arg = arg;
-
-	req->func = starpu_mpi_irecv_func;
-
-	/* Asynchronously request StarPU to fetch the data in main memory: when
-	 * it is available in main memory, submit_mpi_req(req) is called and
-	 * the request is actually submitted  */
-	starpu_data_acquire_cb(data_handle, STARPU_W, submit_mpi_req, (void *)req);
-
-        _STARPU_MPI_LOG_OUT();
-	return req;
+	return _starpu_mpi_isend_irecv_common(data_handle, source, mpi_tag, comm, detached, callback, arg, RECV_REQ, _starpu_mpi_irecv_func, STARPU_W);
 }
 }
 
 
 int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int source, int mpi_tag, MPI_Comm comm)
 int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int source, int mpi_tag, MPI_Comm comm)
@@ -233,24 +245,14 @@ int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_re
 	return 0;
 	return 0;
 }
 }
 
 
-/*
- *	Irecv (detached)
- */
-
 int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 {
 {
         _STARPU_MPI_LOG_IN();
         _STARPU_MPI_LOG_IN();
 	_starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 1, callback, arg);
 	_starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 1, callback, arg);
-
         _STARPU_MPI_LOG_OUT();
         _STARPU_MPI_LOG_OUT();
 	return 0;
 	return 0;
 }
 }
 
 
-
-/*
- *	Recv
- */
-
 int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, MPI_Status *status)
 int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, MPI_Status *status)
 {
 {
 	starpu_mpi_req req;
 	starpu_mpi_req req;
@@ -263,30 +265,13 @@ int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, M
 	return 0;
 	return 0;
 }
 }
 
 
-/*
- *	Send
- */
-
-int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm)
-{
-	starpu_mpi_req req;
-	MPI_Status status;
-
-        _STARPU_MPI_LOG_IN();
-	memset(&status, 0, sizeof(MPI_Status));
-
-	starpu_mpi_isend(data_handle, &req, dest, mpi_tag, comm);
-	starpu_mpi_wait(&req, &status);
-
-        _STARPU_MPI_LOG_OUT();
-	return 0;
-}
-
-/*
- *	Wait
- */
+/********************************************************/
+/*                                                      */
+/*  Wait functionalities                                */
+/*                                                      */
+/********************************************************/
 
 
-static void starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
+static void _starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
 {
 {
         _STARPU_MPI_LOG_IN();
         _STARPU_MPI_LOG_IN();
 	/* Which is the mpi request we are waiting for ? */
 	/* Which is the mpi request we are waiting for ? */
@@ -295,7 +280,7 @@ static void starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
 	req->ret = MPI_Wait(&req->request, waiting_req->status);
 	req->ret = MPI_Wait(&req->request, waiting_req->status);
         STARPU_ASSERT(req->ret == MPI_SUCCESS);
         STARPU_ASSERT(req->ret == MPI_SUCCESS);
 
 
-	handle_request_termination(req);
+	_starpu_mpi_handle_request_termination(req);
         _STARPU_MPI_LOG_OUT();
         _STARPU_MPI_LOG_OUT();
 }
 }
 
 
@@ -307,7 +292,7 @@ int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 	STARPU_ASSERT(waiting_req);
 	STARPU_ASSERT(waiting_req);
 	struct _starpu_mpi_req *req = *public_req;
 	struct _starpu_mpi_req *req = *public_req;
 
 
-        INC_POSTED_REQUESTS(1);
+        _STARPU_MPI_INC_POSTED_REQUESTS(1);
 
 
 	/* We cannot try to complete a MPI request that was not actually posted
 	/* We cannot try to complete a MPI request that was not actually posted
 	 * to MPI yet. */
 	 * to MPI yet. */
@@ -321,10 +306,10 @@ int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 	_STARPU_PTHREAD_COND_INIT(&(waiting_req->req_cond), NULL);
 	_STARPU_PTHREAD_COND_INIT(&(waiting_req->req_cond), NULL);
 	waiting_req->status = status;
 	waiting_req->status = status;
 	waiting_req->other_request = req;
 	waiting_req->other_request = req;
-	waiting_req->func = starpu_mpi_wait_func;
+	waiting_req->func = _starpu_mpi_wait_func;
 	waiting_req->request_type = WAIT_REQ;
 	waiting_req->request_type = WAIT_REQ;
 
 
-	submit_mpi_req(waiting_req);
+	_starpu_mpi_submit_new_mpi_request(waiting_req);
 
 
 	/* We wait for the MPI request to finish */
 	/* We wait for the MPI request to finish */
 	_STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
 	_STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
@@ -343,24 +328,26 @@ int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 	return ret;
 	return ret;
 }
 }
 
 
-/*
- * 	Test
- */
+/********************************************************/
+/*                                                      */
+/*  Test functionalities                                */
+/*                                                      */
+/********************************************************/
 
 
-static void starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
+static void _starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 {
 {
         _STARPU_MPI_LOG_IN();
         _STARPU_MPI_LOG_IN();
 	/* Which is the mpi request we are testing for ? */
 	/* Which is the mpi request we are testing for ? */
 	struct _starpu_mpi_req *req = testing_req->other_request;
 	struct _starpu_mpi_req *req = testing_req->other_request;
 
 
-        _STARPU_MPI_DEBUG("Test request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, (req->request_type == RECV_REQ)?"recv : source":"send : dest", req->srcdst);
+        _STARPU_MPI_DEBUG("Test request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, _starpu_mpi_request_type(req->request_type), req->srcdst);
 	req->ret = MPI_Test(&req->request, testing_req->flag, testing_req->status);
 	req->ret = MPI_Test(&req->request, testing_req->flag, testing_req->status);
         STARPU_ASSERT(req->ret == MPI_SUCCESS);
         STARPU_ASSERT(req->ret == MPI_SUCCESS);
 
 
 	if (*testing_req->flag)
 	if (*testing_req->flag)
 	{
 	{
 		testing_req->ret = req->ret;
 		testing_req->ret = req->ret;
-		handle_request_termination(req);
+		_starpu_mpi_handle_request_termination(req);
 	}
 	}
 
 
 	_STARPU_PTHREAD_MUTEX_LOCK(&testing_req->req_mutex);
 	_STARPU_PTHREAD_MUTEX_LOCK(&testing_req->req_mutex);
@@ -397,12 +384,12 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 		testing_req->flag = flag;
 		testing_req->flag = flag;
 		testing_req->status = status;
 		testing_req->status = status;
 		testing_req->other_request = req;
 		testing_req->other_request = req;
-		testing_req->func = starpu_mpi_test_func;
+		testing_req->func = _starpu_mpi_test_func;
 		testing_req->completed = 0;
 		testing_req->completed = 0;
                 testing_req->request_type = TEST_REQ;
                 testing_req->request_type = TEST_REQ;
 
 
-                INC_POSTED_REQUESTS(1);
-                submit_mpi_req(testing_req);
+                _STARPU_MPI_INC_POSTED_REQUESTS(1);
+                _starpu_mpi_submit_new_mpi_request(testing_req);
 
 
 		/* We wait for the test request to finish */
 		/* We wait for the test request to finish */
 		_STARPU_PTHREAD_MUTEX_LOCK(&(testing_req->req_mutex));
 		_STARPU_PTHREAD_MUTEX_LOCK(&(testing_req->req_mutex));
@@ -421,7 +408,8 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 			free(req);
 			free(req);
 		}
 		}
 	}
 	}
-	else {
+	else
+	{
 		*flag = 0;
 		*flag = 0;
 	}
 	}
 
 
@@ -429,18 +417,20 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 	return ret;
 	return ret;
 }
 }
 
 
-/*
- *	Barrier
- */
+/********************************************************/
+/*                                                      */
+/*  Barrier functionalities                             */
+/*                                                      */
+/********************************************************/
 
 
-static void starpu_mpi_barrier_func(struct _starpu_mpi_req *barrier_req)
+static void _starpu_mpi_barrier_func(struct _starpu_mpi_req *barrier_req)
 {
 {
         _STARPU_MPI_LOG_IN();
         _STARPU_MPI_LOG_IN();
 
 
 	barrier_req->ret = MPI_Barrier(barrier_req->comm);
 	barrier_req->ret = MPI_Barrier(barrier_req->comm);
         STARPU_ASSERT(barrier_req->ret == MPI_SUCCESS);
         STARPU_ASSERT(barrier_req->ret == MPI_SUCCESS);
 
 
-	handle_request_termination(barrier_req);
+	_starpu_mpi_handle_request_termination(barrier_req);
         _STARPU_MPI_LOG_OUT();
         _STARPU_MPI_LOG_OUT();
 }
 }
 
 
@@ -457,7 +447,8 @@ int starpu_mpi_barrier(MPI_Comm comm)
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	STARPU_ASSERT_MSG(!barrier_running, "Concurrent starpu_mpi_barrier is not implemented, even on different communicators");
 	STARPU_ASSERT_MSG(!barrier_running, "Concurrent starpu_mpi_barrier is not implemented, even on different communicators");
 	barrier_running = 1;
 	barrier_running = 1;
-	do {
+	do
+	{
 		while (posted_requests)
 		while (posted_requests)
 			/* Wait for all current MPI requests to finish */
 			/* Wait for all current MPI requests to finish */
 			_STARPU_PTHREAD_COND_WAIT(&cond_finished, &mutex);
 			_STARPU_PTHREAD_COND_WAIT(&cond_finished, &mutex);
@@ -477,12 +468,12 @@ int starpu_mpi_barrier(MPI_Comm comm)
 	/* Initialize the request structure */
 	/* Initialize the request structure */
 	_STARPU_PTHREAD_MUTEX_INIT(&(barrier_req->req_mutex), NULL);
 	_STARPU_PTHREAD_MUTEX_INIT(&(barrier_req->req_mutex), NULL);
 	_STARPU_PTHREAD_COND_INIT(&(barrier_req->req_cond), NULL);
 	_STARPU_PTHREAD_COND_INIT(&(barrier_req->req_cond), NULL);
-	barrier_req->func = starpu_mpi_barrier_func;
+	barrier_req->func = _starpu_mpi_barrier_func;
 	barrier_req->request_type = BARRIER_REQ;
 	barrier_req->request_type = BARRIER_REQ;
 	barrier_req->comm = comm;
 	barrier_req->comm = comm;
 
 
-        INC_POSTED_REQUESTS(1);
-	submit_mpi_req(barrier_req);
+        _STARPU_MPI_INC_POSTED_REQUESTS(1);
+	_starpu_mpi_submit_new_mpi_request(barrier_req);
 
 
 	/* We wait for the MPI request to finish */
 	/* We wait for the MPI request to finish */
 	_STARPU_PTHREAD_MUTEX_LOCK(&barrier_req->req_mutex);
 	_STARPU_PTHREAD_MUTEX_LOCK(&barrier_req->req_mutex);
@@ -497,31 +488,34 @@ int starpu_mpi_barrier(MPI_Comm comm)
 	return ret;
 	return ret;
 }
 }
 
 
-/*
- *	Requests
- */
+/********************************************************/
+/*                                                      */
+/*  Progression                                         */
+/*                                                      */
+/********************************************************/
 
 
 #ifdef STARPU_MPI_VERBOSE
 #ifdef STARPU_MPI_VERBOSE
-static char *starpu_mpi_request_type(unsigned request_type)
+static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type)
 {
 {
         switch (request_type)
         switch (request_type)
                 {
                 {
-                case SEND_REQ: return "send";
-                case RECV_REQ: return "recv";
-                case WAIT_REQ: return "wait";
-                case TEST_REQ: return "test";
-                case BARRIER_REQ: return "barrier";
+                case SEND_REQ: return "SEND_REQ";
+                case RECV_REQ: return "RECV_REQ";
+                case WAIT_REQ: return "WAIT_REQ";
+                case TEST_REQ: return "TEST_REQ";
+                case BARRIER_REQ: return "BARRIER_REQ";
                 default: return "unknown request type";
                 default: return "unknown request type";
                 }
                 }
 }
 }
 #endif
 #endif
 
 
-static void handle_request_termination(struct _starpu_mpi_req *req)
+static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 {
 {
         _STARPU_MPI_LOG_IN();
         _STARPU_MPI_LOG_IN();
 
 
-	_STARPU_MPI_DEBUG("complete MPI (%s %d) data %p req %p - tag %d\n", starpu_mpi_request_type(req->request_type), req->srcdst, req->data_handle, &req->request, req->mpi_tag);
-        if (req->request_type != BARRIER_REQ) {
+	_STARPU_MPI_DEBUG("complete MPI (%s %d) data %p req %p - tag %d\n", _starpu_mpi_request_type(req->request_type), req->srcdst, req->data_handle, &req->request, req->mpi_tag);
+        if (req->request_type != BARRIER_REQ)
+	{
 		if (req->needs_unpacking)
 		if (req->needs_unpacking)
 			starpu_handle_unpack_data(req->data_handle, req->ptr);
 			starpu_handle_unpack_data(req->data_handle, req->ptr);
 		else
 		else
@@ -547,28 +541,24 @@ static void handle_request_termination(struct _starpu_mpi_req *req)
         _STARPU_MPI_LOG_OUT();
         _STARPU_MPI_LOG_OUT();
 }
 }
 
 
-static void submit_mpi_req(void *arg)
+static void _starpu_mpi_submit_new_mpi_request(void *arg)
 {
 {
         _STARPU_MPI_LOG_IN();
         _STARPU_MPI_LOG_IN();
 	struct _starpu_mpi_req *req = arg;
 	struct _starpu_mpi_req *req = arg;
 
 
-        INC_POSTED_REQUESTS(-1);
+        _STARPU_MPI_INC_POSTED_REQUESTS(-1);
 
 
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	_starpu_mpi_req_list_push_front(new_requests, req);
 	_starpu_mpi_req_list_push_front(new_requests, req);
 	newer_requests = 1;
 	newer_requests = 1;
-        _STARPU_MPI_DEBUG("Pushing new request type %d\n", req->request_type);
+        _STARPU_MPI_DEBUG("Pushing new request type %s\n", _starpu_mpi_request_type(req->request_type));
 	_STARPU_PTHREAD_COND_BROADCAST(&cond_progression);
 	_STARPU_PTHREAD_COND_BROADCAST(&cond_progression);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
         _STARPU_MPI_LOG_OUT();
         _STARPU_MPI_LOG_OUT();
 }
 }
 
 
-/*
- *	Scheduler hook
- */
-
 #ifdef USE_STARPU_ACTIVITY
 #ifdef USE_STARPU_ACTIVITY
-static unsigned progression_hook_func(void *arg __attribute__((unused)))
+static unsigned _starpu_mpi_progression_hook_func(void *arg __attribute__((unused)))
 {
 {
 	unsigned may_block = 1;
 	unsigned may_block = 1;
 
 
@@ -584,11 +574,7 @@ static unsigned progression_hook_func(void *arg __attribute__((unused)))
 }
 }
 #endif
 #endif
 
 
-/*
- *	Progression loop
- */
-
-static void test_detached_requests(void)
+static void _starpu_mpi_test_detached_requests(void)
 {
 {
         _STARPU_MPI_LOG_IN();
         _STARPU_MPI_LOG_IN();
 	int flag;
 	int flag;
@@ -605,13 +591,13 @@ static void test_detached_requests(void)
 
 
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 
 
-                //_STARPU_MPI_DEBUG("Test detached request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, (req->request_type == RECV_REQ)?"recv : source":"send : dest", req->srcdst);
+                //_STARPU_MPI_DEBUG("Test detached request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, _starpu_mpi_request_type(req->request_type), req->srcdst);
 		req->ret = MPI_Test(&req->request, &flag, &status);
 		req->ret = MPI_Test(&req->request, &flag, &status);
 		STARPU_ASSERT(req->ret == MPI_SUCCESS);
 		STARPU_ASSERT(req->ret == MPI_SUCCESS);
 
 
 		if (flag)
 		if (flag)
 		{
 		{
-			handle_request_termination(req);
+			_starpu_mpi_handle_request_termination(req);
 		}
 		}
 
 
 		_STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
 		_STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
@@ -631,13 +617,13 @@ static void test_detached_requests(void)
         _STARPU_MPI_LOG_OUT();
         _STARPU_MPI_LOG_OUT();
 }
 }
 
 
-static void handle_new_request(struct _starpu_mpi_req *req)
+static void _starpu_mpi_handle_new_request(struct _starpu_mpi_req *req)
 {
 {
         _STARPU_MPI_LOG_IN();
         _STARPU_MPI_LOG_IN();
 	STARPU_ASSERT(req);
 	STARPU_ASSERT(req);
 
 
 	/* submit the request to MPI */
 	/* submit the request to MPI */
-        _STARPU_MPI_DEBUG("Handling new request type %d\n", req->request_type);
+        _STARPU_MPI_DEBUG("Handling new request type %s\n", _starpu_mpi_request_type(req->request_type));
 	req->func(req);
 	req->func(req);
 
 
 	if (req->detached)
 	if (req->detached)
@@ -657,29 +643,57 @@ static void handle_new_request(struct _starpu_mpi_req *req)
         _STARPU_MPI_LOG_OUT();
         _STARPU_MPI_LOG_OUT();
 }
 }
 
 
-static void *progress_thread_func(void *arg)
+struct _starpu_mpi_argc_argv
 {
 {
-        int initialize_mpi = *((int *) arg);
+	int *argc;
+	char ***argv;
+};
 
 
-        _STARPU_DEBUG("Initialize mpi: %d\n", initialize_mpi);
+static void _starpu_mpi_print_thread_level_support(int thread_level, char *msg)
+{
+     switch (thread_level)
+     {
+     case MPI_THREAD_SERIALIZED:
+     {
+	  _STARPU_DISP("MPI%s MPI_THREAD_SERIALIZED; Multiple threads may make MPI calls, but only one at a time.\n", msg);
+	  break;
+     }
+     case MPI_THREAD_FUNNELED:
+     {
+	  _STARPU_DISP("MPI%s MPI_THREAD_FUNNELED; The application can safely make calls to StarPU-MPI functions, but should not call directly MPI communication functions.\n", msg);
+	  break;
+     }
+     case MPI_THREAD_SINGLE:
+     {
+	  _STARPU_DISP("MPI%s MPI_THREAD_SINGLE; MPI does not have multi-thread support, this might cause problems. The application can make calls to StarPU-MPI functions, but not call directly MPI Communication functions.\n", msg);
+	  break;
+     }
+     }
+}
 
 
-        if (initialize_mpi) {
-#ifdef STARPU_DEVEL
-#warning get real argc and argv from the application
-#endif
-                int argc = 0;
-                char **argv = NULL;
-                int thread_support;
+static void *_starpu_mpi_progress_thread_func(void *arg)
+{
+	struct _starpu_mpi_argc_argv *argc_argv = (struct _starpu_mpi_argc_argv *) arg;
+	int flag;
+
+	MPI_Initialized(&flag);
+	_STARPU_DEBUG("MPI_Initialized %d\n", flag);
+	if (flag == 0)
+	{
+		int thread_support;
                 _STARPU_DEBUG("Calling MPI_Init_thread\n");
                 _STARPU_DEBUG("Calling MPI_Init_thread\n");
-                if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS) {
-                        fprintf(stderr,"MPI_Init_thread failed\n");
-                        exit(1);
+		if (MPI_Init_thread(argc_argv->argc, argc_argv->argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS)
+		{
+			_STARPU_ERROR("MPI_Init_thread failed\n");
                 }
                 }
-                if (thread_support == MPI_THREAD_FUNNELED)
-                        fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
-                if (thread_support < MPI_THREAD_FUNNELED)
-                        fprintf(stderr,"Warning: MPI does not have thread support!\n");
+		_starpu_mpi_print_thread_level_support(thread_support, "_Init_thread level =");
         }
         }
+	else
+	{
+	     int provided;
+	     MPI_Query_thread(&provided);
+	     _starpu_mpi_print_thread_level_support(provided, " has been initialized with");
+	}
 
 
 	/* notify the main thread that the progression thread is ready */
 	/* notify the main thread that the progression thread is ready */
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
@@ -688,7 +702,8 @@ static void *progress_thread_func(void *arg)
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
 
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
-	while (running || posted_requests || !(_starpu_mpi_req_list_empty(new_requests)) || !(_starpu_mpi_req_list_empty(detached_requests))) {
+	while (running || posted_requests || !(_starpu_mpi_req_list_empty(new_requests)) || !(_starpu_mpi_req_list_empty(detached_requests)))
+	{
 		/* shall we block ? */
 		/* shall we block ? */
 		unsigned block = _starpu_mpi_req_list_empty(new_requests);
 		unsigned block = _starpu_mpi_req_list_empty(new_requests);
 
 
@@ -707,7 +722,7 @@ static void *progress_thread_func(void *arg)
 
 
 		/* test whether there are some terminated "detached request" */
 		/* test whether there are some terminated "detached request" */
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-		test_detached_requests();
+		_starpu_mpi_test_detached_requests();
 		_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 		_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
 
 		/* get one request */
 		/* get one request */
@@ -721,7 +736,7 @@ static void *progress_thread_func(void *arg)
 			 * application submit requests in the meantime, so we
 			 * application submit requests in the meantime, so we
 			 * release the lock.  */
 			 * release the lock.  */
 			_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 			_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-			handle_new_request(req);
+			_starpu_mpi_handle_new_request(req);
 			_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 			_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 		}
 		}
 	}
 	}
@@ -730,19 +745,23 @@ static void *progress_thread_func(void *arg)
 	STARPU_ASSERT(_starpu_mpi_req_list_empty(new_requests));
 	STARPU_ASSERT(_starpu_mpi_req_list_empty(new_requests));
         STARPU_ASSERT(posted_requests == 0);
         STARPU_ASSERT(posted_requests == 0);
 
 
-        if (initialize_mpi) {
+        if (flag == 0)
+	{
                 _STARPU_MPI_DEBUG("Calling MPI_Finalize()\n");
                 _STARPU_MPI_DEBUG("Calling MPI_Finalize()\n");
                 MPI_Finalize();
                 MPI_Finalize();
         }
         }
 
 
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
 
+	free(argc_argv);
 	return NULL;
 	return NULL;
 }
 }
 
 
-/*
- *	(De)Initialization methods
- */
+/********************************************************/
+/*                                                      */
+/*  (De)Initialization methods                          */
+/*                                                      */
+/********************************************************/
 
 
 #ifdef USE_STARPU_ACTIVITY
 #ifdef USE_STARPU_ACTIVITY
 static int hookid = - 1;
 static int hookid = - 1;
@@ -780,7 +799,7 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 }
 }
 
 
 static
 static
-int _starpu_mpi_initialize(int initialize_mpi, int *rank, int *world_size)
+int _starpu_mpi_initialize(int *argc, char ***argv)
 {
 {
 	_STARPU_PTHREAD_MUTEX_INIT(&mutex, NULL);
 	_STARPU_PTHREAD_MUTEX_INIT(&mutex, NULL);
 	_STARPU_PTHREAD_COND_INIT(&cond_progression, NULL);
 	_STARPU_PTHREAD_COND_INIT(&cond_progression, NULL);
@@ -792,20 +811,16 @@ int _starpu_mpi_initialize(int initialize_mpi, int *rank, int *world_size)
 
 
         _STARPU_PTHREAD_MUTEX_INIT(&mutex_posted_requests, NULL);
         _STARPU_PTHREAD_MUTEX_INIT(&mutex_posted_requests, NULL);
 
 
-	_STARPU_PTHREAD_CREATE(&progress_thread, NULL,
-			       progress_thread_func, (void *)&initialize_mpi);
+	struct _starpu_mpi_argc_argv *argc_argv = malloc(sizeof(struct _starpu_mpi_argc_argv));
+	argc_argv->argc = argc;
+	argc_argv->argv = argv;
+	_STARPU_PTHREAD_CREATE(&progress_thread, NULL, _starpu_mpi_progress_thread_func, argc_argv);
 
 
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	while (!running)
 	while (!running)
 		_STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
 		_STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
 
-        if (rank && world_size) {
-                _STARPU_DEBUG("Calling MPI_Comm_rank\n");
-                MPI_Comm_rank(MPI_COMM_WORLD, rank);
-                MPI_Comm_size(MPI_COMM_WORLD, world_size);
-        }
-
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
 	int prank;
 	int prank;
 	MPI_Comm_rank(MPI_COMM_WORLD, &prank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &prank);
@@ -823,14 +838,28 @@ int _starpu_mpi_initialize(int initialize_mpi, int *rank, int *world_size)
 	return 0;
 	return 0;
 }
 }
 
 
+int starpu_mpi_init(int *argc, char ***argv)
+{
+        return _starpu_mpi_initialize(argc, argv);
+}
+
 int starpu_mpi_initialize(void)
 int starpu_mpi_initialize(void)
 {
 {
-        return _starpu_mpi_initialize(0, NULL, NULL);
+        return _starpu_mpi_initialize(NULL, NULL);
 }
 }
 
 
 int starpu_mpi_initialize_extended(int *rank, int *world_size)
 int starpu_mpi_initialize_extended(int *rank, int *world_size)
 {
 {
-        return _starpu_mpi_initialize(1, rank, world_size);
+	int ret;
+
+        ret = _starpu_mpi_initialize(NULL, NULL);
+	if (ret == 0)
+	{
+		_STARPU_DEBUG("Calling MPI_Comm_rank\n");
+		MPI_Comm_rank(MPI_COMM_WORLD, rank);
+		MPI_Comm_size(MPI_COMM_WORLD, world_size);
+	}
+	return ret;
 }
 }
 
 
 int starpu_mpi_shutdown(void)
 int starpu_mpi_shutdown(void)

+ 104 - 7
mpi/src/starpu_mpi_collective.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,14 +17,72 @@
 #include <mpi.h>
 #include <mpi.h>
 #include <starpu.h>
 #include <starpu.h>
 #include <starpu_mpi.h>
 #include <starpu_mpi.h>
+#include <starpu_mpi_private.h>
 
 
-int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm)
+struct _callback_arg
+{
+	void (*callback)(void *);
+	void *arg;
+	int nb;
+	int count;
+};
+
+void _callback_collective(void *arg)
+{
+	struct _callback_arg *callback_arg = arg;
+	callback_arg->nb ++;
+	if (callback_arg->nb == callback_arg->count)
+	{
+		callback_arg->callback(callback_arg->arg);
+	}
+}
+
+int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
 {
 {
 	int rank;
 	int rank;
 	int x;
 	int x;
+	struct _callback_arg *callback_arg;
+	void (*callback_func)(void *);
 
 
 	MPI_Comm_rank(comm, &rank);
 	MPI_Comm_rank(comm, &rank);
 
 
+#ifdef STARPU_DEVEL
+#warning callback_arg needs to be free-ed
+#endif
+	callback_func = _callback_collective;
+	callback_arg = malloc(sizeof(struct _callback_arg));
+	callback_arg->count = 0;
+	callback_arg->nb = 0;
+	callback_arg->callback = (rank == root) ? scallback : rcallback;
+	callback_arg->arg = (rank == root) ? sarg : rarg;
+	if (callback_arg->callback == NULL)
+	{
+		free(callback_arg);
+		callback_arg = NULL;
+		callback_func = NULL;
+	}
+
+	if (callback_arg)
+	{
+		for(x = 0; x < count ;  x++)
+		{
+			if (data_handles[x])
+			{
+				int owner = starpu_data_get_rank(data_handles[x]);
+				int mpi_tag = starpu_data_get_tag(data_handles[x]);
+				STARPU_ASSERT(mpi_tag >= 0);
+				if ((rank == root) && (owner != root))
+				{
+					callback_arg->count ++;
+				}
+				if ((rank != root) && (owner == rank))
+				{
+					callback_arg->count ++;
+				}
+			}
+		}
+	}
+
 	for(x = 0; x < count ;  x++)
 	for(x = 0; x < count ;  x++)
 	{
 	{
 		if (data_handles[x])
 		if (data_handles[x])
@@ -35,25 +93,64 @@ int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, i
 			if ((rank == root) && (owner != root))
 			if ((rank == root) && (owner != root))
 			{
 			{
 				//fprintf(stderr, "[%d] Sending data[%d] to %d\n", rank, x, owner);
 				//fprintf(stderr, "[%d] Sending data[%d] to %d\n", rank, x, owner);
-				starpu_mpi_isend_detached(data_handles[x], owner, mpi_tag, comm, NULL, NULL);
+				starpu_mpi_isend_detached(data_handles[x], owner, mpi_tag, comm, callback_func, callback_arg);
 			}
 			}
 			if ((rank != root) && (owner == rank))
 			if ((rank != root) && (owner == rank))
 			{
 			{
 				//fprintf(stderr, "[%d] Receiving data[%d] from %d\n", rank, x, root);
 				//fprintf(stderr, "[%d] Receiving data[%d] from %d\n", rank, x, root);
-				starpu_mpi_irecv_detached(data_handles[x], root, mpi_tag, comm, NULL, NULL);
+				starpu_mpi_irecv_detached(data_handles[x], root, mpi_tag, comm, callback_func, callback_arg);
 			}
 			}
 		}
 		}
 	}
 	}
 	return 0;
 	return 0;
 }
 }
 
 
-int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm)
+int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
 {
 {
 	int rank;
 	int rank;
 	int x;
 	int x;
+	struct _callback_arg *callback_arg;
+	void (*callback_func)(void *);
 
 
 	MPI_Comm_rank(comm, &rank);
 	MPI_Comm_rank(comm, &rank);
 
 
+#ifdef STARPU_DEVEL
+#warning callback_arg needs to be free-ed
+#endif
+	callback_func = _callback_collective;
+	callback_arg = malloc(sizeof(struct _callback_arg));
+	callback_arg->count = 0;
+	callback_arg->nb = 0;
+	callback_arg->callback = (rank == root) ? scallback : rcallback;
+	callback_arg->arg = (rank == root) ? sarg : rarg;
+	if (callback_arg->callback == NULL)
+	{
+		free(callback_arg);
+		callback_arg = NULL;
+		callback_func = NULL;
+	}
+
+	if (callback_arg)
+	{
+		for(x = 0; x < count ;  x++)
+		{
+			if (data_handles[x])
+			{
+				int owner = starpu_data_get_rank(data_handles[x]);
+				int mpi_tag = starpu_data_get_tag(data_handles[x]);
+				STARPU_ASSERT(mpi_tag >= 0);
+				if ((rank == root) && (owner != root))
+				{
+					callback_arg->count ++;
+				}
+				if ((rank != root) && (owner == rank))
+				{
+					callback_arg->count ++;
+				}
+			}
+		}
+	}
+
 	for(x = 0; x < count ;  x++)
 	for(x = 0; x < count ;  x++)
 	{
 	{
 		if (data_handles[x])
 		if (data_handles[x])
@@ -64,12 +161,12 @@ int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, in
 			if ((rank == root) && (owner != root))
 			if ((rank == root) && (owner != root))
 			{
 			{
 				//fprintf(stderr, "[%d] Receiving data[%d] from %d\n", rank, x, owner);
 				//fprintf(stderr, "[%d] Receiving data[%d] from %d\n", rank, x, owner);
-				starpu_mpi_irecv_detached(data_handles[x], owner, mpi_tag, comm, NULL, NULL);
+				starpu_mpi_irecv_detached(data_handles[x], owner, mpi_tag, comm, callback_func, callback_arg);
 			}
 			}
 			if ((rank != root) && (owner == rank))
 			if ((rank != root) && (owner == rank))
 			{
 			{
 				//fprintf(stderr, "[%d] Sending data[%d] to %d\n", rank, x, root);
 				//fprintf(stderr, "[%d] Sending data[%d] to %d\n", rank, x, root);
-				starpu_mpi_isend_detached(data_handles[x], root, mpi_tag, comm, NULL, NULL);
+				starpu_mpi_isend_detached(data_handles[x], root, mpi_tag, comm, callback_func, callback_arg);
 			}
 			}
 		}
 		}
 	}
 	}

+ 1 - 3
mpi/src/starpu_mpi_datatype.c

@@ -127,7 +127,7 @@ static handle_to_datatype_func handle_to_datatype_funcs[STARPU_MAX_INTERFACE_ID]
 	[STARPU_MULTIFORMAT_INTERFACE_ID] = NULL,
 	[STARPU_MULTIFORMAT_INTERFACE_ID] = NULL,
 };
 };
 
 
-int starpu_mpi_handle_to_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype, int *count)
+int starpu_mpi_handle_to_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
 {
 {
 	enum starpu_data_interface_id id = starpu_handle_get_interface_id(data_handle);
 	enum starpu_data_interface_id id = starpu_handle_get_interface_id(data_handle);
 
 
@@ -136,13 +136,11 @@ int starpu_mpi_handle_to_datatype(starpu_data_handle_t data_handle, MPI_Datatype
 		handle_to_datatype_func func = handle_to_datatype_funcs[id];
 		handle_to_datatype_func func = handle_to_datatype_funcs[id];
 		STARPU_ASSERT(func);
 		STARPU_ASSERT(func);
 		func(data_handle, datatype);
 		func(data_handle, datatype);
-		*count = 1;
 		return 0;
 		return 0;
 	}
 	}
 	else
 	else
 	{
 	{
 		/* The datatype is not predefined by StarPU */
 		/* The datatype is not predefined by StarPU */
-		*count = starpu_handle_get_size(data_handle);
 		*datatype = MPI_BYTE;
 		*datatype = MPI_BYTE;
 		return 1;
 		return 1;
 	}
 	}

+ 1 - 1
mpi/src/starpu_mpi_datatype.h

@@ -24,7 +24,7 @@
 extern "C" {
 extern "C" {
 #endif
 #endif
 
 
-int starpu_mpi_handle_to_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype, int *count);
+int starpu_mpi_handle_to_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype);
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }

+ 5 - 4
mpi/src/starpu_mpi_helper.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -31,7 +31,7 @@ int starpu_mpi_isend_detached_unlock_tag(starpu_data_handle_t data_handle,
 {
 {
 	starpu_tag_t *tagptr = malloc(sizeof(starpu_tag_t));
 	starpu_tag_t *tagptr = malloc(sizeof(starpu_tag_t));
 	*tagptr = tag;
 	*tagptr = tag;
-	
+
 	return starpu_mpi_isend_detached(data_handle, dest, mpi_tag, comm,
 	return starpu_mpi_isend_detached(data_handle, dest, mpi_tag, comm,
 						starpu_mpi_unlock_tag_callback, tagptr);
 						starpu_mpi_unlock_tag_callback, tagptr);
 }
 }
@@ -41,12 +41,13 @@ int starpu_mpi_irecv_detached_unlock_tag(starpu_data_handle_t data_handle, int s
 {
 {
 	starpu_tag_t *tagptr = malloc(sizeof(starpu_tag_t));
 	starpu_tag_t *tagptr = malloc(sizeof(starpu_tag_t));
 	*tagptr = tag;
 	*tagptr = tag;
-	
+
 	return starpu_mpi_irecv_detached(data_handle, source, mpi_tag, comm,
 	return starpu_mpi_irecv_detached(data_handle, source, mpi_tag, comm,
 						starpu_mpi_unlock_tag_callback, tagptr);
 						starpu_mpi_unlock_tag_callback, tagptr);
 }
 }
 
 
-struct arg_array {
+struct arg_array
+{
 	int array_size;
 	int array_size;
 	starpu_tag_t tag;
 	starpu_tag_t tag;
 };
 };

+ 230 - 132
mpi/src/starpu_mpi_insert_task.c

@@ -28,7 +28,6 @@
 //#define STARPU_MPI_VERBOSE 1
 //#define STARPU_MPI_VERBOSE 1
 #include <starpu_mpi_private.h>
 #include <starpu_mpi_private.h>
 
 
-#ifdef STARPU_MPI_CACHE
 /* Whether we are allowed to keep copies of remote data. */
 /* Whether we are allowed to keep copies of remote data. */
 struct _starpu_data_entry
 struct _starpu_data_entry
 {
 {
@@ -36,32 +35,41 @@ struct _starpu_data_entry
 	void *data;
 	void *data;
 };
 };
 
 
-struct _starpu_data_entry **sent_data = NULL;
-struct _starpu_data_entry **received_data = NULL;
-#endif /* STARPU_MPI_CACHE */
+static struct _starpu_data_entry **sent_data = NULL;
+static struct _starpu_data_entry **received_data = NULL;
+static int cache_enabled=1;
 
 
 void _starpu_mpi_tables_init(MPI_Comm comm)
 void _starpu_mpi_tables_init(MPI_Comm comm)
 {
 {
-#ifdef STARPU_MPI_CACHE
 	int nb_nodes;
 	int nb_nodes;
 	int i;
 	int i;
 
 
+	cache_enabled = starpu_get_env_number("STARPU_MPI_CACHE");
+	if (cache_enabled == -1)
+	{
+		cache_enabled = 1;
+	}
+
+	if (cache_enabled == 0)
+	{
+		if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU MPI Communication cache is disabled\n");
+		return;
+	}
+
 	MPI_Comm_size(comm, &nb_nodes);
 	MPI_Comm_size(comm, &nb_nodes);
 	_STARPU_MPI_DEBUG("Initialising htable for cache\n");
 	_STARPU_MPI_DEBUG("Initialising htable for cache\n");
 	sent_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
 	sent_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
 	for(i=0 ; i<nb_nodes ; i++) sent_data[i] = NULL;
 	for(i=0 ; i<nb_nodes ; i++) sent_data[i] = NULL;
 	received_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
 	received_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
 	for(i=0 ; i<nb_nodes ; i++) received_data[i] = NULL;
 	for(i=0 ; i<nb_nodes ; i++) received_data[i] = NULL;
-#else
-	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --disable-mpi-cache\n");
-#endif /* STARPU_MPI_CACHE */
 }
 }
 
 
 void _starpu_mpi_tables_free(int world_size)
 void _starpu_mpi_tables_free(int world_size)
 {
 {
-#ifdef STARPU_MPI_CACHE
 	int i;
 	int i;
 
 
+	if (cache_enabled == 0) return;
+
 	_STARPU_MPI_DEBUG("Clearing htable for cache\n");
 	_STARPU_MPI_DEBUG("Clearing htable for cache\n");
 
 
 	for(i=0 ; i<world_size ; i++)
 	for(i=0 ; i<world_size ; i++)
@@ -80,13 +88,13 @@ void _starpu_mpi_tables_free(int world_size)
 	}
 	}
 	free(sent_data);
 	free(sent_data);
 	free(received_data);
 	free(received_data);
-#endif
 }
 }
 
 
 static
 static
 int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int *do_execute, int *inconsistent_execute, int *dest, size_t *size_on_nodes)
 int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int *do_execute, int *inconsistent_execute, int *dest, size_t *size_on_nodes)
 {
 {
-	if (data && mode & STARPU_R) {
+	if (data && mode & STARPU_R)
+	{
 		struct starpu_data_interface_ops *ops;
 		struct starpu_data_interface_ops *ops;
 		int rank = starpu_data_get_rank(data);
 		int rank = starpu_data_get_rank(data);
 
 
@@ -94,8 +102,10 @@ int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_access
 		size_on_nodes[rank] += ops->get_size(data);
 		size_on_nodes[rank] += ops->get_size(data);
 	}
 	}
 
 
-	if (mode & STARPU_W) {
-		if (!data) {
+	if (mode & STARPU_W)
+	{
+		if (!data)
+		{
 			/* We don't have anything allocated for this.
 			/* We don't have anything allocated for this.
 			 * The application knows we won't do anything
 			 * The application knows we won't do anything
 			 * about this task */
 			 * about this task */
@@ -107,25 +117,32 @@ int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_access
 			return -EINVAL;
 			return -EINVAL;
 		}
 		}
 		int mpi_rank = starpu_data_get_rank(data);
 		int mpi_rank = starpu_data_get_rank(data);
-		if (mpi_rank == me) {
-			if (*do_execute == 0) {
+		if (mpi_rank == me)
+		{
+			if (*do_execute == 0)
+			{
 				*inconsistent_execute = 1;
 				*inconsistent_execute = 1;
 			}
 			}
-			else {
+			else
+			{
 				*do_execute = 1;
 				*do_execute = 1;
 			}
 			}
 		}
 		}
-		else if (mpi_rank != -1) {
-			if (*do_execute == 1) {
+		else if (mpi_rank != -1)
+		{
+			if (*do_execute == 1)
+			{
 				*inconsistent_execute = 1;
 				*inconsistent_execute = 1;
 			}
 			}
-			else {
+			else
+			{
 				*do_execute = 0;
 				*do_execute = 0;
 				*dest = mpi_rank;
 				*dest = mpi_rank;
 				/* That's the rank which needs the data to be sent to */
 				/* That's the rank which needs the data to be sent to */
 			}
 			}
 		}
 		}
-		else {
+		else
+		{
 			_STARPU_ERROR("rank invalid\n");
 			_STARPU_ERROR("rank invalid\n");
 		}
 		}
 	}
 	}
@@ -135,60 +152,64 @@ int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_access
 static
 static
 void *_starpu_mpi_already_received(starpu_data_handle_t data, int mpi_rank)
 void *_starpu_mpi_already_received(starpu_data_handle_t data, int mpi_rank)
 {
 {
-#ifdef STARPU_MPI_CACHE
+	if (cache_enabled == 0) return NULL;
+
 	struct _starpu_data_entry *already_received;
 	struct _starpu_data_entry *already_received;
 	HASH_FIND_PTR(received_data[mpi_rank], &data, already_received);
 	HASH_FIND_PTR(received_data[mpi_rank], &data, already_received);
-	if (already_received == NULL) {
+	if (already_received == NULL)
+	{
 		struct _starpu_data_entry *entry = (struct _starpu_data_entry *)malloc(sizeof(*entry));
 		struct _starpu_data_entry *entry = (struct _starpu_data_entry *)malloc(sizeof(*entry));
 		entry->data = data;
 		entry->data = data;
 		HASH_ADD_PTR(received_data[mpi_rank], data, entry);
 		HASH_ADD_PTR(received_data[mpi_rank], data, entry);
 	}
 	}
-	else {
+	else
+	{
 		_STARPU_MPI_DEBUG("Do not receive data %p from node %d as it is already available\n", data, mpi_rank);
 		_STARPU_MPI_DEBUG("Do not receive data %p from node %d as it is already available\n", data, mpi_rank);
 	}
 	}
 	return already_received;
 	return already_received;
-#else
-	return NULL;
-#endif
 }
 }
 
 
 static
 static
 void *_starpu_mpi_already_sent(starpu_data_handle_t data, int dest)
 void *_starpu_mpi_already_sent(starpu_data_handle_t data, int dest)
 {
 {
-#ifdef STARPU_MPI_CACHE
+	if (cache_enabled == 0) return NULL;
+
 	struct _starpu_data_entry *already_sent;
 	struct _starpu_data_entry *already_sent;
 	HASH_FIND_PTR(sent_data[dest], &data, already_sent);
 	HASH_FIND_PTR(sent_data[dest], &data, already_sent);
-	if (already_sent == NULL) {
+	if (already_sent == NULL)
+	{
 		struct _starpu_data_entry *entry = (struct _starpu_data_entry *)malloc(sizeof(*entry));
 		struct _starpu_data_entry *entry = (struct _starpu_data_entry *)malloc(sizeof(*entry));
 		entry->data = data;
 		entry->data = data;
 		HASH_ADD_PTR(sent_data[dest], data, entry);
 		HASH_ADD_PTR(sent_data[dest], data, entry);
 		_STARPU_MPI_DEBUG("Noting that data %p has already been sent to %d\n", data, dest);
 		_STARPU_MPI_DEBUG("Noting that data %p has already been sent to %d\n", data, dest);
 	}
 	}
-	else {
+	else
+	{
 		_STARPU_MPI_DEBUG("Do not send data %p to node %d as it has already been sent\n", data, dest);
 		_STARPU_MPI_DEBUG("Do not send data %p to node %d as it has already been sent\n", data, dest);
 	}
 	}
 	return already_sent;
 	return already_sent;
-#else
-	return NULL;
-#endif
 }
 }
 
 
 static
 static
 void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int dest, int do_execute, MPI_Comm comm)
 void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int dest, int do_execute, MPI_Comm comm)
 {
 {
-	if (data && mode & STARPU_R) {
+	if (data && mode & STARPU_R)
+	{
 		int mpi_rank = starpu_data_get_rank(data);
 		int mpi_rank = starpu_data_get_rank(data);
 		int mpi_tag = starpu_data_get_tag(data);
 		int mpi_tag = starpu_data_get_tag(data);
-		if(mpi_rank == -1) {
+		if(mpi_rank == -1)
+		{
 			fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
 			fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
 			STARPU_ABORT();
 			STARPU_ABORT();
 		}
 		}
-		if(mpi_tag == -1) {
+		if(mpi_tag == -1)
+		{
 			fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
 			fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
 			STARPU_ABORT();
 			STARPU_ABORT();
 		}
 		}
 		/* The task needs to read this data */
 		/* The task needs to read this data */
-		if (do_execute && mpi_rank != me && mpi_rank != -1) {
+		if (do_execute && mpi_rank != me && mpi_rank != -1)
+		{
 			/* I will have to execute but I don't have the data, receive */
 			/* I will have to execute but I don't have the data, receive */
 			void *already_received = _starpu_mpi_already_received(data, mpi_rank);
 			void *already_received = _starpu_mpi_already_received(data, mpi_rank);
 			if (already_received == NULL)
 			if (already_received == NULL)
@@ -197,7 +218,8 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 				starpu_mpi_irecv_detached(data, mpi_rank, mpi_tag, comm, NULL, NULL);
 				starpu_mpi_irecv_detached(data, mpi_rank, mpi_tag, comm, NULL, NULL);
 			}
 			}
 		}
 		}
-		if (!do_execute && mpi_rank == me) {
+		if (!do_execute && mpi_rank == me)
+		{
 			/* Somebody else will execute it, and I have the data, send it. */
 			/* Somebody else will execute it, and I have the data, send it. */
 			void *already_sent = _starpu_mpi_already_sent(data, dest);
 			void *already_sent = _starpu_mpi_already_sent(data, dest);
 			if (already_sent == NULL)
 			if (already_sent == NULL)
@@ -212,24 +234,30 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 static
 static
 void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int xrank, int dest, int do_execute, MPI_Comm comm)
 void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int xrank, int dest, int do_execute, MPI_Comm comm)
 {
 {
-	if (mode & STARPU_W) {
+	if (mode & STARPU_W)
+	{
 		int mpi_rank = starpu_data_get_rank(data);
 		int mpi_rank = starpu_data_get_rank(data);
 		int mpi_tag = starpu_data_get_tag(data);
 		int mpi_tag = starpu_data_get_tag(data);
-		if(mpi_rank == -1) {
+		if(mpi_rank == -1)
+		{
 			fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
 			fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
 			STARPU_ABORT();
 			STARPU_ABORT();
 		}
 		}
-		if(mpi_tag == -1) {
+		if(mpi_tag == -1)
+		{
 			fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
 			fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
 			STARPU_ABORT();
 			STARPU_ABORT();
 		}
 		}
-		if (mpi_rank == me) {
-			if (xrank != -1 && me != xrank) {
+		if (mpi_rank == me)
+		{
+			if (xrank != -1 && me != xrank)
+			{
 				_STARPU_MPI_DEBUG("Receive data %p back from the task %d which executed the codelet ...\n", data, dest);
 				_STARPU_MPI_DEBUG("Receive data %p back from the task %d which executed the codelet ...\n", data, dest);
 				starpu_mpi_irecv_detached(data, dest, mpi_tag, comm, NULL, NULL);
 				starpu_mpi_irecv_detached(data, dest, mpi_tag, comm, NULL, NULL);
 			}
 			}
 		}
 		}
-		else if (do_execute) {
+		else if (do_execute)
+		{
 			_STARPU_MPI_DEBUG("Send data %p back to its owner %d...\n", data, mpi_rank);
 			_STARPU_MPI_DEBUG("Send data %p back to its owner %d...\n", data, mpi_rank);
 			starpu_mpi_isend_detached(data, mpi_rank, mpi_tag, comm, NULL, NULL);
 			starpu_mpi_isend_detached(data, mpi_rank, mpi_tag, comm, NULL, NULL);
 		}
 		}
@@ -238,43 +266,54 @@ void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum s
 
 
 void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int do_execute, MPI_Comm comm)
 void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int do_execute, MPI_Comm comm)
 {
 {
-#ifdef STARPU_MPI_CACHE
-	if (mode & STARPU_W) {
-		if (do_execute) {
-			/* Note that all copies I've sent to neighbours are now invalid */
-			int n, size;
-			MPI_Comm_size(comm, &size);
-			for(n=0 ; n<size ; n++) {
-				struct _starpu_data_entry *already_sent;
-				HASH_FIND_PTR(sent_data[n], &data, already_sent);
-				if (already_sent) {
-					_STARPU_MPI_DEBUG("Clearing send cache for data %p\n", data);
-					HASH_DEL(sent_data[n], already_sent);
+	if (cache_enabled)
+	{
+		if (mode & STARPU_W)
+		{
+			if (do_execute)
+			{
+				/* Note that all copies I've sent to neighbours are now invalid */
+				int n, size;
+				MPI_Comm_size(comm, &size);
+				for(n=0 ; n<size ; n++)
+				{
+					struct _starpu_data_entry *already_sent;
+					HASH_FIND_PTR(sent_data[n], &data, already_sent);
+					if (already_sent)
+					{
+						_STARPU_MPI_DEBUG("Clearing send cache for data %p\n", data);
+						HASH_DEL(sent_data[n], already_sent);
+					}
+				}
+			}
+			else
+			{
+				int mpi_rank = starpu_data_get_rank(data);
+				struct _starpu_data_entry *already_received;
+				HASH_FIND_PTR(received_data[mpi_rank], &data, already_received);
+				if (already_received)
+				{
+					/* Somebody else will write to the data, so discard our cached copy if any */
+					/* TODO: starpu_mpi could just remember itself. */
+					_STARPU_MPI_DEBUG("Clearing receive cache for data %p\n", data);
+					HASH_DEL(received_data[mpi_rank], already_received);
+					starpu_data_invalidate_submit(data);
 				}
 				}
 			}
 			}
 		}
 		}
-		else {
+	}
+	else
+	{
+		/* We allocated a temporary buffer for the received data, now drop it */
+		if ((mode & STARPU_R) && do_execute)
+		{
 			int mpi_rank = starpu_data_get_rank(data);
 			int mpi_rank = starpu_data_get_rank(data);
-			struct _starpu_data_entry *already_received;
-			HASH_FIND_PTR(received_data[mpi_rank], &data, already_received);
-			if (already_received) {
-				/* Somebody else will write to the data, so discard our cached copy if any */
-				/* TODO: starpu_mpi could just remember itself. */
-				_STARPU_MPI_DEBUG("Clearing receive cache for data %p\n", data);
-				HASH_DEL(received_data[mpi_rank], already_received);
+			if (mpi_rank != me && mpi_rank != -1)
+			{
 				starpu_data_invalidate_submit(data);
 				starpu_data_invalidate_submit(data);
 			}
 			}
 		}
 		}
 	}
 	}
-#else
-	/* We allocated a temporary buffer for the received data, now drop it */
-	if ((mode & STARPU_R) && do_execute) {
-		int mpi_rank = starpu_data_get_rank(data);
-		if (mpi_rank != me && mpi_rank != -1) {
-			starpu_data_invalidate_submit(data);
-		}
-	}
-#endif
 }
 }
 
 
 int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
@@ -307,20 +346,24 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 	do_execute = -1;
 	do_execute = -1;
 	xrank = -1;
 	xrank = -1;
 	va_start(varg_list, codelet);
 	va_start(varg_list, codelet);
-	while ((arg_type = va_arg(varg_list, int)) != 0) {
-		if (arg_type==STARPU_EXECUTE_ON_NODE) {
+	while ((arg_type = va_arg(varg_list, int)) != 0)
+	{
+		if (arg_type==STARPU_EXECUTE_ON_NODE)
+		{
 			xrank = va_arg(varg_list, int);
 			xrank = va_arg(varg_list, int);
 			_STARPU_MPI_DEBUG("Executing on node %d\n", xrank);
 			_STARPU_MPI_DEBUG("Executing on node %d\n", xrank);
 			do_execute = 1;
 			do_execute = 1;
 		}
 		}
-		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
+		else if (arg_type==STARPU_EXECUTE_ON_DATA)
+		{
 			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
 			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
 			xrank = starpu_data_get_rank(data);
 			xrank = starpu_data_get_rank(data);
 			_STARPU_MPI_DEBUG("Executing on data node %d\n", xrank);
 			_STARPU_MPI_DEBUG("Executing on data node %d\n", xrank);
 			STARPU_ASSERT(xrank <= nb_nodes);
 			STARPU_ASSERT(xrank <= nb_nodes);
 			do_execute = 1;
 			do_execute = 1;
 		}
 		}
-		else if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type==STARPU_SCRATCH || arg_type==STARPU_REDUX) {
+		else if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type==STARPU_SCRATCH || arg_type==STARPU_REDUX)
+		{
 			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
 			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
 			enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
 			enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
 			int ret = _starpu_mpi_find_executee_node(data, mode, me, &do_execute, &inconsistent_execute, &dest, size_on_nodes);
 			int ret = _starpu_mpi_find_executee_node(data, mode, me, &do_execute, &inconsistent_execute, &dest, size_on_nodes);
@@ -348,36 +391,45 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 				current_data ++;
 				current_data ++;
 			}
 			}
 		}
 		}
-		else if (arg_type==STARPU_VALUE) {
+		else if (arg_type==STARPU_VALUE)
+		{
 			va_arg(varg_list, void *);
 			va_arg(varg_list, void *);
 			va_arg(varg_list, size_t);
 			va_arg(varg_list, size_t);
 		}
 		}
-		else if (arg_type==STARPU_CALLBACK) {
+		else if (arg_type==STARPU_CALLBACK)
+		{
 			va_arg(varg_list, void (*)(void *));
 			va_arg(varg_list, void (*)(void *));
 		}
 		}
-		else if (arg_type==STARPU_CALLBACK_WITH_ARG) {
+		else if (arg_type==STARPU_CALLBACK_WITH_ARG)
+		{
 			va_arg(varg_list, void (*)(void *));
 			va_arg(varg_list, void (*)(void *));
 			va_arg(varg_list, void *);
 			va_arg(varg_list, void *);
 		}
 		}
-		else if (arg_type==STARPU_CALLBACK_ARG) {
+		else if (arg_type==STARPU_CALLBACK_ARG)
+		{
 			va_arg(varg_list, void *);
 			va_arg(varg_list, void *);
 		}
 		}
-		else if (arg_type==STARPU_PRIORITY) {
+		else if (arg_type==STARPU_PRIORITY)
+		{
 			va_arg(varg_list, int);
 			va_arg(varg_list, int);
 		}
 		}
-		else if (arg_type==STARPU_EXECUTE_ON_NODE) {
+		else if (arg_type==STARPU_EXECUTE_ON_NODE)
+		{
 			va_arg(varg_list, int);
 			va_arg(varg_list, int);
 		}
 		}
-		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
+		else if (arg_type==STARPU_EXECUTE_ON_DATA)
+		{
 			va_arg(varg_list, int);
 			va_arg(varg_list, int);
 		}
 		}
 	}
 	}
 	va_end(varg_list);
 	va_end(varg_list);
 
 
-	if (do_execute == -1) {
+	if (do_execute == -1)
+	{
 		int i;
 		int i;
 		size_t max_size = 0;
 		size_t max_size = 0;
-		for(i=0 ; i<nb_nodes ; i++) {
+		for(i=0 ; i<nb_nodes ; i++)
+		{
 			if (size_on_nodes[i] > max_size)
 			if (size_on_nodes[i] > max_size)
 			{
 			{
 				max_size = size_on_nodes[i];
 				max_size = size_on_nodes[i];
@@ -385,7 +437,8 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 			}
 			}
 		}
 		}
 		free(size_on_nodes);
 		free(size_on_nodes);
-		if (xrank != -1) {
+		if (xrank != -1)
+		{
 			_STARPU_MPI_DEBUG("Node %d is having the most R data\n", xrank);
 			_STARPU_MPI_DEBUG("Node %d is having the most R data\n", xrank);
 			do_execute = 1;
 			do_execute = 1;
 		}
 		}
@@ -393,18 +446,22 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 
 
 	STARPU_ASSERT_MSG(do_execute != -1, "StarPU needs to see a W or a REDUX data which will tell it where to execute the task");
 	STARPU_ASSERT_MSG(do_execute != -1, "StarPU needs to see a W or a REDUX data which will tell it where to execute the task");
 
 
-	if (inconsistent_execute == 1) {
-		if (xrank == -1) {
+	if (inconsistent_execute == 1)
+	{
+		if (xrank == -1)
+		{
 			_STARPU_MPI_DEBUG("Different tasks are owning W data. Needs to specify which one is to execute the codelet, using STARPU_EXECUTE_ON_NODE or STARPU_EXECUTE_ON_DATA\n");
 			_STARPU_MPI_DEBUG("Different tasks are owning W data. Needs to specify which one is to execute the codelet, using STARPU_EXECUTE_ON_NODE or STARPU_EXECUTE_ON_DATA\n");
 			free(size_on_nodes);
 			free(size_on_nodes);
 			return -EINVAL;
 			return -EINVAL;
 		}
 		}
-		else {
+		else
+		{
 			do_execute = (me == xrank);
 			do_execute = (me == xrank);
 			dest = xrank;
 			dest = xrank;
 		}
 		}
 	}
 	}
-	else if (xrank != -1) {
+	else if (xrank != -1)
+	{
 		do_execute = (me == xrank);
 		do_execute = (me == xrank);
 		dest = xrank;
 		dest = xrank;
 	}
 	}
@@ -412,8 +469,10 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 	/* Send and receive data as requested */
 	/* Send and receive data as requested */
 	va_start(varg_list, codelet);
 	va_start(varg_list, codelet);
 	current_data = 0;
 	current_data = 0;
-	while ((arg_type = va_arg(varg_list, int)) != 0) {
-		if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type==STARPU_SCRATCH || arg_type==STARPU_REDUX) {
+	while ((arg_type = va_arg(varg_list, int)) != 0)
+	{
+		if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type==STARPU_SCRATCH || arg_type==STARPU_REDUX)
+		{
 			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
 			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
 			enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
 			enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
 
 
@@ -433,33 +492,41 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 				current_data++;
 				current_data++;
 			}
 			}
 		}
 		}
-		else if (arg_type==STARPU_VALUE) {
+		else if (arg_type==STARPU_VALUE)
+		{
 			va_arg(varg_list, void *);
 			va_arg(varg_list, void *);
 			va_arg(varg_list, size_t);
 			va_arg(varg_list, size_t);
 		}
 		}
-		else if (arg_type==STARPU_CALLBACK) {
+		else if (arg_type==STARPU_CALLBACK)
+		{
 			va_arg(varg_list, void (*)(void *));
 			va_arg(varg_list, void (*)(void *));
 		}
 		}
-		else if (arg_type==STARPU_CALLBACK_WITH_ARG) {
+		else if (arg_type==STARPU_CALLBACK_WITH_ARG)
+		{
 			va_arg(varg_list, void (*)(void *));
 			va_arg(varg_list, void (*)(void *));
 			va_arg(varg_list, void *);
 			va_arg(varg_list, void *);
 		}
 		}
-		else if (arg_type==STARPU_CALLBACK_ARG) {
+		else if (arg_type==STARPU_CALLBACK_ARG)
+		{
 			va_arg(varg_list, void *);
 			va_arg(varg_list, void *);
 		}
 		}
-		else if (arg_type==STARPU_PRIORITY) {
+		else if (arg_type==STARPU_PRIORITY)
+		{
 			va_arg(varg_list, int);
 			va_arg(varg_list, int);
 		}
 		}
-		else if (arg_type==STARPU_EXECUTE_ON_NODE) {
+		else if (arg_type==STARPU_EXECUTE_ON_NODE)
+		{
 			va_arg(varg_list, int);
 			va_arg(varg_list, int);
 		}
 		}
-		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
+		else if (arg_type==STARPU_EXECUTE_ON_DATA)
+		{
 			va_arg(varg_list, starpu_data_handle_t);
 			va_arg(varg_list, starpu_data_handle_t);
 		}
 		}
 	}
 	}
 	va_end(varg_list);
 	va_end(varg_list);
 
 
-	if (do_execute) {
+	if (do_execute)
+	{
 		_STARPU_MPI_DEBUG("Execution of the codelet %p (%s)\n", codelet, codelet->name);
 		_STARPU_MPI_DEBUG("Execution of the codelet %p (%s)\n", codelet, codelet->name);
 		va_start(varg_list, codelet);
 		va_start(varg_list, codelet);
 		struct starpu_task *task = starpu_task_create();
 		struct starpu_task *task = starpu_task_create();
@@ -468,11 +535,14 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 		STARPU_ASSERT(ret==0);
 		STARPU_ASSERT(ret==0);
 	}
 	}
 
 
-	if (inconsistent_execute) {
+	if (inconsistent_execute)
+	{
 		va_start(varg_list, codelet);
 		va_start(varg_list, codelet);
 		current_data = 0;
 		current_data = 0;
-		while ((arg_type = va_arg(varg_list, int)) != 0) {
-			if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type==STARPU_SCRATCH || arg_type==STARPU_REDUX) {
+		while ((arg_type = va_arg(varg_list, int)) != 0)
+		{
+			if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type==STARPU_SCRATCH || arg_type==STARPU_REDUX)
+			{
 				starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
 				starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
 				enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
 				enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
 
 
@@ -491,27 +561,34 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 					current_data++;
 					current_data++;
 				}
 				}
 			}
 			}
-			else if (arg_type==STARPU_VALUE) {
+			else if (arg_type==STARPU_VALUE)
+			{
 				va_arg(varg_list, void *);
 				va_arg(varg_list, void *);
 				va_arg(varg_list, size_t);
 				va_arg(varg_list, size_t);
 			}
 			}
-			else if (arg_type==STARPU_CALLBACK) {
+			else if (arg_type==STARPU_CALLBACK)
+			{
 				va_arg(varg_list, void (*)(void *));
 				va_arg(varg_list, void (*)(void *));
 			}
 			}
-			else if (arg_type==STARPU_CALLBACK_WITH_ARG) {
+			else if (arg_type==STARPU_CALLBACK_WITH_ARG)
+			{
 				va_arg(varg_list, void (*)(void *));
 				va_arg(varg_list, void (*)(void *));
 				va_arg(varg_list, void *);
 				va_arg(varg_list, void *);
 			}
 			}
-			else if (arg_type==STARPU_CALLBACK_ARG) {
+			else if (arg_type==STARPU_CALLBACK_ARG)
+			{
 				va_arg(varg_list, void *);
 				va_arg(varg_list, void *);
 			}
 			}
-			else if (arg_type==STARPU_PRIORITY) {
+			else if (arg_type==STARPU_PRIORITY)
+			{
 				va_arg(varg_list, int);
 				va_arg(varg_list, int);
 			}
 			}
-			else if (arg_type==STARPU_EXECUTE_ON_NODE) {
+			else if (arg_type==STARPU_EXECUTE_ON_NODE)
+			{
 				va_arg(varg_list, int);
 				va_arg(varg_list, int);
 			}
 			}
-			else if (arg_type==STARPU_EXECUTE_ON_DATA) {
+			else if (arg_type==STARPU_EXECUTE_ON_DATA)
+			{
 				va_arg(varg_list, starpu_data_handle_t);
 				va_arg(varg_list, starpu_data_handle_t);
 			}
 			}
 		}
 		}
@@ -520,8 +597,10 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 
 
 	va_start(varg_list, codelet);
 	va_start(varg_list, codelet);
 	current_data = 0;
 	current_data = 0;
-	while ((arg_type = va_arg(varg_list, int)) != 0) {
-		if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type == STARPU_SCRATCH || arg_type == STARPU_REDUX) {
+	while ((arg_type = va_arg(varg_list, int)) != 0)
+	{
+		if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type == STARPU_SCRATCH || arg_type == STARPU_REDUX)
+		{
 			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
 			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
 			enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
 			enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
 
 
@@ -540,27 +619,34 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 				current_data++;
 				current_data++;
 			}
 			}
 		}
 		}
-		else if (arg_type==STARPU_VALUE) {
+		else if (arg_type==STARPU_VALUE)
+		{
 			va_arg(varg_list, void *);
 			va_arg(varg_list, void *);
 			va_arg(varg_list, size_t);
 			va_arg(varg_list, size_t);
 		}
 		}
-		else if (arg_type==STARPU_CALLBACK) {
+		else if (arg_type==STARPU_CALLBACK)
+		{
 			va_arg(varg_list, void (*)(void *));
 			va_arg(varg_list, void (*)(void *));
 		}
 		}
-		else if (arg_type==STARPU_CALLBACK_WITH_ARG) {
+		else if (arg_type==STARPU_CALLBACK_WITH_ARG)
+		{
 			va_arg(varg_list, void (*)(void *));
 			va_arg(varg_list, void (*)(void *));
 			va_arg(varg_list, void *);
 			va_arg(varg_list, void *);
 		}
 		}
-		else if (arg_type==STARPU_CALLBACK_ARG) {
+		else if (arg_type==STARPU_CALLBACK_ARG)
+		{
 			va_arg(varg_list, void *);
 			va_arg(varg_list, void *);
 		}
 		}
-		else if (arg_type==STARPU_PRIORITY) {
+		else if (arg_type==STARPU_PRIORITY)
+		{
 			va_arg(varg_list, int);
 			va_arg(varg_list, int);
 		}
 		}
-		else if (arg_type==STARPU_EXECUTE_ON_NODE) {
+		else if (arg_type==STARPU_EXECUTE_ON_NODE)
+		{
 			va_arg(varg_list, int);
 			va_arg(varg_list, int);
 		}
 		}
-		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
+		else if (arg_type==STARPU_EXECUTE_ON_DATA)
+		{
 			va_arg(varg_list, starpu_data_handle_t);
 			va_arg(varg_list, starpu_data_handle_t);
 		}
 		}
 	}
 	}
@@ -576,11 +662,13 @@ void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t da
 
 
 	rank = starpu_data_get_rank(data_handle);
 	rank = starpu_data_get_rank(data_handle);
 	tag = starpu_data_get_tag(data_handle);
 	tag = starpu_data_get_tag(data_handle);
-	if(rank == -1) {
+	if (rank == -1)
+	{
 		fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
 		fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
 		STARPU_ABORT();
 		STARPU_ABORT();
 	}
 	}
-	if(tag == -1) {
+	if (tag == -1)
+	{
 		fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
 		fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
 		STARPU_ABORT();
 		STARPU_ABORT();
 	}
 	}
@@ -604,11 +692,13 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 
 
 	rank = starpu_data_get_rank(data_handle);
 	rank = starpu_data_get_rank(data_handle);
 	tag = starpu_data_get_tag(data_handle);
 	tag = starpu_data_get_tag(data_handle);
-	if(rank == -1) {
+	if (rank == -1)
+	{
 		fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
 		fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
 		STARPU_ABORT();
 		STARPU_ABORT();
 	}
 	}
-	if(tag == -1) {
+	if (tag == -1)
+	{
 		fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
 		fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
 		STARPU_ABORT();
 		STARPU_ABORT();
 	}
 	}
@@ -633,11 +723,13 @@ void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
 
 
 	rank = starpu_data_get_rank(data_handle);
 	rank = starpu_data_get_rank(data_handle);
 	tag = starpu_data_get_tag(data_handle);
 	tag = starpu_data_get_tag(data_handle);
-	if(rank == -1) {
+	if (rank == -1)
+	{
 		fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
 		fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
 		STARPU_ABORT();
 		STARPU_ABORT();
 	}
 	}
-	if(tag == -1) {
+	if (tag == -1)
+	{
 		fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
 		fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
 		STARPU_ABORT();
 		STARPU_ABORT();
 	}
 	}
@@ -648,11 +740,14 @@ void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
 	_STARPU_MPI_DEBUG("Doing reduction for data %p on node %d with %d nodes ...\n", data_handle, rank, nb_nodes);
 	_STARPU_MPI_DEBUG("Doing reduction for data %p on node %d with %d nodes ...\n", data_handle, rank, nb_nodes);
 
 
 	// need to count how many nodes have the data in redux mode
 	// need to count how many nodes have the data in redux mode
-	if (me == rank) {
+	if (me == rank)
+	{
 		int i;
 		int i;
 
 
-		for(i=0 ; i<nb_nodes ; i++) {
-			if (i != rank) {
+		for(i=0 ; i<nb_nodes ; i++)
+		{
+			if (i != rank)
+			{
 				starpu_data_handle_t new_handle;
 				starpu_data_handle_t new_handle;
 
 
 				starpu_data_register_same(&new_handle, data_handle);
 				starpu_data_register_same(&new_handle, data_handle);
@@ -661,14 +756,17 @@ void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
 
 
 				starpu_mpi_irecv_detached(new_handle, i, tag, comm, NULL, NULL);
 				starpu_mpi_irecv_detached(new_handle, i, tag, comm, NULL, NULL);
 				starpu_insert_task(data_handle->redux_cl,
 				starpu_insert_task(data_handle->redux_cl,
-						STARPU_RW, data_handle,
-						STARPU_R, new_handle,
-						0);
+						   STARPU_RW, data_handle,
+						   STARPU_R, new_handle,
+						   0);
+				starpu_data_unregister_submit(new_handle);
 			}
 			}
 		}
 		}
 	}
 	}
-	else {
+	else
+	{
 		_STARPU_MPI_DEBUG("Sending redux handle to %d ...\n", rank);
 		_STARPU_MPI_DEBUG("Sending redux handle to %d ...\n", rank);
 		starpu_mpi_isend_detached(data_handle, rank, tag, comm, NULL, NULL);
 		starpu_mpi_isend_detached(data_handle, rank, tag, comm, NULL, NULL);
+		starpu_insert_task(data_handle->init_cl, STARPU_W, data_handle, 0);
 	}
 	}
 }
 }

+ 15 - 6
mpi/src/starpu_mpi_private.h

@@ -41,6 +41,11 @@ extern "C" {
 #  define _STARPU_MPI_DEBUG(fmt, args ...)
 #  define _STARPU_MPI_DEBUG(fmt, args ...)
 #endif
 #endif
 
 
+#define _STARPU_MPI_DISP(fmt, args ...) do { if (!getenv("STARPU_SILENT")) { \
+    						int _debug_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_debug_rank);       \
+                                                fprintf(stderr, "%*s[%d][starpu_mpi][%s] " fmt , (_debug_rank+1)*4, "", _debug_rank, __func__ ,##args); \
+                                                fflush(stderr); }} while(0);
+
 #ifdef STARPU_MPI_VERBOSE0
 #ifdef STARPU_MPI_VERBOSE0
 #  define _STARPU_MPI_LOG_IN()             do { if (!getenv("STARPU_SILENT")) { \
 #  define _STARPU_MPI_LOG_IN()             do { if (!getenv("STARPU_SILENT")) { \
                                                int _debug_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_debug_rank);                        \
                                                int _debug_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_debug_rank);                        \
@@ -55,11 +60,14 @@ extern "C" {
 #  define _STARPU_MPI_LOG_OUT()
 #  define _STARPU_MPI_LOG_OUT()
 #endif
 #endif
 
 
-#define SEND_REQ	0
-#define RECV_REQ	1
-#define WAIT_REQ        2
-#define TEST_REQ        3
-#define BARRIER_REQ     4
+enum _starpu_mpi_request_type
+{
+	SEND_REQ=0,
+	RECV_REQ=1,
+	WAIT_REQ=2,
+	TEST_REQ=3,
+	BARRIER_REQ=4
+};
 
 
 LIST_TYPE(_starpu_mpi_req,
 LIST_TYPE(_starpu_mpi_req,
 	/* description of the data at StarPU level */
 	/* description of the data at StarPU level */
@@ -68,6 +76,7 @@ LIST_TYPE(_starpu_mpi_req,
 	/* description of the data to be sent/received */
 	/* description of the data to be sent/received */
 	MPI_Datatype datatype;
 	MPI_Datatype datatype;
 	void *ptr;
 	void *ptr;
+	size_t count;
 	int needs_unpacking;
 	int needs_unpacking;
 
 
 	/* who are we talking to ? */
 	/* who are we talking to ? */
@@ -85,7 +94,7 @@ LIST_TYPE(_starpu_mpi_req,
 	pthread_mutex_t req_mutex;
 	pthread_mutex_t req_mutex;
 	pthread_cond_t req_cond;
 	pthread_cond_t req_cond;
 
 
-	unsigned request_type; /* 0 send, 1 recv */
+	enum _starpu_mpi_request_type request_type; /* 0 send, 1 recv */
 
 
 	unsigned submitted;
 	unsigned submitted;
 	unsigned completed;
 	unsigned completed;

+ 7 - 5
mpi/src/starpu_mpi_stats.c

@@ -27,19 +27,15 @@ static int stats_enabled=0;
 
 
 void _starpu_mpi_comm_amounts_init(MPI_Comm comm)
 void _starpu_mpi_comm_amounts_init(MPI_Comm comm)
 {
 {
-#ifdef STARPU_COMM_STATS
-	stats_enabled = 1;
-#else
 	stats_enabled = starpu_get_env_number("STARPU_COMM_STATS");
 	stats_enabled = starpu_get_env_number("STARPU_COMM_STATS");
 	if (stats_enabled == -1)
 	if (stats_enabled == -1)
 	{
 	{
 		stats_enabled = 0;
 		stats_enabled = 0;
 	}
 	}
-#endif /* STARPU_COMM_STATS */
 
 
 	if (stats_enabled == 0) return;
 	if (stats_enabled == 0) return;
 
 
-	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --enable-comm-stats or is executed with STARPU_COMM_STATS=1, which slows down a bit\n");
+	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU is executed with STARPU_COMM_STATS=1, which slows down a bit\n");
 
 
 	MPI_Comm_size(comm, &world_size);
 	MPI_Comm_size(comm, &world_size);
 	_STARPU_MPI_DEBUG("allocating for %d nodes\n", world_size);
 	_STARPU_MPI_DEBUG("allocating for %d nodes\n", world_size);
@@ -67,12 +63,18 @@ void _starpu_mpi_comm_amounts_inc(MPI_Comm comm, unsigned dst, MPI_Datatype data
 	comm_amount[dst] += count*size;
 	comm_amount[dst] += count*size;
 }
 }
 
 
+void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts)
+{
+	memcpy(comm_amounts, comm_amount, world_size * sizeof(size_t));
+}
+
 void _starpu_mpi_comm_amounts_display(int node)
 void _starpu_mpi_comm_amounts_display(int node)
 {
 {
 	unsigned dst;
 	unsigned dst;
 	size_t sum = 0;
 	size_t sum = 0;
 
 
 	if (stats_enabled == 0) return;
 	if (stats_enabled == 0) return;
+	if (getenv("STARPU_SILENT")) return;
 
 
 	for (dst = 0; dst < world_size; dst++)
 	for (dst = 0; dst < world_size; dst++)
 	{
 	{

+ 14 - 4
mpi/tests/Makefile.am

@@ -19,9 +19,9 @@ CCLD=$(MPICC)
 
 
 if STARPU_MPI_CHECK
 if STARPU_MPI_CHECK
 if STARPU_HAVE_AM111
 if STARPU_HAVE_AM111
-LOG_COMPILER	 	=	$(MPIEXEC) -np 2
+LOG_COMPILER	 	=	$(MPIEXEC) -np 4
 else
 else
-TESTS_ENVIRONMENT 	=	$(MPIEXEC) -np 2
+TESTS_ENVIRONMENT 	=	$(MPIEXEC) -np 4
 endif
 endif
 TESTS			=	$(check_PROGRAMS)
 TESTS			=	$(check_PROGRAMS)
 endif
 endif
@@ -78,7 +78,9 @@ check_PROGRAMS +=				\
 	insert_task_owner			\
 	insert_task_owner			\
 	insert_task_owner2			\
 	insert_task_owner2			\
 	insert_task_owner_data			\
 	insert_task_owner_data			\
-	multiple_send
+	multiple_send				\
+	mpi_scatter_gather			\
+	mpi_reduction
 
 
 noinst_PROGRAMS =				\
 noinst_PROGRAMS =				\
 	pingpong				\
 	pingpong				\
@@ -99,7 +101,9 @@ noinst_PROGRAMS =				\
 	insert_task_owner			\
 	insert_task_owner			\
 	insert_task_owner2			\
 	insert_task_owner2			\
 	insert_task_owner_data			\
 	insert_task_owner_data			\
-	multiple_send
+	multiple_send				\
+	mpi_scatter_gather			\
+	mpi_reduction
 
 
 mpi_isend_LDADD =					\
 mpi_isend_LDADD =					\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
@@ -139,6 +143,10 @@ insert_task_owner_data_LDADD =			\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 multiple_send_LDADD =				\
 multiple_send_LDADD =				\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_scatter_gather_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_reduction_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 
 
 ring_SOURCES = ring.c
 ring_SOURCES = ring.c
 ring_async_SOURCES = ring_async.c
 ring_async_SOURCES = ring_async.c
@@ -148,6 +156,8 @@ ring_SOURCES += ring_kernel.cu
 ring_async_SOURCES += ring_kernel.cu
 ring_async_SOURCES += ring_kernel.cu
 ring_async_implicit_SOURCES += ring_kernel.cu
 ring_async_implicit_SOURCES += ring_kernel.cu
 endif
 endif
+mpi_reduction_SOURCES = mpi_reduction.c
+mpi_reduction_SOURCES += mpi_reduction_kernels.c
 
 
 showcheck:
 showcheck:
 	-cat $(TEST_LOGS) /dev/null
 	-cat $(TEST_LOGS) /dev/null

+ 6 - 13
mpi/tests/block_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -35,23 +35,16 @@ int main(int argc, char **argv)
 	if (size < 2)
 	if (size < 2)
 	{
 	{
 		if (rank == 0)
 		if (rank == 0)
-			FPRINTF(stderr, "We need at least processes.\n");
+			FPRINTF(stderr, "We need at least 2 processes.\n");
 
 
 		MPI_Finalize();
 		MPI_Finalize();
 		return STARPU_TEST_SKIPPED;
 		return STARPU_TEST_SKIPPED;
 	}
 	}
 
 
-	/* We only use 2 nodes for that test */
-	if (rank >= 2)
-	{
-		MPI_Finalize();
-		return STARPU_TEST_SKIPPED;
-	}
-
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize();
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+	ret = starpu_mpi_init(&argc, &argv);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	/* Node 0 will allocate a big block and only register an inner part of
 	/* Node 0 will allocate a big block and only register an inner part of
 	 * it as the block data, Node 1 will allocate a block of small size and
 	 * it as the block data, Node 1 will allocate a block of small size and
@@ -79,7 +72,7 @@ int main(int argc, char **argv)
 			(uintptr_t)block, BIGSIZE, BIGSIZE*BIGSIZE,
 			(uintptr_t)block, BIGSIZE, BIGSIZE*BIGSIZE,
 			SIZE, SIZE, SIZE, sizeof(float));
 			SIZE, SIZE, SIZE, sizeof(float));
 	}
 	}
-	else /* rank == 1 */
+	else if (rank == 1)
 	{
 	{
 		block = calloc(SIZE*SIZE*SIZE, sizeof(float));
 		block = calloc(SIZE*SIZE*SIZE, sizeof(float));
 		assert(block);
 		assert(block);
@@ -112,7 +105,7 @@ int main(int argc, char **argv)
 		starpu_data_release(block_handle);
 		starpu_data_release(block_handle);
 
 
 	}
 	}
-	else /* rank == 1 */
+	else if (rank == 1)
 	{
 	{
 		MPI_Status status;
 		MPI_Status status;
 		ret = starpu_mpi_recv(block_handle, 0, 0x42, MPI_COMM_WORLD, &status);
 		ret = starpu_mpi_recv(block_handle, 0, 0x42, MPI_COMM_WORLD, &status);

+ 6 - 13
mpi/tests/block_interface_pinned.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -35,23 +35,16 @@ int main(int argc, char **argv)
 	if (size < 2)
 	if (size < 2)
 	{
 	{
 		if (rank == 0)
 		if (rank == 0)
-			FPRINTF(stderr, "We need at least processes.\n");
+			FPRINTF(stderr, "We need at least 2 processes.\n");
 
 
 		MPI_Finalize();
 		MPI_Finalize();
 		return STARPU_TEST_SKIPPED;
 		return STARPU_TEST_SKIPPED;
 	}
 	}
 
 
-	/* We only use 2 nodes for that test */
-	if (rank >= 2)
-	{
-		MPI_Finalize();
-		return STARPU_TEST_SKIPPED;
-	}
-
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize();
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+	ret = starpu_mpi_init(&argc, &argv);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	/* Node 0 will allocate a big block and only register an inner part of
 	/* Node 0 will allocate a big block and only register an inner part of
 	 * it as the block data, Node 1 will allocate a block of small size and
 	 * it as the block data, Node 1 will allocate a block of small size and
@@ -80,7 +73,7 @@ int main(int argc, char **argv)
 			(uintptr_t)block, BIGSIZE, BIGSIZE*BIGSIZE,
 			(uintptr_t)block, BIGSIZE, BIGSIZE*BIGSIZE,
 			SIZE, SIZE, SIZE, sizeof(float));
 			SIZE, SIZE, SIZE, sizeof(float));
 	}
 	}
-	else /* rank == 1 */
+	else if (rank == 1)
 	{
 	{
 		starpu_malloc((void **)&block,
 		starpu_malloc((void **)&block,
 			SIZE*SIZE*SIZE*sizeof(float));
 			SIZE*SIZE*SIZE*sizeof(float));
@@ -113,7 +106,7 @@ int main(int argc, char **argv)
 		starpu_data_release(block_handle);
 		starpu_data_release(block_handle);
 
 
 	}
 	}
-	else /* rank == 1 */
+	else if (rank == 1)
 	{
 	{
 		MPI_Status status;
 		MPI_Status status;
 
 

+ 4 - 0
mpi/tests/helper.h

@@ -19,4 +19,8 @@
 #define STARPU_TEST_SKIPPED 77
 #define STARPU_TEST_SKIPPED 77
 
 
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+#define FPRINTF_MPI(fmt, args ...) do { if (!getenv("STARPU_SILENT")) { \
+    						int _disp_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_disp_rank);       \
+                                                fprintf(stderr, "[%d][starpu_mpi][%s] " fmt , _disp_rank, __func__ ,##args); \
+                                                fflush(stderr); }} while(0);
 
 

+ 5 - 8
mpi/tests/insert_task.c

@@ -54,8 +54,10 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize_extended(&rank, &size);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+	ret = starpu_mpi_init(&argc, &argv);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
         for(x = 0; x < X; x++)
         for(x = 0; x < X; x++)
 	{
 	{
@@ -85,17 +87,12 @@ int main(int argc, char **argv)
                                 //FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
                                 //FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
                                 starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
                                 starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
                         }
                         }
-                        else if (rank == mpi_rank+1 || rank == mpi_rank-1)
+                        else
 			{
 			{
                                 /* I don't own that index, but will need it for my computations */
                                 /* I don't own that index, but will need it for my computations */
                                 //FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
                                 //FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
                                 starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
                                 starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
                         }
                         }
-                        else
-			{
-                                /* I know it's useless to allocate anything for this */
-                                data_handles[x][y] = NULL;
-                        }
                         if (data_handles[x][y])
                         if (data_handles[x][y])
 			{
 			{
                                 starpu_data_set_rank(data_handles[x][y], mpi_rank);
                                 starpu_data_set_rank(data_handles[x][y], mpi_rank);

+ 4 - 7
mpi/tests/insert_task_block.c

@@ -71,8 +71,10 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize_extended(&rank, &size);
+	ret = starpu_mpi_init(&argc, &argv);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
         for(x = 0; x < SIZE; x++)
         for(x = 0; x < SIZE; x++)
 	{
 	{
@@ -103,18 +105,13 @@ int main(int argc, char **argv)
                                 starpu_matrix_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[((SIZE/BLOCKS)*x) + ((SIZE/BLOCKS)*y) * SIZE]),
                                 starpu_matrix_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[((SIZE/BLOCKS)*x) + ((SIZE/BLOCKS)*y) * SIZE]),
                                                             SIZE, SIZE/BLOCKS, SIZE/BLOCKS, sizeof(unsigned));
                                                             SIZE, SIZE/BLOCKS, SIZE/BLOCKS, sizeof(unsigned));
                         }
                         }
-                        else if (rank == mpi_rank+1 || rank == mpi_rank-1)
+                        else
 			{
 			{
                                 /* I don't own that index, but will need it for my computations */
                                 /* I don't own that index, but will need it for my computations */
                                 //FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
                                 //FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
                                 starpu_matrix_data_register(&data_handles[x][y], -1, (uintptr_t)&(matrix[((SIZE/BLOCKS)*x) + ((SIZE/BLOCKS)*y) * SIZE]),
                                 starpu_matrix_data_register(&data_handles[x][y], -1, (uintptr_t)&(matrix[((SIZE/BLOCKS)*x) + ((SIZE/BLOCKS)*y) * SIZE]),
                                                             SIZE, SIZE/BLOCKS, SIZE/BLOCKS, sizeof(unsigned));
                                                             SIZE, SIZE/BLOCKS, SIZE/BLOCKS, sizeof(unsigned));
                         }
                         }
-                        else
-			{
-                                /* I know it's useless to allocate anything for this */
-                                data_handles[x][y] = NULL;
-                        }
                         if (data_handles[x][y])
                         if (data_handles[x][y])
 			{
 			{
                                 starpu_data_set_rank(data_handles[x][y], mpi_rank);
                                 starpu_data_set_rank(data_handles[x][y], mpi_rank);

+ 84 - 93
mpi/tests/insert_task_cache.c

@@ -14,17 +14,21 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
  */
 
 
+#include <starpu.h>
 #include <starpu_mpi.h>
 #include <starpu_mpi.h>
 #include <math.h>
 #include <math.h>
 #include "helper.h"
 #include "helper.h"
 
 
-void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
+#if !defined(STARPU_HAVE_UNSETENV)
+#warning unsetenv is not defined. Skipping test
+int main(int argc, char **argv)
 {
 {
-	unsigned *x = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
-	unsigned *y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
+	return STARPU_TEST_SKIPPED;
+}
+#else
 
 
-        FPRINTF(stdout, "VALUES: %u %u\n", *x, *y);
-        *x = (*x + *y) / 2;
+void func_cpu(__attribute__ ((unused)) void *descr[], __attribute__ ((unused)) void *_args)
+{
 }
 }
 
 
 struct starpu_codelet mycodelet =
 struct starpu_codelet mycodelet =
@@ -35,118 +39,105 @@ struct starpu_codelet mycodelet =
 	.modes = {STARPU_RW, STARPU_R}
 	.modes = {STARPU_RW, STARPU_R}
 };
 };
 
 
-#define X     4
-#define Y     5
+#define N     1000
 
 
 /* Returns the MPI node number where data indexes index is */
 /* Returns the MPI node number where data indexes index is */
-int my_distrib(int x, int y, int nb_nodes)
+int my_distrib(int x)
 {
 {
-        return x % nb_nodes;
+        return x;
 }
 }
 
 
-
-int main(int argc, char **argv)
+void test_cache(int rank, int size, int enabled, size_t *comm_amount)
 {
 {
-        int rank, size, x, y;
-        int ret,value=0;
-        unsigned matrix[X][Y];
-        starpu_data_handle_t data_handles[X][Y];
+        int i;
+        int ret;
+	unsigned v[2][N];
+        starpu_data_handle_t data_handles[2];
+	char *string;
+
+	string = malloc(50);
+	sprintf(string, "STARPU_MPI_CACHE=%d", enabled);
+	putenv(string);
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize_extended(&rank, &size);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+	ret = starpu_mpi_init(NULL, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
-        for(x = 0; x < X; x++)
+        for(i = 0; i < 2; i++)
 	{
 	{
-                for (y = 0; y < Y; y++)
+		int mpi_rank = my_distrib(i);
+		if (mpi_rank == rank)
 		{
 		{
-                        matrix[x][y] = (rank+1)*10 + value;
-                        value++;
-                }
-        }
-#if 0
-        for(x = 0; x < X; x++)
-	{
-                FPRINTF(stdout, "[%d] ", rank);
-                for (y = 0; y < Y; y++)
+			//FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+			starpu_vector_data_register(&data_handles[i], 0, (uintptr_t)&(v[i]), N, sizeof(unsigned));
+		}
+		else
 		{
 		{
-                        FPRINTF(stdout, "%3u ", matrix[x][y]);
-                }
-                FPRINTF(stdout, "\n");
+			/* I don't own that index, but will need it for my computations */
+			//FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+			starpu_vector_data_register(&data_handles[i], -1, (uintptr_t)NULL, N, sizeof(unsigned));
+		}
+		starpu_data_set_rank(data_handles[i], mpi_rank);
+		starpu_data_set_tag(data_handles[i], i);
         }
         }
-#endif
 
 
-        for(x = 0; x < X; x++)
+        for(i = 0; i < 5; i++)
 	{
 	{
-                for (y = 0; y < Y; y++)
-		{
-                        int mpi_rank = my_distrib(x, y, size);
-                        if (mpi_rank == rank)
-			{
-                                //FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
-                                starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
-                        }
-                        else if (rank == mpi_rank+1 || rank == mpi_rank-1)
-			{
-                                /* I don't own that index, but will need it for my computations */
-                                //FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
-                                starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
-                        }
-                        else
-			{
-                                /* I know it's useless to allocate anything for this */
-                                data_handles[x][y] = NULL;
-                        }
-                        if (data_handles[x][y])
-			{
-                                starpu_data_set_rank(data_handles[x][y], mpi_rank);
-                                starpu_data_set_tag(data_handles[x][y], (y*X)+x);
-			}
-                }
-        }
-
-	mycodelet.name = "codelet1";
-        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1][1], STARPU_R, data_handles[0][1], 0);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0], STARPU_R, data_handles[1], 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	}
 
 
-	mycodelet.name = "codelet2";
-        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
-
-	mycodelet.name = "codelet3";
-        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0][1], STARPU_R, data_handles[0][0], 0);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
-
-	mycodelet.name = "codelet4";
-        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+        for(i = 0; i < 5; i++)
+	{
+		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1], STARPU_R, data_handles[0], 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	}
 
 
-        FPRINTF(stderr, "Waiting ...\n");
         starpu_task_wait_for_all();
         starpu_task_wait_for_all();
 
 
-        for(x = 0; x < X; x++)
+        for(i = 0; i < 2; i++)
 	{
 	{
-                for (y = 0; y < Y; y++)
-		{
-                        if (data_handles[x][y])
-                                starpu_data_unregister(data_handles[x][y]);
-                }
+		starpu_data_unregister(data_handles[i]);
         }
         }
+
+	starpu_mpi_comm_amounts_retrieve(comm_amount);
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
+}
 
 
-#if 0
-        for(x = 0; x < X; x++)
-	{
-                FPRINTF(stdout, "[%d] ", rank);
-                for (y = 0; y < Y; y++)
-		{
-                        FPRINTF(stdout, "%3u ", matrix[x][y]);
-                }
-                FPRINTF(stdout, "\n");
-        }
-#endif
+int main(int argc, char **argv)
+{
+	int dst, rank, size;
+	int result=0;
+	size_t *comm_amount_with_cache;
+	size_t *comm_amount_without_cache;
+	char *string;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	string = malloc(50);
+	sprintf(string, "STARPU_COMM_STATS=1");
+	putenv(string);
+
+	comm_amount_with_cache = malloc(size * sizeof(size_t));
+	comm_amount_without_cache = malloc(size * sizeof(size_t));
 
 
-	return 0;
+	test_cache(rank, size, 0, comm_amount_with_cache);
+	test_cache(rank, size, 1, comm_amount_without_cache);
+
+	if (rank == 0 || rank == 1)
+	{
+		dst = (rank == 0) ? 1 : 0;
+		result = (comm_amount_with_cache[dst] == comm_amount_without_cache[dst] * 5);
+		fprintf(stderr, "Communication cache mechanism is %sworking\n", result?"":"NOT ");
+	}
+	else
+		result = 1;
+
+	MPI_Finalize();
+	return !result;
 }
 }
+#endif

+ 7 - 10
mpi/tests/insert_task_owner.c

@@ -79,16 +79,10 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize_extended(&rank, &size);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
-
-        if (size != 2)
-	{
-		if (rank == 0) FPRINTF(stderr, "We need exactly 2 processes.\n");
-                starpu_mpi_shutdown();
-                starpu_shutdown();
-                return STARPU_TEST_SKIPPED;
-        }
+	ret = starpu_mpi_init(&argc, &argv);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
         if (rank == 0)
         if (rank == 0)
 	{
 	{
@@ -109,6 +103,8 @@ int main(int argc, char **argv)
 		starpu_data_set_tag(data_handlesx0, 0);
 		starpu_data_set_tag(data_handlesx0, 0);
         }
         }
 
 
+	if (rank != 0 && rank != 1) goto end;
+
 	node = starpu_data_get_rank(data_handlesx1);
 	node = starpu_data_get_rank(data_handlesx1);
         err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_w,
         err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_w,
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_VALUE, &node, sizeof(node),
@@ -170,6 +166,7 @@ int main(int argc, char **argv)
 				     0);
 				     0);
         assert(err == 0);
         assert(err == 0);
 
 
+end:
 	fprintf(stderr, "Waiting ...\n");
 	fprintf(stderr, "Waiting ...\n");
         starpu_task_wait_for_all();
         starpu_task_wait_for_all();
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();

+ 4 - 2
mpi/tests/insert_task_owner2.c

@@ -55,8 +55,10 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize_extended(&rank, &size);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+	ret = starpu_mpi_init(&argc, &argv);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
         if (rank == 0)
         if (rank == 0)
 	{
 	{

+ 4 - 2
mpi/tests/insert_task_owner_data.c

@@ -45,8 +45,10 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize_extended(&rank, &size);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+	ret = starpu_mpi_init(&argc, &argv);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
         if (rank == 0)
         if (rank == 0)
 	{
 	{

+ 7 - 8
mpi/tests/mpi_detached_tag.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -32,10 +32,10 @@ int main(int argc, char **argv)
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
-	if (size != 2)
+	if (size%2 != 0)
 	{
 	{
 		if (rank == 0)
 		if (rank == 0)
-			FPRINTF(stderr, "We need exactly 2 processes.\n");
+			FPRINTF(stderr, "We need a even number of processes.\n");
 
 
 		MPI_Finalize();
 		MPI_Finalize();
 		return STARPU_TEST_SKIPPED;
 		return STARPU_TEST_SKIPPED;
@@ -43,8 +43,8 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize();
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+	ret = starpu_mpi_init(NULL, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	tab = malloc(SIZE*sizeof(float));
 	tab = malloc(SIZE*sizeof(float));
 
 
@@ -52,14 +52,13 @@ int main(int argc, char **argv)
 
 
 	unsigned nloops = NITER;
 	unsigned nloops = NITER;
 	unsigned loop;
 	unsigned loop;
-
-	int other_rank = (rank + 1)%2;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 
 	for (loop = 0; loop < nloops; loop++)
 	for (loop = 0; loop < nloops; loop++)
 	{
 	{
 		starpu_tag_t tag = (starpu_tag_t)loop;
 		starpu_tag_t tag = (starpu_tag_t)loop;
 
 
-		if ((loop % 2) == rank)
+		if ((loop % 2) == (rank%2))
 		{
 		{
 			starpu_mpi_isend_detached_unlock_tag(tab_handle, other_rank, loop, MPI_COMM_WORLD, tag);
 			starpu_mpi_isend_detached_unlock_tag(tab_handle, other_rank, loop, MPI_COMM_WORLD, tag);
 		}
 		}

+ 7 - 8
mpi/tests/mpi_irecv.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -32,10 +32,10 @@ int main(int argc, char **argv)
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
-	if (size != 2)
+	if (size%2 != 0)
 	{
 	{
 		if (rank == 0)
 		if (rank == 0)
-			FPRINTF(stderr, "We need exactly 2 processes.\n");
+			FPRINTF(stderr, "We need a even number of processes.\n");
 
 
 		MPI_Finalize();
 		MPI_Finalize();
 		return STARPU_TEST_SKIPPED;
 		return STARPU_TEST_SKIPPED;
@@ -43,8 +43,8 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize();
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+	ret = starpu_mpi_init(NULL, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	tab = malloc(SIZE*sizeof(float));
 	tab = malloc(SIZE*sizeof(float));
 
 
@@ -52,12 +52,11 @@ int main(int argc, char **argv)
 
 
 	unsigned nloops = NITER;
 	unsigned nloops = NITER;
 	unsigned loop;
 	unsigned loop;
-
-	int other_rank = (rank + 1)%2;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 
 	for (loop = 0; loop < nloops; loop++)
 	for (loop = 0; loop < nloops; loop++)
 	{
 	{
-		if ((loop % 2) == rank)
+		if ((loop % 2) == (rank%2))
 		{
 		{
 			starpu_mpi_send(tab_handle, other_rank, loop, MPI_COMM_WORLD);
 			starpu_mpi_send(tab_handle, other_rank, loop, MPI_COMM_WORLD);
 		}
 		}

+ 7 - 8
mpi/tests/mpi_irecv_detached.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -47,10 +47,10 @@ int main(int argc, char **argv)
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
-	if (size != 2)
+	if (size%2 != 0)
 	{
 	{
 		if (rank == 0)
 		if (rank == 0)
-			FPRINTF(stderr, "We need exactly 2 processes.\n");
+			FPRINTF(stderr, "We need a even number of processes.\n");
 
 
 		MPI_Finalize();
 		MPI_Finalize();
 		return STARPU_TEST_SKIPPED;
 		return STARPU_TEST_SKIPPED;
@@ -58,8 +58,8 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize();
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+	ret = starpu_mpi_init(NULL, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	tab = malloc(SIZE*sizeof(float));
 	tab = malloc(SIZE*sizeof(float));
 
 
@@ -67,12 +67,11 @@ int main(int argc, char **argv)
 
 
 	unsigned nloops = NITER;
 	unsigned nloops = NITER;
 	unsigned loop;
 	unsigned loop;
-
-	int other_rank = (rank + 1)%2;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 
 	for (loop = 0; loop < nloops; loop++)
 	for (loop = 0; loop < nloops; loop++)
 	{
 	{
-		if (rank == 0)
+		if ((loop % 2) == (rank%2))
 		{
 		{
 			starpu_mpi_send(tab_handle, other_rank, loop, MPI_COMM_WORLD);
 			starpu_mpi_send(tab_handle, other_rank, loop, MPI_COMM_WORLD);
 		}
 		}

+ 7 - 8
mpi/tests/mpi_isend.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -32,10 +32,10 @@ int main(int argc, char **argv)
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
-	if (size != 2)
+	if (size%2 != 0)
 	{
 	{
 		if (rank == 0)
 		if (rank == 0)
-			FPRINTF(stderr, "We need exactly 2 processes.\n");
+			FPRINTF(stderr, "We need a even number of processes.\n");
 
 
 		MPI_Finalize();
 		MPI_Finalize();
 		return STARPU_TEST_SKIPPED;
 		return STARPU_TEST_SKIPPED;
@@ -43,8 +43,8 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize();
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+	ret = starpu_mpi_init(NULL, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	tab = malloc(SIZE*sizeof(float));
 	tab = malloc(SIZE*sizeof(float));
 
 
@@ -52,12 +52,11 @@ int main(int argc, char **argv)
 
 
 	unsigned nloops = NITER;
 	unsigned nloops = NITER;
 	unsigned loop;
 	unsigned loop;
-
-	int other_rank = (rank + 1)%2;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 
 	for (loop = 0; loop < nloops; loop++)
 	for (loop = 0; loop < nloops; loop++)
 	{
 	{
-		if ((loop % 2) == rank)
+		if ((loop % 2) == (rank%2))
 		{
 		{
 			MPI_Status status;
 			MPI_Status status;
 			starpu_mpi_req req;
 			starpu_mpi_req req;

+ 7 - 8
mpi/tests/mpi_isend_detached.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -47,10 +47,10 @@ int main(int argc, char **argv)
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
-	if (size != 2)
+	if (size%2 != 0)
 	{
 	{
 		if (rank == 0)
 		if (rank == 0)
-			FPRINTF(stderr, "We need exactly 2 processes.\n");
+			FPRINTF(stderr, "We need a even number of processes.\n");
 
 
 		MPI_Finalize();
 		MPI_Finalize();
 		return STARPU_TEST_SKIPPED;
 		return STARPU_TEST_SKIPPED;
@@ -58,8 +58,8 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize();
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+	ret = starpu_mpi_init(NULL, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	tab = malloc(SIZE*sizeof(float));
 	tab = malloc(SIZE*sizeof(float));
 
 
@@ -67,12 +67,11 @@ int main(int argc, char **argv)
 
 
 	unsigned nloops = NITER;
 	unsigned nloops = NITER;
 	unsigned loop;
 	unsigned loop;
-
-	int other_rank = (rank + 1)%2;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 
 	for (loop = 0; loop < nloops; loop++)
 	for (loop = 0; loop < nloops; loop++)
 	{
 	{
-		if (rank == 0)
+		if ((loop % 2) == (rank%2))
 		{
 		{
 			int sent = 0;
 			int sent = 0;
 			starpu_mpi_isend_detached(tab_handle, other_rank, loop, MPI_COMM_WORLD, callback, &sent);
 			starpu_mpi_isend_detached(tab_handle, other_rank, loop, MPI_COMM_WORLD, callback, &sent);

+ 30 - 11
mpi/examples/reduction/mpi_reduction.c

@@ -20,6 +20,7 @@
 extern void init_cpu_func(void *descr[], void *cl_arg);
 extern void init_cpu_func(void *descr[], void *cl_arg);
 extern void redux_cpu_func(void *descr[], void *cl_arg);
 extern void redux_cpu_func(void *descr[], void *cl_arg);
 extern void dot_cpu_func(void *descr[], void *cl_arg);
 extern void dot_cpu_func(void *descr[], void *cl_arg);
+extern void display_cpu_func(void *descr[], void *cl_arg);
 
 
 static struct starpu_codelet init_codelet =
 static struct starpu_codelet init_codelet =
 {
 {
@@ -46,6 +47,15 @@ static struct starpu_codelet dot_codelet =
 	.name = "dot_codelet"
 	.name = "dot_codelet"
 };
 };
 
 
+static struct starpu_codelet display_codelet =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {display_cpu_func, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_R},
+	.name = "display_codelet"
+};
+
 /* Returns the MPI node number where data indexes index is */
 /* Returns the MPI node number where data indexes index is */
 int my_distrib(int x, int nb_nodes)
 int my_distrib(int x, int nb_nodes)
 {
 {
@@ -54,20 +64,24 @@ int my_distrib(int x, int nb_nodes)
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-        int my_rank, size, x, y;
+	int my_rank, size, x, y, i;
         long int *vector;
         long int *vector;
 	long int dot, sum=0;
 	long int dot, sum=0;
         starpu_data_handle_t *handles;
         starpu_data_handle_t *handles;
 	starpu_data_handle_t dot_handle;
 	starpu_data_handle_t dot_handle;
 
 
-	int nb_elements, step;
+	int nb_elements, step, loops;
 
 
 	int ret = starpu_init(NULL);
 	int ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	starpu_mpi_initialize_extended(&my_rank, &size);
+	ret = starpu_mpi_init(&argc, &argv);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
 	nb_elements = size*8000;
 	nb_elements = size*8000;
 	step = 4;
 	step = 4;
+	loops = 5;
 
 
 	vector = (long int *) malloc(nb_elements*sizeof(vector[0]));
 	vector = (long int *) malloc(nb_elements*sizeof(vector[0]));
         for(x = 0; x < nb_elements; x+=step)
         for(x = 0; x < nb_elements; x+=step)
@@ -84,7 +98,8 @@ int main(int argc, char **argv)
 	if (my_rank == 0) {
 	if (my_rank == 0) {
 		dot = 14;
 		dot = 14;
 		sum = (nb_elements * (nb_elements + 1)) / 2;
 		sum = (nb_elements * (nb_elements + 1)) / 2;
-		sum+= dot;
+		sum *= loops;
+		sum += dot;
 		starpu_variable_data_register(&dot_handle, 0, (uintptr_t)&dot, sizeof(dot));
 		starpu_variable_data_register(&dot_handle, 0, (uintptr_t)&dot, sizeof(dot));
 	}
 	}
 	else
 	else
@@ -117,15 +132,19 @@ int main(int argc, char **argv)
 	starpu_data_set_tag(dot_handle, nb_elements+1);
 	starpu_data_set_tag(dot_handle, nb_elements+1);
 	starpu_data_set_reduction_methods(dot_handle, &redux_codelet, &init_codelet);
 	starpu_data_set_reduction_methods(dot_handle, &redux_codelet, &init_codelet);
 
 
-	for (x = 0; x < nb_elements; x+=step)
+	for (i = 0; i < loops; i++)
 	{
 	{
-		starpu_mpi_insert_task(MPI_COMM_WORLD,
-				       &dot_codelet,
-				       STARPU_R, handles[x],
-				       STARPU_REDUX, dot_handle,
-				       0);
+		for (x = 0; x < nb_elements; x+=step)
+		{
+			starpu_mpi_insert_task(MPI_COMM_WORLD,
+					       &dot_codelet,
+					       STARPU_R, handles[x],
+					       STARPU_REDUX, dot_handle,
+					       0);
+		}
+		starpu_mpi_redux_data(MPI_COMM_WORLD, dot_handle);
+		starpu_mpi_insert_task(MPI_COMM_WORLD, &display_codelet, STARPU_R, dot_handle, 0);
 	}
 	}
-	starpu_mpi_redux_data(MPI_COMM_WORLD, dot_handle);
 
 
         fprintf(stderr, "Waiting ...\n");
         fprintf(stderr, "Waiting ...\n");
         starpu_task_wait_for_all();
         starpu_task_wait_for_all();

+ 10 - 0
mpi/examples/reduction/mpi_reduction_kernels.c

@@ -64,3 +64,13 @@ void dot_cpu_func(void *descr[], void *cl_arg)
 //	_DISPLAY("After dot=%ld\n", *dot);
 //	_DISPLAY("After dot=%ld\n", *dot);
 }
 }
 
 
+/*
+ *	Display codelet
+ */
+void display_cpu_func(void *descr[], void *cl_arg)
+{
+	long int *local_x = (long int *)STARPU_VECTOR_GET_PTR(descr[0]);
+
+	_DISPLAY("Local=%ld\n", *local_x);
+}
+

+ 18 - 3
mpi/examples/scatter_gather/mpi_scatter_gather.c

@@ -54,6 +54,18 @@ static struct starpu_codelet cl =
 	.modes = {STARPU_RW},
 	.modes = {STARPU_RW},
 };
 };
 
 
+void scallback(void *arg __attribute__((unused)))
+{
+	char *msg = arg;
+	fprintf(stderr, "Sending completed for <%s>\n", msg);
+}
+
+void rcallback(void *arg __attribute__((unused)))
+{
+	char *msg = arg;
+	fprintf(stderr, "Reception completed for <%s>\n", msg);
+}
+
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
         int rank, nodes;
         int rank, nodes;
@@ -69,7 +81,10 @@ int main(int argc, char **argv)
 
 
 	int ret = starpu_init(NULL);
 	int ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	starpu_mpi_initialize_extended(&rank, &nodes);
+	ret = starpu_mpi_init(&argc, &argv);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
 
 
 	if (rank == 0)
 	if (rank == 0)
 	{
 	{
@@ -152,7 +167,7 @@ int main(int argc, char **argv)
         }
         }
 
 
 	/* Scatter the matrix among the nodes */
 	/* Scatter the matrix among the nodes */
-	starpu_mpi_scatter_detached(data_handles, nblocks*nblocks, 0, MPI_COMM_WORLD);
+	starpu_mpi_scatter_detached(data_handles, nblocks*nblocks, 0, MPI_COMM_WORLD, scallback, "scatter", NULL, NULL);
 
 
 	/* Calculation */
 	/* Calculation */
 	for(x = 0; x < nblocks*nblocks ;  x++)
 	for(x = 0; x < nblocks*nblocks ;  x++)
@@ -172,7 +187,7 @@ int main(int argc, char **argv)
 	}
 	}
 
 
 	/* Gather the matrix on main node */
 	/* Gather the matrix on main node */
-	starpu_mpi_gather_detached(data_handles, nblocks*nblocks, 0, MPI_COMM_WORLD);
+	starpu_mpi_gather_detached(data_handles, nblocks*nblocks, 0, MPI_COMM_WORLD, scallback, "gather", rcallback, "gather");
 
 
 	/* Unregister matrix from StarPU */
 	/* Unregister matrix from StarPU */
 	for(x=0 ; x<nblocks*nblocks ; x++)
 	for(x=0 ; x<nblocks*nblocks ; x++)

+ 7 - 8
mpi/tests/mpi_test.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -32,10 +32,10 @@ int main(int argc, char **argv)
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
-	if (size != 2)
+	if (size%2 != 0)
 	{
 	{
 		if (rank == 0)
 		if (rank == 0)
-			FPRINTF(stderr, "We need exactly 2 processes.\n");
+			FPRINTF(stderr, "We need a even number of processes.\n");
 
 
 		MPI_Finalize();
 		MPI_Finalize();
 		return STARPU_TEST_SKIPPED;
 		return STARPU_TEST_SKIPPED;
@@ -43,8 +43,8 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize();
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+	ret = starpu_mpi_init(NULL, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	tab = malloc(SIZE*sizeof(float));
 	tab = malloc(SIZE*sizeof(float));
 
 
@@ -52,14 +52,13 @@ int main(int argc, char **argv)
 
 
 	unsigned nloops = NITER;
 	unsigned nloops = NITER;
 	unsigned loop;
 	unsigned loop;
-
-	int other_rank = (rank + 1)%2;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 
 	for (loop = 0; loop < nloops; loop++)
 	for (loop = 0; loop < nloops; loop++)
 	{
 	{
 		starpu_mpi_req req;
 		starpu_mpi_req req;
 
 
-		if ((loop % 2) == rank)
+		if ((loop % 2) == (rank%2))
 		{
 		{
                         starpu_mpi_isend(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
                         starpu_mpi_isend(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
 		}
 		}

+ 5 - 3
mpi/tests/multiple_send.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -30,8 +30,10 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize_extended(&rank, &size);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+	ret = starpu_mpi_init(&argc, &argv);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
 	if (size < 2)
 	if (size < 2)
 	{
 	{

+ 9 - 7
mpi/tests/pingpong.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -32,10 +32,10 @@ int main(int argc, char **argv)
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
-	if (size != 2)
+	if (size%2 != 0)
 	{
 	{
 		if (rank == 0)
 		if (rank == 0)
-			FPRINTF(stderr, "We need exactly 2 processes.\n");
+			FPRINTF(stderr, "We need a even number of processes.\n");
 
 
 		MPI_Finalize();
 		MPI_Finalize();
 		return STARPU_TEST_SKIPPED;
 		return STARPU_TEST_SKIPPED;
@@ -43,8 +43,8 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize();
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+	ret = starpu_mpi_init(NULL, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	tab = malloc(SIZE*sizeof(float));
 	tab = malloc(SIZE*sizeof(float));
 
 
@@ -52,17 +52,19 @@ int main(int argc, char **argv)
 
 
 	unsigned nloops = NITER;
 	unsigned nloops = NITER;
 	unsigned loop;
 	unsigned loop;
-	int other_rank = (rank + 1)%2;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 
 	for (loop = 0; loop < nloops; loop++)
 	for (loop = 0; loop < nloops; loop++)
 	{
 	{
-		if ((loop % 2) == rank)
+		if ((loop % 2) == (rank%2))
 		{
 		{
+			//FPRINTF_MPI("Sending to %d\n", other_rank);
 			starpu_mpi_send(tab_handle, other_rank, loop, MPI_COMM_WORLD);
 			starpu_mpi_send(tab_handle, other_rank, loop, MPI_COMM_WORLD);
 		}
 		}
 		else
 		else
 		{
 		{
 			MPI_Status status;
 			MPI_Status status;
+			//FPRINTF_MPI("Receiving from %d\n", other_rank);
 			starpu_mpi_recv(tab_handle, other_rank, loop, MPI_COMM_WORLD, &status);
 			starpu_mpi_recv(tab_handle, other_rank, loop, MPI_COMM_WORLD, &status);
 		}
 		}
 	}
 	}

+ 7 - 3
mpi/tests/ring.c

@@ -18,7 +18,11 @@
 #include <starpu_mpi.h>
 #include <starpu_mpi.h>
 #include "helper.h"
 #include "helper.h"
 
 
-#define NITER	2048
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	32
+#else
+#  define NITER	2048
+#endif
 
 
 unsigned token = 42;
 unsigned token = 42;
 starpu_data_handle_t token_handle;
 starpu_data_handle_t token_handle;
@@ -75,8 +79,8 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize();
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+	ret = starpu_mpi_init(NULL, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
 	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
 
 

+ 7 - 3
mpi/tests/ring_async.c

@@ -18,7 +18,11 @@
 #include <starpu_mpi.h>
 #include <starpu_mpi.h>
 #include "helper.h"
 #include "helper.h"
 
 
-#define NITER	2048
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	32
+#else
+#  define NITER	2048
+#endif
 
 
 unsigned token = 42;
 unsigned token = 42;
 starpu_data_handle_t token_handle;
 starpu_data_handle_t token_handle;
@@ -75,8 +79,8 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize();
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+	ret = starpu_mpi_init(NULL, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
 	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
 
 

+ 9 - 11
mpi/tests/ring_async_implicit.c

@@ -18,7 +18,11 @@
 #include <starpu_mpi.h>
 #include <starpu_mpi.h>
 #include "helper.h"
 #include "helper.h"
 
 
-#define NITER	2048
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	32
+#else
+#  define NITER	2048
+#endif
 
 
 unsigned token = 42;
 unsigned token = 42;
 starpu_data_handle_t token_handle;
 starpu_data_handle_t token_handle;
@@ -59,16 +63,12 @@ int main(int argc, char **argv)
 {
 {
 	int ret, rank, size;
 	int ret, rank, size;
 
 
-#if 0
-	MPI_Init(NULL, NULL);
-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-	MPI_Comm_size(MPI_COMM_WORLD, &size);
-#endif
-
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize_extended(&rank, &size);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+	ret = starpu_mpi_init(NULL, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
 	if (size < 2)
 	if (size < 2)
 	{
 	{
@@ -121,8 +121,6 @@ int main(int argc, char **argv)
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
 
 
-        //MPI_Finalize();
-
 	if (rank == last_rank)
 	if (rank == last_rank)
 	{
 	{
                 FPRINTF(stderr, "[%d] token = %u == %u * %d ?\n", rank, token, nloops, size);
                 FPRINTF(stderr, "[%d] token = %u == %u * %d ?\n", rank, token, nloops, size);

+ 1 - 0
src/Makefile.am

@@ -198,6 +198,7 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 						\
 	util/execute_on_all.c					\
 	util/execute_on_all.c					\
 	util/starpu_create_sync_task.c				\
 	util/starpu_create_sync_task.c				\
 	util/file.c						\
 	util/file.c						\
+	util/misc.c						\
 	util/starpu_data_cpy.c					\
 	util/starpu_data_cpy.c					\
 	util/starpu_insert_task.c				\
 	util/starpu_insert_task.c				\
 	util/starpu_insert_task_utils.c				\
 	util/starpu_insert_task_utils.c				\

+ 4 - 3
src/common/fxt.h

@@ -29,6 +29,7 @@
 #include <sys/types.h>
 #include <sys/types.h>
 #include <stdlib.h>
 #include <stdlib.h>
 #include <common/config.h>
 #include <common/config.h>
+#include <common/utils.h>
 #include <starpu.h>
 #include <starpu.h>
 
 
 /* some key to identify the worker kind */
 /* some key to identify the worker kind */
@@ -201,7 +202,7 @@ do {									\
 
 
 #define _STARPU_TRACE_START_CODELET_BODY(job)				\
 #define _STARPU_TRACE_START_CODELET_BODY(job)				\
 do {									\
 do {									\
-        const char *model_name = _starpu_get_model_name((job));         \
+        const char *model_name = _starpu_job_get_model_name((job));         \
 	if (model_name)                                                 \
 	if (model_name)                                                 \
 	{								\
 	{								\
 		/* we include the symbol name */			\
 		/* we include the symbol name */			\
@@ -261,7 +262,7 @@ do {									\
 #define _STARPU_TRACE_TASK_DONE(job)						\
 #define _STARPU_TRACE_TASK_DONE(job)						\
 do {										\
 do {										\
 	unsigned exclude_from_dag = (job)->exclude_from_dag;			\
 	unsigned exclude_from_dag = (job)->exclude_from_dag;			\
-        const char *model_name = _starpu_get_model_name((job));                       \
+        const char *model_name = _starpu_job_get_model_name((job));                       \
 	if (model_name)					                        \
 	if (model_name)					                        \
 	{									\
 	{									\
 		_STARPU_FUT_DO_PROBE4STR(_STARPU_FUT_TASK_DONE, (job)->job_id, _starpu_gettid(), (long unsigned)exclude_from_dag, 1, model_name);\
 		_STARPU_FUT_DO_PROBE4STR(_STARPU_FUT_TASK_DONE, (job)->job_id, _starpu_gettid(), (long unsigned)exclude_from_dag, 1, model_name);\
@@ -274,7 +275,7 @@ do {										\
 #define _STARPU_TRACE_TAG_DONE(tag)						\
 #define _STARPU_TRACE_TAG_DONE(tag)						\
 do {										\
 do {										\
         struct _starpu_job *job = (tag)->job;                                  \
         struct _starpu_job *job = (tag)->job;                                  \
-        const char *model_name = _starpu_get_model_name((job));                       \
+        const char *model_name = _starpu_job_get_model_name((job));                       \
 	if (model_name)                                                         \
 	if (model_name)                                                         \
 	{									\
 	{									\
           _STARPU_FUT_DO_PROBE3STR(_STARPU_FUT_TAG_DONE, (tag)->id, _starpu_gettid(), 1, model_name); \
           _STARPU_FUT_DO_PROBE3STR(_STARPU_FUT_TAG_DONE, (tag)->id, _starpu_gettid(), 1, model_name); \

+ 192 - 27
src/common/utils.h

@@ -43,7 +43,7 @@
 #  define _STARPU_LOG_OUT_TAG(outtag)
 #  define _STARPU_LOG_OUT_TAG(outtag)
 #endif
 #endif
 
 
-#define _STARPU_DISP(fmt, args ...) fprintf(stderr, "[starpu][%s] " fmt ,__func__ ,##args)
+#define _STARPU_DISP(fmt, args ...) do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, "[starpu][%s] " fmt ,__func__ ,##args); }} while(0)
 #define _STARPU_ERROR(fmt, args ...)                                                  \
 #define _STARPU_ERROR(fmt, args ...)                                                  \
 	do {                                                                          \
 	do {                                                                          \
                 fprintf(stderr, "[starpu][%s] Error: " fmt ,__func__ ,##args);        \
                 fprintf(stderr, "[starpu][%s] Error: " fmt ,__func__ ,##args);        \
@@ -61,6 +61,14 @@ char *_starpu_get_home_path(void);
 /* If FILE is currently on a comment line, eat it.  */
 /* If FILE is currently on a comment line, eat it.  */
 void _starpu_drop_comments(FILE *f);
 void _starpu_drop_comments(FILE *f);
 
 
+struct _starpu_job;
+/* Returns the symbol associated to that job if any. */
+const char *_starpu_job_get_model_name(struct _starpu_job *j);
+
+struct starpu_codelet;
+/* Returns the symbol associated to that job if any. */
+const char *_starpu_codelet_get_model_name(struct starpu_codelet *cl);
+
 #define _STARPU_PTHREAD_CREATE(thread, attr, routine, arg) do {                \
 #define _STARPU_PTHREAD_CREATE(thread, attr, routine, arg) do {                \
 	int p_ret = pthread_create((thread), (attr), (routine), (arg));	       \
 	int p_ret = pthread_create((thread), (attr), (routine), (arg));	       \
 	if (STARPU_UNLIKELY(p_ret != 0)) {                                     \
 	if (STARPU_UNLIKELY(p_ret != 0)) {                                     \
@@ -70,7 +78,10 @@ void _starpu_drop_comments(FILE *f);
 	}                                                                      \
 	}                                                                      \
 } while (0)
 } while (0)
 
 
-#define _STARPU_PTHREAD_MUTEX_INIT(mutex, attr) {                              \
+/*
+ * Encapsulation of the pthread_mutex_* functions.
+ */
+#define _STARPU_PTHREAD_MUTEX_INIT(mutex, attr) do {                           \
 	int p_ret = pthread_mutex_init((mutex), (attr));                       \
 	int p_ret = pthread_mutex_init((mutex), (attr));                       \
 	if (STARPU_UNLIKELY(p_ret)) {                                          \
 	if (STARPU_UNLIKELY(p_ret)) {                                          \
 		fprintf(stderr,                                                \
 		fprintf(stderr,                                                \
@@ -78,8 +89,9 @@ void _starpu_drop_comments(FILE *f);
 			__FILE__, __LINE__, strerror(p_ret));                  \
 			__FILE__, __LINE__, strerror(p_ret));                  \
 		STARPU_ABORT();                                                \
 		STARPU_ABORT();                                                \
 	}                                                                      \
 	}                                                                      \
-}
-#define _STARPU_PTHREAD_MUTEX_DESTROY(mutex) {                                 \
+} while (0)
+
+#define _STARPU_PTHREAD_MUTEX_DESTROY(mutex) do {                              \
 	int p_ret = pthread_mutex_destroy(mutex);                              \
 	int p_ret = pthread_mutex_destroy(mutex);                              \
 	if (STARPU_UNLIKELY(p_ret)) {                                          \
 	if (STARPU_UNLIKELY(p_ret)) {                                          \
 		fprintf(stderr,                                                \
 		fprintf(stderr,                                                \
@@ -87,45 +99,198 @@ void _starpu_drop_comments(FILE *f);
 			__FILE__, __LINE__, strerror(p_ret));                  \
 			__FILE__, __LINE__, strerror(p_ret));                  \
 		STARPU_ABORT();                                                \
 		STARPU_ABORT();                                                \
 	}                                                                      \
 	}                                                                      \
-}
-#define _STARPU_PTHREAD_MUTEX_LOCK(mutex) {                                    \
+} while(0)
+
+#define _STARPU_PTHREAD_MUTEX_LOCK(mutex) do {                                 \
 	int p_ret = pthread_mutex_lock(mutex);                                 \
 	int p_ret = pthread_mutex_lock(mutex);                                 \
 	if (STARPU_UNLIKELY(p_ret)) {                                          \
 	if (STARPU_UNLIKELY(p_ret)) {                                          \
 		fprintf(stderr,                                                \
 		fprintf(stderr,                                                \
-			"%s:%d pthread_mutex_lock : %s\n",                     \
+			"%s:%d pthread_mutex_lock: %s\n",                      \
 			__FILE__, __LINE__, strerror(p_ret));                  \
 			__FILE__, __LINE__, strerror(p_ret));                  \
 		STARPU_ABORT();                                                \
 		STARPU_ABORT();                                                \
 	}                                                                      \
 	}                                                                      \
-}
+} while (0)
 
 
-#define _STARPU_PTHREAD_MUTEX_UNLOCK(mutex) {                                  \
+#define _STARPU_PTHREAD_MUTEX_UNLOCK(mutex) do {                               \
 	int p_ret = pthread_mutex_unlock(mutex);                               \
 	int p_ret = pthread_mutex_unlock(mutex);                               \
 	if (STARPU_UNLIKELY(p_ret)) {                                          \
 	if (STARPU_UNLIKELY(p_ret)) {                                          \
 		fprintf(stderr,                                                \
 		fprintf(stderr,                                                \
-			"%s:%d pthread_mutex_unlock : %s\n",                   \
+			"%s:%d pthread_mutex_unlock: %s\n",                    \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
+
+/*
+ * Encapsulation of the pthread_rwlock_* functions.
+ */
+#define _STARPU_PTHREAD_RWLOCK_INIT(rwlock, attr) do {                         \
+	int p_ret = pthread_rwlock_init((rwlock), (attr));                     \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_rwlock_init: %s\n",                     \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
+
+#define _STARPU_PTHREAD_RWLOCK_RDLOCK(rwlock) do {                             \
+	int p_ret = pthread_rwlock_rdlock(rwlock);                             \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_rwlock_rdlock: %s\n",                   \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
+
+#define _STARPU_PTHREAD_RWLOCK_WRLOCK(rwlock) do {                             \
+	int p_ret = pthread_rwlock_wrlock(rwlock);                             \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_rwlock_wrlock: %s\n",                   \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
+
+#define _STARPU_PTHREAD_RWLOCK_UNLOCK(rwlock) do {                             \
+	int p_ret = pthread_rwlock_unlock(rwlock);                             \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_rwlock_unlock: %s\n",                   \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
+
+#define _STARPU_PTHREAD_RWLOCK_DESTROY(rwlock) do {                            \
+	int p_ret = pthread_rwlock_destroy(rwlock);                            \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_rwlock_destroy: %s\n",                  \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
+
+/*
+ * Encapsulation of the pthread_cond_* functions.
+ */
+#define _STARPU_PTHREAD_COND_INIT(cond, attr) do {                             \
+	int p_ret = pthread_cond_init((cond), (attr));                         \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_cond_init: %s\n",                       \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
+
+#define _STARPU_PTHREAD_COND_DESTROY(cond) do {                                \
+	int p_ret = pthread_cond_destroy(cond);                                \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_cond_destroy: %s\n",                    \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+			STARPU_ABORT();                                        \
+	}                                                                      \
+} while (0)
+
+#define _STARPU_PTHREAD_COND_SIGNAL(cond) do {                                 \
+	int p_ret = pthread_cond_signal(cond);                                 \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_cond_signal: %s\n",                     \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
+
+#define _STARPU_PTHREAD_COND_BROADCAST(cond) do {                              \
+	int p_ret = pthread_cond_broadcast(cond);                              \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_cond_broadcast: %s\n",                  \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
+
+#define _STARPU_PTHREAD_COND_WAIT(cond, mutex) do {                            \
+	int p_ret = pthread_cond_wait((cond), (mutex));                        \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_cond_wait: %s\n",                       \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
+
+/*
+ * Encapsulation of the pthread_barrier_* functions.
+ */
+#define _STARPU_PTHREAD_BARRIER_INIT(barrier, attr, count) do {                \
+	int p_ret = pthread_barrier_init((barrier), (attr), (count));          \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_barrier_init: %s\n",                    \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
+
+#define _STARPU_PTHREAD_BARRIER_DESTROY(barrier) do {                          \
+	int p_ret = pthread_barrier_destroy((barrier));                        \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_barrier_destroy: %s\n",                 \
 			__FILE__, __LINE__, strerror(p_ret));                  \
 			__FILE__, __LINE__, strerror(p_ret));                  \
 		STARPU_ABORT();                                                \
 		STARPU_ABORT();                                                \
 	}                                                                      \
 	}                                                                      \
-}
+} while (0)
 
 
-#define _STARPU_PTHREAD_RWLOCK_INIT(rwlock, attr) { int p_ret = pthread_rwlock_init((rwlock), (attr)); if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_rwlock_init : %s\n", strerror(p_ret)); STARPU_ABORT();}}
-#define _STARPU_PTHREAD_RWLOCK_RDLOCK(rwlock) { int p_ret = pthread_rwlock_rdlock(rwlock); if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_rwlock_rdlock : %s\n", strerror(p_ret)); STARPU_ABORT();}}
-#define _STARPU_PTHREAD_RWLOCK_WRLOCK(rwlock) { int p_ret = pthread_rwlock_wrlock(rwlock); if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_rwlock_wrlock : %s\n", strerror(p_ret)); STARPU_ABORT();}}
-#define _STARPU_PTHREAD_RWLOCK_UNLOCK(rwlock) { int p_ret = pthread_rwlock_unlock(rwlock); if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_rwlock_unlock : %s\n", strerror(p_ret)); STARPU_ABORT();}}
-#define _STARPU_PTHREAD_RWLOCK_DESTROY(rwlock) { int p_ret = pthread_rwlock_destroy(rwlock); if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_rwlock_destroy : %s\n", strerror(p_ret)); STARPU_ABORT();}}
+#define _STARPU_PTHREAD_BARRIER_WAIT(barrier) do {                             \
+	int p_ret = pthread_barrier_wait(barrier);                             \
+	if (STARPU_UNLIKELY(!((p_ret == 0) || (p_ret == PTHREAD_BARRIER_SERIAL_THREAD)))) { \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_barrier_wait: %s\n",                    \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+			STARPU_ABORT();                                        \
+	}                                                                      \
+} while (0)
 
 
-#define _STARPU_PTHREAD_COND_INIT(cond, attr) { int p_ret = pthread_cond_init((cond), (attr)); if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_cond_init : %s\n", strerror(p_ret)); STARPU_ABORT();}}
-#define _STARPU_PTHREAD_COND_DESTROY(cond) { int p_ret = pthread_cond_destroy(cond); if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_cond_destroy : %s\n", strerror(p_ret)); STARPU_ABORT();}}
-#define _STARPU_PTHREAD_COND_SIGNAL(cond) { int p_ret = pthread_cond_signal(cond); if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_cond_signal : %s\n", strerror(p_ret)); STARPU_ABORT();}}
-#define _STARPU_PTHREAD_COND_BROADCAST(cond) { int p_ret = pthread_cond_broadcast(cond); if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_cond_broadcast : %s\n", strerror(p_ret)); STARPU_ABORT();}}
-#define _STARPU_PTHREAD_COND_WAIT(cond, mutex) { int p_ret = pthread_cond_wait((cond), (mutex)); if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_cond_wait : %s\n", strerror(p_ret)); STARPU_ABORT();}}
+/*
+ * Encapsulation of the pthread_spin_* functions.
+ */
+#define _STARPU_PTHREAD_SPIN_DESTROY(lock) do {                                \
+	int p_ret = pthread_spin_destroy(lock);                                \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_spin_destroy: %s\n",                    \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
 
 
-#define _STARPU_PTHREAD_BARRIER_INIT(barrier, attr, count) { int p_ret = pthread_barrier_init((barrier), (attr), (count)); if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_barrier_init : %s\n", strerror(p_ret)); STARPU_ABORT();}}
-#define _STARPU_PTHREAD_BARRIER_DESTROY(barrier) { int p_ret = pthread_barrier_destroy((barrier)); if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_barrier_destroy : %s\n", strerror(p_ret)); STARPU_ABORT();}}
-#define _STARPU_PTHREAD_BARRIER_WAIT(barrier) { int p_ret = pthread_barrier_wait(barrier); if (STARPU_UNLIKELY(!((p_ret == 0) || (p_ret == PTHREAD_BARRIER_SERIAL_THREAD)))) { fprintf(stderr, "pthread_barrier_wait : %s\n", strerror(p_ret)); STARPU_ABORT();}}
+#define _STARPU_PTHREAD_SPIN_LOCK(lock) do {                                   \
+	int p_ret = pthread_spin_lock(lock);                                   \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_spin_lock: %s\n",                       \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
 
 
-#define _STARPU_PTHREAD_SPIN_DESTROY(lock) { int p_ret = pthread_spin_destroy(lock); if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_spin_destroy : %s\n", strerror(p_ret)); STARPU_ABORT();}}
-#define _STARPU_PTHREAD_SPIN_LOCK(lock) { int p_ret = pthread_spin_lock(lock);  if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_spin_lock : %s\n", strerror(p_ret)); STARPU_ABORT();}}
-#define _STARPU_PTHREAD_SPIN_UNLOCK(lock) { int p_ret = pthread_spin_unlock(lock);  if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_spin_unlock : %s\n", strerror(p_ret)); STARPU_ABORT();}}
+#define _STARPU_PTHREAD_SPIN_UNLOCK(lock) do {                                 \
+	int p_ret = pthread_spin_unlock(lock);                                 \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_spin_unlock: %s\n",                     \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
 
 
 #endif // __COMMON_UTILS_H__
 #endif // __COMMON_UTILS_H__

+ 0 - 43
src/core/jobs.c

@@ -29,28 +29,6 @@
 #include <starpu_top.h>
 #include <starpu_top.h>
 #include <top/starpu_top_core.h>
 #include <top/starpu_top_core.h>
 
 
-size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, struct _starpu_job *j)
-{
-	struct starpu_task *task = j->task;
-
-	if (model && model->per_arch[arch][nimpl].size_base) {
-		return model->per_arch[arch][nimpl].size_base(task, arch, nimpl);
-	} else if (model && model->size_base) {
-		return model->size_base(task, nimpl);
-	} else {
-		unsigned nbuffers = task->cl->nbuffers;
-		size_t size = 0;
-
-		unsigned buffer;
-		for (buffer = 0; buffer < nbuffers; buffer++)
-		{
-			starpu_data_handle_t handle = task->handles[buffer];
-			size += _starpu_data_get_size(handle);
-		}
-		return size;
-	}
-}
-
 /* we need to identify each task to generate the DAG. */
 /* we need to identify each task to generate the DAG. */
 static unsigned job_cnt = 0;
 static unsigned job_cnt = 0;
 
 
@@ -420,24 +398,3 @@ int _starpu_push_local_task(struct _starpu_worker *worker, struct starpu_task *t
 
 
 	return 0;
 	return 0;
 }
 }
-
-const char *_starpu_get_model_name(struct _starpu_job *j)
-{
-	if (!j)
-		return NULL;
-
-	struct starpu_task *task = j->task;
-        if (task && task->cl) {
-            if (task->cl->model && task->cl->model->symbol)
-                return task->cl->model->symbol;
-	    else
-		return task->cl->name;
-	} else
-	{
-#ifdef STARPU_USE_FXT
-                return j->model_name;
-#else
-                return NULL;
-#endif
-        }
-}

+ 0 - 3
src/core/jobs.h

@@ -171,7 +171,4 @@ struct starpu_task *_starpu_pop_local_task(struct _starpu_worker *worker);
  * enforce a FIFO ordering. */
  * enforce a FIFO ordering. */
 int _starpu_push_local_task(struct _starpu_worker *worker, struct starpu_task *task, int back);
 int _starpu_push_local_task(struct _starpu_worker *worker, struct starpu_task *task, int back);
 
 
-/* Returns the symbol associated to that job if any. */
-const char *_starpu_get_model_name(struct _starpu_job *j);
-
 #endif // __JOBS_H__
 #endif // __JOBS_H__

+ 7 - 7
src/core/perfmodel/perfmodel_bus.c

@@ -1010,7 +1010,7 @@ static int load_bus_bandwidth_file_content(void)
 			n = fscanf(f, "%lf", &bandwidth);
 			n = fscanf(f, "%lf", &bandwidth);
 			if (n != 1)
 			if (n != 1)
 			{
 			{
-				fprintf(stderr,"Error while reading sampling file <%s>. Expected a number\n", path);
+				_STARPU_DISP("Error while reading sampling file <%s>. Expected a number\n", path);
 				fclose(f);
 				fclose(f);
 				return 0;
 				return 0;
 			}
 			}
@@ -1267,21 +1267,21 @@ static void check_bus_config_file(void)
                 // Checking if both configurations match
                 // Checking if both configurations match
                 if (read_cpus != ncpus)
                 if (read_cpus != ncpus)
 		{
 		{
-			fprintf(stderr, "Current configuration does not match the bus performance model (CPUS: (stored) %u != (current) %u), recalibrating...", read_cpus, ncpus);
+			_STARPU_DISP("Current configuration does not match the bus performance model (CPUS: (stored) %u != (current) %u), recalibrating...\n", read_cpus, ncpus);
                         starpu_force_bus_sampling();
                         starpu_force_bus_sampling();
-			fprintf(stderr, "done\n");
+			_STARPU_DISP("... done\n");
                 }
                 }
                 else if (read_cuda != ncuda)
                 else if (read_cuda != ncuda)
 		{
 		{
-                        fprintf(stderr, "Current configuration does not match the bus performance model (CUDA: (stored) %d != (current) %d), recalibrating...", read_cuda, ncuda);
+                        _STARPU_DISP("Current configuration does not match the bus performance model (CUDA: (stored) %d != (current) %d), recalibrating...\n", read_cuda, ncuda);
                         starpu_force_bus_sampling();
                         starpu_force_bus_sampling();
-			fprintf(stderr, "done\n");
+			_STARPU_DISP("... done\n");
                 }
                 }
                 else if (read_opencl != nopencl)
                 else if (read_opencl != nopencl)
 		{
 		{
-                        fprintf(stderr, "Current configuration does not match the bus performance model (OpenCL: (stored) %d != (current) %d), recalibrating...", read_opencl, nopencl);
+                        _STARPU_DISP("Current configuration does not match the bus performance model (OpenCL: (stored) %d != (current) %d), recalibrating...\n", read_opencl, nopencl);
                         starpu_force_bus_sampling();
                         starpu_force_bus_sampling();
-			fprintf(stderr, "done\n");
+			_STARPU_DISP("... done\n");
                 }
                 }
         }
         }
 }
 }

+ 26 - 1
src/core/perfmodel/perfmodel_history.c

@@ -53,6 +53,28 @@ struct starpu_perfmodel_history_table
 static pthread_rwlock_t registered_models_rwlock;
 static pthread_rwlock_t registered_models_rwlock;
 static struct _starpu_perfmodel_list *registered_models = NULL;
 static struct _starpu_perfmodel_list *registered_models = NULL;
 
 
+size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, struct _starpu_job *j)
+{
+	struct starpu_task *task = j->task;
+
+	if (model && model->per_arch[arch][nimpl].size_base) {
+		return model->per_arch[arch][nimpl].size_base(task, arch, nimpl);
+	} else if (model && model->size_base) {
+		return model->size_base(task, nimpl);
+	} else {
+		unsigned nbuffers = task->cl->nbuffers;
+		size_t size = 0;
+
+		unsigned buffer;
+		for (buffer = 0; buffer < nbuffers; buffer++)
+		{
+			starpu_data_handle_t handle = task->handles[buffer];
+			size += _starpu_data_get_size(handle);
+		}
+		return size;
+	}
+}
+
 /*
 /*
  * History based model
  * History based model
  */
  */
@@ -1067,7 +1089,10 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 			exp = entry->history_entry->mean;
 			exp = entry->history_entry->mean;
 		else if (!model->benchmarking)
 		else if (!model->benchmarking)
 		{
 		{
-			_STARPU_DISP("Warning: model %s is not calibrated enough, forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol);
+			char archname[32];
+
+			starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
+			_STARPU_DISP("Warning: model %s is not calibrated enough for %s, forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname);
 			_starpu_set_calibrate_flag(1);
 			_starpu_set_calibrate_flag(1);
 			model->benchmarking = 1;
 			model->benchmarking = 1;
 		}
 		}

+ 1 - 2
src/core/sched_policy.c

@@ -426,8 +426,7 @@ struct starpu_task *_starpu_create_conversion_task(starpu_data_handle_t handle,
 		}
 		}
 #endif
 #endif
 		default:
 		default:
-			fprintf(stderr, "Oops : %u\n", handle->mf_node);
-			STARPU_ABORT();
+			_STARPU_ERROR("Oops : %u\n", handle->mf_node);
 		}
 		}
 		break;
 		break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA

+ 29 - 4
src/core/task.c

@@ -262,7 +262,7 @@ void _starpu_codelet_check_deprecated_fields(struct starpu_codelet *cl)
 	/* CPU */
 	/* CPU */
 	if (cl->cpu_func && cl->cpu_func != STARPU_MULTIPLE_CPU_IMPLEMENTATIONS && cl->cpu_funcs[0])
 	if (cl->cpu_func && cl->cpu_func != STARPU_MULTIPLE_CPU_IMPLEMENTATIONS && cl->cpu_funcs[0])
 	{
 	{
-		fprintf(stderr, "[warning] [struct starpu_codelet] both cpu_func and cpu_funcs are set. Ignoring cpu_func.\n");
+		_STARPU_DISP("[warning] [struct starpu_codelet] both cpu_func and cpu_funcs are set. Ignoring cpu_func.\n");
 		cl->cpu_func = STARPU_MULTIPLE_CPU_IMPLEMENTATIONS;
 		cl->cpu_func = STARPU_MULTIPLE_CPU_IMPLEMENTATIONS;
 	}
 	}
 	if (cl->cpu_func && cl->cpu_func != STARPU_MULTIPLE_CPU_IMPLEMENTATIONS)
 	if (cl->cpu_func && cl->cpu_func != STARPU_MULTIPLE_CPU_IMPLEMENTATIONS)
@@ -282,7 +282,7 @@ void _starpu_codelet_check_deprecated_fields(struct starpu_codelet *cl)
 	/* CUDA */
 	/* CUDA */
 	if (cl->cuda_func && cl->cuda_func != STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS && cl->cuda_funcs[0])
 	if (cl->cuda_func && cl->cuda_func != STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS && cl->cuda_funcs[0])
 	{
 	{
-		fprintf(stderr, "[warning] [struct starpu_codelet] both cuda_func and cuda_funcs are set. Ignoring cuda_func.\n");
+		_STARPU_DISP("[warning] [struct starpu_codelet] both cuda_func and cuda_funcs are set. Ignoring cuda_func.\n");
 		cl->cuda_func = STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS;
 		cl->cuda_func = STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS;
 	}
 	}
 	if (cl->cuda_func && cl->cuda_func != STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS)
 	if (cl->cuda_func && cl->cuda_func != STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS)
@@ -302,7 +302,7 @@ void _starpu_codelet_check_deprecated_fields(struct starpu_codelet *cl)
 	/* OpenCL */
 	/* OpenCL */
 	if (cl->opencl_func && cl->opencl_func != STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS && cl->opencl_funcs[0])
 	if (cl->opencl_func && cl->opencl_func != STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS && cl->opencl_funcs[0])
 	{
 	{
-		fprintf(stderr, "[warning] [struct starpu_codelet] both opencl_func and opencl_funcs are set. Ignoring opencl_func.\n");
+		_STARPU_DISP("[warning] [struct starpu_codelet] both opencl_func and opencl_funcs are set. Ignoring opencl_func.\n");
 		cl->opencl_func = STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS;
 		cl->opencl_func = STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS;
 	}
 	}
 	if (cl->opencl_func && cl->opencl_func != STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS)
 	if (cl->opencl_func && cl->opencl_func != STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS)
@@ -344,7 +344,7 @@ void _starpu_task_check_deprecated_fields(struct starpu_task *task)
 		{
 		{
 			if (task->buffers[i].handle && task->handles[i])
 			if (task->buffers[i].handle && task->handles[i])
 			{
 			{
-				fprintf(stderr, "[warning][struct starpu_task] task->buffers[%u] and task->handles[%u] both set. Ignoring task->buffers[%u] ?\n", i, i, i);
+				_STARPU_DISP("[warning][struct starpu_task] task->buffers[%u] and task->handles[%u] both set. Ignoring task->buffers[%u] ?\n", i, i, i);
 				STARPU_ASSERT(task->buffers[i].mode == task->cl->modes[i]);
 				STARPU_ASSERT(task->buffers[i].mode == task->cl->modes[i]);
 				STARPU_ABORT();
 				STARPU_ABORT();
 			}
 			}
@@ -370,6 +370,7 @@ int starpu_task_submit(struct starpu_task *task)
 //	   0 : starpu_get_sched_ctx());
 //	   0 : starpu_get_sched_ctx());
 	int ret;
 	int ret;
 	unsigned is_sync = task->synchronous;
 	unsigned is_sync = task->synchronous;
+	starpu_task_bundle_t bundle = task->bundle;
         _STARPU_LOG_IN();
         _STARPU_LOG_IN();
 
 
 	if (is_sync)
 	if (is_sync)
@@ -449,6 +450,30 @@ int starpu_task_submit(struct starpu_task *task)
 		}
 		}
 	}
 	}
 
 
+	if (bundle)
+	{
+		/* We need to make sure that models for other tasks of the
+		 * bundle are also loaded, so the scheduler can estimate the
+		 * duration of the whole bundle */
+		_STARPU_PTHREAD_MUTEX_LOCK(&bundle->mutex);
+
+		struct _starpu_task_bundle_entry *entry;
+		entry = bundle->list;
+
+		while (entry)
+		{
+			if (entry->task->cl->model && entry->task->cl->model->symbol)
+				_starpu_load_perfmodel(entry->task->cl->model);
+
+			if (entry->task->cl->power_model && entry->task->cl->power_model->symbol)
+				_starpu_load_perfmodel(entry->task->cl->power_model);
+
+			entry = entry->next;
+		}
+
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
+	}
+
 	/* If profiling is activated, we allocate a structure to store the
 	/* If profiling is activated, we allocate a structure to store the
 	 * appropriate info. */
 	 * appropriate info. */
 	struct starpu_task_profiling_info *info;
 	struct starpu_task_profiling_info *info;

+ 7 - 37
src/core/topology.c

@@ -471,10 +471,7 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 			{
 			{
 				/* The user requires more CUDA devices than
 				/* The user requires more CUDA devices than
 				 * there is available */
 				 * there is available */
-				fprintf(stderr,
-					"# Warning: %d CUDA devices "
-					"requested. Only %d available.\n",
-					ncuda, nb_devices);
+				_STARPU_DISP("Warning: %d CUDA devices requested. Only %d available.\n", ncuda, nb_devices);
 				ncuda = nb_devices;
 				ncuda = nb_devices;
 			}
 			}
 		}
 		}
@@ -527,14 +524,7 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 			nopencl = nb_devices;
 			nopencl = nb_devices;
 			if (nopencl > STARPU_MAXOPENCLDEVS)
 			if (nopencl > STARPU_MAXOPENCLDEVS)
 			{
 			{
-				fprintf(stderr,
-					"# Warning: %d OpenCL devices "
-					"available. Only %d enabled. "
-					"Use configure option "
-					"--enable-maxopencldadev=xxx to "
-					"update the maximum value of "
-					"supported OpenCL devices.\n",
-					nb_devices, STARPU_MAXOPENCLDEVS);
+				_STARPU_DISP("Warning: %d OpenCL devices available. Only %d enabled. Use configure option --enable-maxopencldadev=xxx to update the maximum value of supported OpenCL devices.\n", nb_devices, STARPU_MAXOPENCLDEVS);
 				nopencl = STARPU_MAXOPENCLDEVS;
 				nopencl = STARPU_MAXOPENCLDEVS;
 			}
 			}
 		}
 		}
@@ -545,23 +535,13 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 			{
 			{
 				/* The user requires more OpenCL devices than
 				/* The user requires more OpenCL devices than
 				 * there is available */
 				 * there is available */
-				fprintf(stderr,
-					"# Warning: %d OpenCL devices "
-					"requested. Only %d available.\n",
-					nopencl, nb_devices);
+				_STARPU_DISP("Warning: %d OpenCL devices requested. Only %d available.\n", nopencl, nb_devices);
 				nopencl = nb_devices;
 				nopencl = nb_devices;
 			}
 			}
 			/* Let's make sure this value is OK. */
 			/* Let's make sure this value is OK. */
 			if (nopencl > STARPU_MAXOPENCLDEVS)
 			if (nopencl > STARPU_MAXOPENCLDEVS)
 			{
 			{
-				fprintf(stderr,
-					"# Warning: %d OpenCL devices "
-					"requested. Only %d enabled. Use "
-					"configure option "
-					"--enable-maxopencldev=xxx to update "
-					"the maximum value of supported "
-					"OpenCL devices.\n",
-					nopencl, STARPU_MAXOPENCLDEVS);
+				_STARPU_DISP("Warning: %d OpenCL devices requested. Only %d enabled. Use configure option --enable-maxopencldev=xxx to update the maximum value of supported OpenCL devices.\n", nopencl, STARPU_MAXOPENCLDEVS);
 				nopencl = STARPU_MAXOPENCLDEVS;
 				nopencl = STARPU_MAXOPENCLDEVS;
 			}
 			}
 		}
 		}
@@ -610,10 +590,7 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 			STARPU_ASSERT(ngordon <= NMAXGORDONSPUS);
 			STARPU_ASSERT(ngordon <= NMAXGORDONSPUS);
 			if (ngordon > STARPU_MAXGORDONSPUS);
 			if (ngordon > STARPU_MAXGORDONSPUS);
 			{
 			{
-				fprintf(stderr,
-					"# Warning: %d Gordon CPUs devices "
-					"requested. Only %d supported\n",
-					ngordon, NMAXGORDONSPUS);
+				_STARPU_DISP("Warning: %d Gordon CPUs devices requested. Only %d supported\n", ngordon, NMAXGORDONSPUS);
 				ngordon = NMAXGORDONSPUS;
 				ngordon = NMAXGORDONSPUS;
 			}
 			}
 		}
 		}
@@ -658,13 +635,7 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 		{
 		{
 			if (ncpu > STARPU_MAXCPUS)
 			if (ncpu > STARPU_MAXCPUS)
 			{
 			{
-				fprintf(stderr,
-					"# Warning: %d CPU devices requested."
-					" Only %d enabled. Use configure "
-					"option --enable-maxcpus=xxx to "
-					"update the maximum value of "
-					"supported CPU devices.\n",
-					ncpu, STARPU_MAXCPUS);
+				_STARPU_DISP("Warning: %d CPU devices requested. Only %d enabled. Use configure option --enable-maxcpus=xxx to update the maximum value of supported CPU devices.\n", ncpu, STARPU_MAXCPUS);
 				ncpu = STARPU_MAXCPUS;
 				ncpu = STARPU_MAXCPUS;
 			}
 			}
 		}
 		}
@@ -757,8 +728,7 @@ _starpu_bind_thread_on_cpu (
 	DWORD mask = 1 << cpuid;
 	DWORD mask = 1 << cpuid;
 	if (!SetThreadAffinityMask(GetCurrentThread(), mask))
 	if (!SetThreadAffinityMask(GetCurrentThread(), mask))
 	{
 	{
-		fprintf(stderr,"SetThreadMaskAffinity(%lx) failed\n", mask);
-		STARPU_ABORT();
+		_STARPU_ERROR("SetThreadMaskAffinity(%lx) failed\n", mask);
 	}
 	}
 #else
 #else
 #warning no CPU binding support
 #warning no CPU binding support

+ 9 - 9
src/core/workers.c

@@ -652,31 +652,31 @@ int starpu_init(struct starpu_conf *user_conf)
 
 
 #ifdef __GNUC__
 #ifdef __GNUC__
 #ifndef __OPTIMIZE__
 #ifndef __OPTIMIZE__
-	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --enable-debug (-O0), and is thus not optimized\n");
+	_STARPU_DISP("Warning: StarPU was configured with --enable-debug (-O0), and is thus not optimized\n");
 #endif
 #endif
 #endif
 #endif
 #if 0
 #if 0
 #ifndef STARPU_NO_ASSERT
 #ifndef STARPU_NO_ASSERT
-	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured without --enable-fast\n");
+	_STARPU_DISP("Warning: StarPU was configured without --enable-fast\n");
 #endif
 #endif
 #endif
 #endif
 #ifdef STARPU_MEMORY_STATUS
 #ifdef STARPU_MEMORY_STATUS
-	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --enable-memory-status, which slows down a bit\n");
+	_STARPU_DISP("Warning: StarPU was configured with --enable-memory-status, which slows down a bit\n");
 #endif
 #endif
 #ifdef STARPU_VERBOSE
 #ifdef STARPU_VERBOSE
-	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --enable-verbose, which slows down a bit\n");
+	_STARPU_DISP("Warning: StarPU was configured with --enable-verbose, which slows down a bit\n");
 #endif
 #endif
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
-	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --with-fxt, which slows down a bit\n");
+	_STARPU_DISP("Warning: StarPU was configured with --with-fxt, which slows down a bit\n");
 #endif
 #endif
 #ifdef STARPU_PERF_DEBUG
 #ifdef STARPU_PERF_DEBUG
-	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --enable-perf-debug, which slows down a bit\n");
+	_STARPU_DISP("Warning: StarPU was configured with --enable-perf-debug, which slows down a bit\n");
 #endif
 #endif
 #ifdef STARPU_MODEL_DEBUG
 #ifdef STARPU_MODEL_DEBUG
-	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --enable-model-debug, which slows down a bit\n");
+	_STARPU_DISP("Warning: StarPU was configured with --enable-model-debug, which slows down a bit\n");
 #endif
 #endif
 #ifdef STARPU_DATA_STATS
 #ifdef STARPU_DATA_STATS
-	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --enable-stats, which slows down a bit\n");
+	_STARPU_DISP("Warning: StarPU was configured with --enable-stats, which slows down a bit\n");
 #endif
 #endif
 
 
 	_STARPU_PTHREAD_MUTEX_LOCK(&init_mutex);
 	_STARPU_PTHREAD_MUTEX_LOCK(&init_mutex);
@@ -725,7 +725,7 @@ int starpu_init(struct starpu_conf *user_conf)
 	else
 	else
 	{
 	{
 	     if (user_conf->magic != 42) {
 	     if (user_conf->magic != 42) {
-		fprintf(stderr, "starpu_conf structure needs to be initialized with starpu_conf_init\n");
+		_STARPU_DISP("starpu_conf structure needs to be initialized with starpu_conf_init\n");
 		return -EINVAL;
 		return -EINVAL;
 	     }
 	     }
 	     config.conf = user_conf;
 	     config.conf = user_conf;

+ 1 - 1
src/datawizard/interfaces/bcsr_interface.c

@@ -389,9 +389,9 @@ fail_colind:
 		{
 		{
 			cudaError_t err;
 			cudaError_t err;
 			err = cudaFree((void*)addr_nzval);
 			err = cudaFree((void*)addr_nzval);
-			break;
 			if (STARPU_UNLIKELY(err != cudaSuccess))
 			if (STARPU_UNLIKELY(err != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(err);
 				STARPU_CUDA_REPORT_ERROR(err);
+			break;
 		}
 		}
 #endif
 #endif
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL

+ 6 - 0
src/datawizard/interfaces/coo_interface.c

@@ -368,8 +368,14 @@ allocate_coo_buffer_on_node(void *data_interface, uint32_t dst_node)
 	case STARPU_CPU_RAM:
 	case STARPU_CPU_RAM:
 	{
 	{
 		addr_columns = malloc(n_values * sizeof(coo_interface->columns[0]));
 		addr_columns = malloc(n_values * sizeof(coo_interface->columns[0]));
+		if (STARPU_UNLIKELY(addr_columns == NULL))
+			goto fail_columns;
 		addr_rows = malloc(n_values * sizeof(coo_interface->rows[0]));
 		addr_rows = malloc(n_values * sizeof(coo_interface->rows[0]));
+		if (STARPU_UNLIKELY(addr_rows == NULL))
+			goto fail_rows;
 		addr_values = (uintptr_t) malloc(n_values * elemsize);
 		addr_values = (uintptr_t) malloc(n_values * elemsize);
+		if (STARPU_UNLIKELY(addr_values == NULL))
+			goto fail_values;
 		break;
 		break;
 	}
 	}
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA

+ 2 - 1
src/datawizard/interfaces/data_interface.c

@@ -681,9 +681,10 @@ int starpu_data_interface_get_next_id(void)
 	return _data_interface_number-1;
 	return _data_interface_number-1;
 }
 }
 
 
-int starpu_handle_pack_data(starpu_data_handle_t handle, void **ptr)
+int starpu_handle_pack_data(starpu_data_handle_t handle, void **ptr, size_t *count)
 {
 {
 	STARPU_ASSERT(handle->ops->pack_data);
 	STARPU_ASSERT(handle->ops->pack_data);
+	*count = starpu_handle_get_size(handle);
 	return handle->ops->pack_data(handle, _starpu_get_local_memory_node(), ptr);
 	return handle->ops->pack_data(handle, _starpu_get_local_memory_node(), ptr);
 }
 }
 
 

+ 1 - 1
src/drivers/driver_common/driver_common.c

@@ -78,7 +78,7 @@ void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j,
 
 
 	_STARPU_TRACE_END_CODELET_BODY(j, j->nimpl, perf_arch);
 	_STARPU_TRACE_END_CODELET_BODY(j, j->nimpl, perf_arch);
 
 
-	if (cl->model && cl->model->benchmarking)
+	if (cl && cl->model && cl->model->benchmarking)
 		calibrate_model = 1;
 		calibrate_model = 1;
 
 
 	if (rank == 0)
 	if (rank == 0)

+ 24 - 10
src/profiling/bound.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
- * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -175,7 +175,8 @@ static int good_job(struct _starpu_job *j)
 	if (!j->task->cl->model)
 	if (!j->task->cl->model)
 		return 0;
 		return 0;
 	/* Only support history based */
 	/* Only support history based */
-	if (j->task->cl->model->type != STARPU_HISTORY_BASED)
+	if (j->task->cl->model->type != STARPU_HISTORY_BASED
+	 && j->task->cl->model->type != STARPU_NL_REGRESSION_BASED)
 		return 0;
 		return 0;
 	return 1;
 	return 1;
 }
 }
@@ -400,7 +401,7 @@ void starpu_bound_print_dot(FILE *output)
 	fprintf(output, "strict digraph bounddeps {\n");
 	fprintf(output, "strict digraph bounddeps {\n");
 	for (t = tasks; t; t = t->next)
 	for (t = tasks; t; t = t->next)
 	{
 	{
-		fprintf(output, "\"t%lu\" [label=\"%lu: %s\"]\n", t->id, t->id, t->cl->name);
+		fprintf(output, "\"t%lu\" [label=\"%lu: %s\"]\n", t->id, t->id, _starpu_codelet_get_model_name(t->cl));
 		for (i = 0; i < t->depsn; i++)
 		for (i = 0; i < t->depsn; i++)
 			fprintf(output, "\"t%lu\" -> \"t%lu\"\n", t->deps[i]->id, t->id);
 			fprintf(output, "\"t%lu\" -> \"t%lu\"\n", t->deps[i]->id, t->id);
 	}
 	}
@@ -433,6 +434,11 @@ void starpu_bound_print_lp(FILE *output)
 		nt = 0;
 		nt = 0;
 		for (t1 = tasks; t1; t1 = t1->next)
 		for (t1 = tasks; t1; t1 = t1->next)
 		{
 		{
+			if (t1->cl->model->type != STARPU_HISTORY_BASED &&
+			    t1->cl->model->type != STARPU_NL_REGRESSION_BASED)
+				/* TODO: */
+				fprintf(stderr, "Warning: task %s uses a perf model which is neither history nor non-linear regression-based, support for such model is not implemented yet, system will not be solvable.\n", _starpu_codelet_get_model_name(t1->cl));
+
 			struct _starpu_job j =
 			struct _starpu_job j =
 			{
 			{
 				.footprint = t1->footprint,
 				.footprint = t1->footprint,
@@ -478,7 +484,7 @@ void starpu_bound_print_lp(FILE *output)
 		fprintf(output, "/* According to where the task is indeed executed */\n");
 		fprintf(output, "/* According to where the task is indeed executed */\n");
 		for (t1 = tasks; t1; t1 = t1->next)
 		for (t1 = tasks; t1; t1 = t1->next)
 		{
 		{
-			fprintf(output, "/* %s %x */\tc%lu = s%lu", t1->cl->name, (unsigned) t1->footprint, t1->id, t1->id);
+			fprintf(output, "/* %s %x */\tc%lu = s%lu", _starpu_codelet_get_model_name(t1->cl), (unsigned) t1->footprint, t1->id, t1->id);
 			for (w = 0; w < nw; w++)
 			for (w = 0; w < nw; w++)
 			{
 			{
 				enum starpu_perf_archtype arch = starpu_worker_get_perf_archtype(w);
 				enum starpu_perf_archtype arch = starpu_worker_get_perf_archtype(w);
@@ -673,11 +679,19 @@ void starpu_bound_print_lp(FILE *output)
 			fprintf(output, "/* And we have to have computed exactly all tasks */\n");
 			fprintf(output, "/* And we have to have computed exactly all tasks */\n");
 			for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
 			for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
 			{
 			{
-				fprintf(output, "/* task %s key %x */\n0", tp->cl->name, (unsigned) tp->footprint);
-				for (w = 0; w < nw; w++)
-					if (!isnan(times[w*nt+t]))
+				int got_one = 0;
+				fprintf(output, "/* task %s key %x */\n0", _starpu_codelet_get_model_name(tp->cl), (unsigned) tp->footprint);
+				for (w = 0; w < nw; w++) {
+					if (isnan(times[w*nt+t]))
+						fprintf(stderr, "Warning: task %s has no performance measurement for worker %d.\n", _starpu_codelet_get_model_name(tp->cl), w);
+					else {
+						got_one = 1;
 						fprintf(output, "\t+w%dt%dn", w, t);
 						fprintf(output, "\t+w%dt%dn", w, t);
+					}
+				}
 				fprintf(output, " = %lu;\n", tp->n);
 				fprintf(output, " = %lu;\n", tp->n);
+				if (!got_one)
+					fprintf(stderr, "Warning: task %s has no performance measurement for any worker, system will not be solvable!\n", _starpu_codelet_get_model_name(tp->cl));
 				/* Show actual values */
 				/* Show actual values */
 				fprintf(output, "/*");
 				fprintf(output, "/*");
 				for (w = 0; w < nw; w++)
 				for (w = 0; w < nw; w++)
@@ -750,7 +764,7 @@ void starpu_bound_print_mps(FILE *output)
 		fprintf(output, "\n* And we have to have computed exactly all tasks\n");
 		fprintf(output, "\n* And we have to have computed exactly all tasks\n");
 		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
 		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
 		{
 		{
-			fprintf(output, "* task %s key %x\n", tp->cl->name, (unsigned) tp->footprint);
+			fprintf(output, "* task %s key %x\n", _starpu_codelet_get_model_name(tp->cl), (unsigned) tp->footprint);
 			fprintf(output, " E  T%d\n", t);
 			fprintf(output, " E  T%d\n", t);
 		}
 		}
 
 
@@ -881,7 +895,7 @@ static glp_prob *_starpu_bound_glp_resolve(int integer)
 		{
 		{
 			char name[32], title[64];
 			char name[32], title[64];
 			starpu_worker_get_name(w, name, sizeof(name));
 			starpu_worker_get_name(w, name, sizeof(name));
-			snprintf(title, sizeof(title), "task %s key %x", tp->cl->name, (unsigned) tp->footprint);
+			snprintf(title, sizeof(title), "task %s key %x", _starpu_codelet_get_model_name(tp->cl), (unsigned) tp->footprint);
 			glp_set_row_name(lp, nw+t+1, title);
 			glp_set_row_name(lp, nw+t+1, title);
 			for (w = 0; w < nw; w++)
 			for (w = 0; w < nw; w++)
 			{
 			{
@@ -949,7 +963,7 @@ void starpu_bound_print(FILE *output, int integer __attribute__ ((unused)))
 
 
 		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
 		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
 		{
 		{
-			fprintf(output, "%s key %x\n", tp->cl->name, (unsigned) tp->footprint);
+			fprintf(output, "%s key %x\n", _starpu_codelet_get_model_name(tp->cl), (unsigned) tp->footprint);
 			for (w = 0; w < nw; w++)
 			for (w = 0; w < nw; w++)
 				if (integer)
 				if (integer)
 					fprintf(output, "\tw%dt%dn %f", w, t, glp_mip_col_val(lp, colnum(w, t)));
 					fprintf(output, "\tw%dt%dn %f", w, t, glp_mip_col_val(lp, colnum(w, t)));

+ 48 - 0
src/util/misc.c

@@ -0,0 +1,48 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <common/utils.h>
+#include <core/jobs.h>
+
+const char *_starpu_codelet_get_model_name(struct starpu_codelet *cl)
+{
+	if (!cl)
+		return NULL;
+
+	if (cl->model && cl->model->symbol && cl->model->symbol[0])
+		return cl->model->symbol;
+	else
+		return cl->name;
+}
+
+const char *_starpu_job_get_model_name(struct _starpu_job *j)
+{
+	const char *ret = NULL;
+
+	if (!j)
+		return NULL;
+
+	struct starpu_task *task = j->task;
+	if (task)
+		ret = _starpu_codelet_get_model_name(task->cl);
+
+#ifdef STARPU_USE_FXT
+	if (!ret)
+                ret = j->model_name;
+#endif
+        return ret;
+}

+ 1 - 2
starpu-1.0.pc.in

@@ -33,5 +33,4 @@ Libs: -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@
 @STARPU_CUDA_LDFLAGS@ @STARPU_OPENCL_LDFLAGS@
 @STARPU_CUDA_LDFLAGS@ @STARPU_OPENCL_LDFLAGS@
 @STARPU_SCHED_CTX_HYPERVISOR@
 @STARPU_SCHED_CTX_HYPERVISOR@
 Libs.private: @LDFLAGS@ @LIBS@ @LIBSTARPU_LDFLAGS@
 Libs.private: @LDFLAGS@ @LIBS@ @LIBSTARPU_LDFLAGS@
-Requires: @HWLOC_REQUIRES@
-#Requires.private: @GORDON_REQUIRES@
+Requires: @HWLOC_REQUIRES@

+ 1 - 1
starpufft/starpufft-double.h

@@ -15,10 +15,10 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
  */
 
 
-#include <complex.h>
 #include <starpu_config.h>
 #include <starpu_config.h>
 
 
 #if defined(STARPU_HAVE_FFTW) && !defined(__CUDACC__) 
 #if defined(STARPU_HAVE_FFTW) && !defined(__CUDACC__) 
+#include <complex.h>
 #include <fftw3.h>
 #include <fftw3.h>
 #endif
 #endif
 
 

+ 1 - 1
starpufft/starpufft-float.h

@@ -15,10 +15,10 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
  */
 
 
-#include <complex.h>
 #include <starpu_config.h>
 #include <starpu_config.h>
 
 
 #if defined(STARPU_HAVE_FFTW) && !defined(__CUDACC__) 
 #if defined(STARPU_HAVE_FFTW) && !defined(__CUDACC__) 
+#include <complex.h>
 #include <fftw3.h>
 #include <fftw3.h>
 #endif
 #endif
 
 

+ 2 - 0
tests/microbenchs/tasks_size_overhead.gp

@@ -16,6 +16,8 @@ gnuplot << EOF
 set terminal eps
 set terminal eps
 set output "tasks_size_overhead.eps"
 set output "tasks_size_overhead.eps"
 set key top left
 set key top left
+set xlabel "number of cores"
+set ylabel "speedup"
 plot \
 plot \
 	"$OUTPUT" using 1:($VAL1)/(\$3) with linespoints title columnheader(2), \
 	"$OUTPUT" using 1:($VAL1)/(\$3) with linespoints title columnheader(2), \
 	"$OUTPUT" using 1:($VAL2)/(\$5) with linespoints title columnheader(4), \
 	"$OUTPUT" using 1:($VAL2)/(\$5) with linespoints title columnheader(4), \