Andra Hugo лет назад: 13
Родитель
Сommit
927b47d57d
71 измененных файлов с 1431 добавлено и 888 удалено
  1. 15 2
      ChangeLog
  2. 5 1
      Makefile.am
  3. 27 33
      configure.ac
  4. 25 3
      doc/chapters/advanced-examples.texi
  5. 16 8
      doc/chapters/basic-api.texi
  6. 11 9
      doc/chapters/configuration.texi
  7. 36 15
      doc/chapters/mpi-support.texi
  8. 20 3
      doc/starpu.css
  9. 1 0
      include/starpu_config.h.in
  10. 3 7
      include/starpu_data_interfaces.h
  11. 1 2
      libstarpu.pc.in
  12. 0 30
      mpi/examples/Makefile.am
  13. 4 1
      mpi/examples/cholesky/mpi_cholesky.c
  14. 4 1
      mpi/examples/cholesky/mpi_cholesky_distributed.c
  15. 4 1
      mpi/examples/complex/mpi_complex.c
  16. 2 4
      mpi/examples/mpi_lu/plu_example.c
  17. 9 2
      mpi/examples/stencil/stencil5.c
  18. 8 4
      mpi/include/starpu_mpi.h
  19. 239 210
      mpi/src/starpu_mpi.c
  20. 104 7
      mpi/src/starpu_mpi_collective.c
  21. 1 3
      mpi/src/starpu_mpi_datatype.c
  22. 1 1
      mpi/src/starpu_mpi_datatype.h
  23. 5 4
      mpi/src/starpu_mpi_helper.c
  24. 230 132
      mpi/src/starpu_mpi_insert_task.c
  25. 15 6
      mpi/src/starpu_mpi_private.h
  26. 7 5
      mpi/src/starpu_mpi_stats.c
  27. 14 4
      mpi/tests/Makefile.am
  28. 6 13
      mpi/tests/block_interface.c
  29. 6 13
      mpi/tests/block_interface_pinned.c
  30. 4 0
      mpi/tests/helper.h
  31. 5 8
      mpi/tests/insert_task.c
  32. 4 7
      mpi/tests/insert_task_block.c
  33. 84 93
      mpi/tests/insert_task_cache.c
  34. 7 10
      mpi/tests/insert_task_owner.c
  35. 4 2
      mpi/tests/insert_task_owner2.c
  36. 4 2
      mpi/tests/insert_task_owner_data.c
  37. 7 8
      mpi/tests/mpi_detached_tag.c
  38. 7 8
      mpi/tests/mpi_irecv.c
  39. 7 8
      mpi/tests/mpi_irecv_detached.c
  40. 7 8
      mpi/tests/mpi_isend.c
  41. 7 8
      mpi/tests/mpi_isend_detached.c
  42. 30 11
      mpi/examples/reduction/mpi_reduction.c
  43. 10 0
      mpi/examples/reduction/mpi_reduction_kernels.c
  44. 18 3
      mpi/examples/scatter_gather/mpi_scatter_gather.c
  45. 7 8
      mpi/tests/mpi_test.c
  46. 5 3
      mpi/tests/multiple_send.c
  47. 9 7
      mpi/tests/pingpong.c
  48. 7 3
      mpi/tests/ring.c
  49. 7 3
      mpi/tests/ring_async.c
  50. 9 11
      mpi/tests/ring_async_implicit.c
  51. 1 0
      src/Makefile.am
  52. 4 3
      src/common/fxt.h
  53. 192 27
      src/common/utils.h
  54. 0 43
      src/core/jobs.c
  55. 0 3
      src/core/jobs.h
  56. 7 7
      src/core/perfmodel/perfmodel_bus.c
  57. 26 1
      src/core/perfmodel/perfmodel_history.c
  58. 1 2
      src/core/sched_policy.c
  59. 29 4
      src/core/task.c
  60. 7 37
      src/core/topology.c
  61. 9 9
      src/core/workers.c
  62. 1 1
      src/datawizard/interfaces/bcsr_interface.c
  63. 6 0
      src/datawizard/interfaces/coo_interface.c
  64. 2 1
      src/datawizard/interfaces/data_interface.c
  65. 1 1
      src/drivers/driver_common/driver_common.c
  66. 24 10
      src/profiling/bound.c
  67. 48 0
      src/util/misc.c
  68. 1 2
      starpu-1.0.pc.in
  69. 1 1
      starpufft/starpufft-double.h
  70. 1 1
      starpufft/starpufft-float.h
  71. 2 0
      tests/microbenchs/tasks_size_overhead.gp

+ 15 - 2
ChangeLog

@@ -36,8 +36,20 @@ New features:
   * SOCL
         - Manual mapping of commands on specific devices is now possible
   * New interface: COO matrix.
-  * Communication statistics for MPI can also be enabled at execution
-    time by defining the environment variable STARPU_COMM_STATS
+  * MPI:
+        - Communication statistics for MPI can only be enabled at
+	  execution time by defining the environment variable
+	  STARPU_COMM_STATS
+        - Communication cache mechanism is enabled by default, and can
+	  only be disabled at execution time by setting the
+	  environment variable STARPU_MPI_CACHE to 0.
+        - Initialisation functions starpu_mpi_initialize_extended()
+  	  and starpu_mpi_initialize() have been made deprecated. One
+	  should now use starpu_mpi_init() which will initialise MPI
+	  by calling MPI_Init_Thread if is not already done.
+        - Collective detached operations have new parameters, a
+	  callback function and a argument. This is to be consistent
+	  with the detached point-to-point communications.
 
 Changes:
   * Fix the block filter functions.
@@ -73,6 +85,7 @@ Small changes:
   * Fix forcing calibration of never-calibrated archs.
   * CUDA applications are no longer compiled with the "-arch sm_13"
     option. It is specifically added to applications which need it.
+  * Documentation is not built if necessary tools are missing
 
 StarPU 1.0.3 (svn revision 7379)
 ==============================================

+ 5 - 1
Makefile.am

@@ -18,7 +18,11 @@ ACLOCAL_AMFLAGS=-I m4
 CLEANFILES = *.gcno *.gcda *.linkinfo
 
 SUBDIRS = src
-SUBDIRS += tools tests doc
+SUBDIRS += tools tests
+
+if BUILD_DOC
+SUBDIRS += doc
+endif
 
 if USE_MPI
 SUBDIRS += mpi

+ 27 - 33
configure.ac

@@ -1094,14 +1094,6 @@ else
 fi
 AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
 
-# Check if user wants to disable mpi cache
-AC_ARG_ENABLE(mpi-cache, AC_HELP_STRING([--enable-mpi-cache],
-			 [Enable MPI communication cache]),
-			 enable_mpi_cache=$enableval, enable_mpi_cache=true)
-if  test x$enable_mpi_cache = xtrue; then
-    	AC_DEFINE(STARPU_MPI_CACHE, [1], [enable MPI communication cache])
-fi
-
 # If the user specifically asks for it, or if we are in a developer checkout, we enable mpi check
 AC_ARG_ENABLE(mpi-check, AC_HELP_STRING([--enable-mpi-check], [Enable execution of MPI testcases]))
 running_mpi_check=no
@@ -1156,18 +1148,6 @@ if test x$use_mpi = xyes; then
 	AC_DEFINE(STARPU_USE_MPI,[],[whether the StarPU MPI library is available])
 fi
 
-AC_MSG_CHECKING(whether communication statistics should be generated)
-AC_ARG_ENABLE(comm-stats, [AS_HELP_STRING([--enable-comm-stats],
-			[enable communication statistics (only valid with the StarPU MPI library])],
-			enable_comm_stats=$enableval, enable_comm_stats=no)
-AC_MSG_RESULT($enable_comm_stats)
-AC_SUBST(STATS, $enable_comm_stats)
-AC_SUBST(STARPU_COMM_STATS, $enable_comm_stats)
-
-if test x$enable_comm_stats = xyes; then
-        AC_DEFINE(STARPU_COMM_STATS, [1], [enable communication statistics])
-fi
-
 ###############################################################################
 #                                                                             #
 #                               StarPU-Top                                    #
@@ -1627,7 +1607,6 @@ AC_ARG_WITH([hwloc],
 		fi
 	],
 	[
-	AC_MSG_NOTICE("MAYBE HWLOC")
 		use_hwloc=maybe
 		use_hwloc_from_system=yes
 	])
@@ -1676,10 +1655,7 @@ AC_MSG_RESULT($have_valid_hwloc)
 AC_SUBST(HWLOC_REQUIRES)
 
 # is the header file f77.h available ?
-have_f77_h=yes
-AC_CHECK_HEADER([f77.h],,[have_f77_h=no])
-AC_MSG_CHECKING(whether header file f77.h is available)
-AC_MSG_RESULT($have_f77_h)
+AC_CHECK_HEADER([f77.h], [have_f77_h=yes], [have_f77_h=no])
 AC_SUBST(STARPU_HAVE_F77_H, $have_f77_h)
 AM_CONDITIONAL(STARPU_HAVE_F77_H, test x$have_f77_h = xyes)
 if test x$have_f77_h = xyes; then
@@ -1741,6 +1717,22 @@ m4_ifdef([AM_SILENT_RULES],
   AM_CONDITIONAL([STARPU_HAVE_AM111], [true]),
   AM_CONDITIONAL([STARPU_HAVE_AM111], [false]))
 
+##########################################
+# Documentation                          #
+##########################################
+
+enable_build_doc=yes
+AC_CHECK_PROGS([CHECK_TEXI2DVI], [texi2dvi], "no")
+if test "$CHECK_TEXI2DVI" == "no" ; then
+    enable_build_doc=no
+else
+    AC_CHECK_PROGS([CHECK_TEX], [tex], "no")
+    if test "$CHECK_TEX" == "no" ; then
+	enable_build_doc=no
+    fi
+fi
+AM_CONDITIONAL(BUILD_DOC, [test x$enable_build_doc != xno])
+
 ###############################################################################
 #                                                                             #
 #                                Final settings                               #
@@ -1835,22 +1827,24 @@ AC_MSG_NOTICE([
 	GPU-GPU transfers: $have_cuda_memcpy_peer
 	Allocation cache:  $enable_allocation_cache
 
-	Magma enabled: $have_magma
-	BLAS library:  $blas_lib
-	hwloc:         $have_valid_hwloc
+	Magma enabled:     $have_magma
+	BLAS library:      $blas_lib
+	hwloc:             $have_valid_hwloc
 	FxT trace enabled: $use_fxt
 	StarPU-Top:        $build_starpu_top
 
+        Documentation:     $enable_build_doc
+        Examples:          $enable_build_examples
+
 	StarPU Extensions:
-	       MPI enabled:   $use_mpi
-	       MPI test suite: $running_mpi_check
-	       FFT Support: $fft_support
-	       GCC plug-in: $build_gcc_plugin
+	       MPI enabled:                                 $use_mpi
+	       MPI test suite:                              $running_mpi_check
+	       FFT Support:                                 $fft_support
+	       GCC plug-in:                                 $build_gcc_plugin
 	       GCC plug-in test suite (requires GNU Guile): $run_gcc_plugin_test_suite
 	       SOCL enabled:  $build_socl
                Scheduler Hypervisor: $build_sched_ctx_hypervisor
                SOCL test suite: $run_socl_check
-
 ])
 
 if test "$build_socl" = "yes" -a "$run_socl_check" = "no" ; then

+ 25 - 3
doc/chapters/advanced-examples.texi

@@ -381,13 +381,15 @@ tasks with varying size so that the regression can be computed. StarPU will not
 trust the regression unless there is at least 10% difference between the minimum
 and maximum observed input size. It can be useful to set the
 @code{STARPU_CALIBRATE} environment variable to @code{1} and run the application
-on varying input sizes, so as to feed the performance model for a variety of
-inputs, or to provide the measurements explictly by using
+on varying input sizes with @code{STARPU_SCHED} set to @code{eager} scheduler,
+so as to feed the performance model for a variety of
+inputs. The application can also provide the measurements explictly by using
 @code{starpu_perfmodel_update_history}. The @code{starpu_perfmodel_display} and
 @code{starpu_perfmodel_plot}
 tools can be used to observe how much the performance model is calibrated (@pxref{Performance model calibration}); when
 their output look good, @code{STARPU_CALIBRATE} can be reset to @code{0} to let
-StarPU use the resulting performance model without recording new measures. If
+StarPU use the resulting performance model without recording new measures, and
+@code{STARPU_SCHED} can be set to @code{heft} to benefit from the performance models. If
 the data input sizes vary a lot, it is really important to set
 @code{STARPU_CALIBRATE} to @code{0}, otherwise StarPU will continue adding the
 measures, and result with a very big performance model, which will take time a
@@ -724,6 +726,26 @@ int dots(starpu_data_handle_t v1, starpu_data_handle_t v2,
 The @code{cg} example also uses reduction for the blocked gemv kernel, leading
 to yet more relaxed dependencies and more parallelism.
 
+STARPU_REDUX can also be passed to @code{starpu_mpi_insert_task} in the MPI
+case. That will however not produce any MPI communication, but just pass
+STARPU_REDUX to the underlying @code{starpu_insert_task}. It is up to the
+application to call @code{starpu_mpi_redux_data}, which posts tasks that will
+reduce the partial results among MPI nodes into the MPI node which owns the
+data. For instance, some hypothetical application which collects partial results
+into data @code{res}, then uses it for other computation, before looping again
+with a new reduction:
+
+@smallexample
+@{
+    for (i = 0; i < 100; i++) @{
+        starpu_mpi_insert_task(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
+        starpu_mpi_insert_task(MPI_COMM_WORLD, &work, STARPU_RW, A, STARPU_R, B, STARPU_REDUX, res, 0);
+        starpu_mpi_redux_data(MPI_COMM_WORLD, res);
+        starpu_mpi_insert_task(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);
+    @}
+@}
+@end smallexample
+
 @node Temporary buffers
 @section Temporary buffers
 

+ 16 - 8
doc/chapters/basic-api.texi

@@ -417,7 +417,7 @@ extensions at least to define the initial value.  For now, data to be
 used in @code{SCRATCH} mode should be registered with node @code{-1} and
 a @code{NULL} pointer, since the value of the provided buffer is simply
 ignored for now.
-@item @code{STARPU_REDUX} reduction mode.
+@item @code{STARPU_REDUX}: reduction mode. TODO!
 @end table
 @end deftp
 
@@ -465,6 +465,11 @@ This is the same as starpu_data_unregister, except that StarPU does not put back
 a valid copy into the home node, in the buffer that was initially registered.
 @end deftypefun
 
+@deftypefun void starpu_data_unregister_submit (starpu_data_handle_t @var{handle})
+Destroy the data handle once it is not needed anymore by any submitted
+task. No coherency is assumed.
+@end deftypefun
+
 @deftypefun void starpu_data_invalidate (starpu_data_handle_t @var{handle})
 Destroy all replicates of the data handle. After data invalidation, the first
 access to the handle must be performed in write-only mode. Accessing an
@@ -752,11 +757,14 @@ Return the unique identifier of the interface associated with the given @var{han
 Return the size of the data associated with @var{handle}
 @end deftypefun
 
-@deftypefun int starpu_handle_pack_data (starpu_data_handle_t @var{handle}, {void **}@var{ptr})
-Allocates a buffer large enough at @var{ptr} and copy to the newly
-allocated buffer the data associated to @var{handle}. The interface of
-the data registered at @var{handle} must define a packing operation
-(@pxref{struct starpu_data_interface_ops}).
+@deftypefun int starpu_handle_pack_data (starpu_data_handle_t @var{handle}, {void **}@var{ptr}, {size_t *}@var{count})
+Execute the packing operation of the interface of the data registered
+at @var{handle} (@pxref{struct starpu_data_interface_ops}). This
+packing operation must allocate a buffer large enough at @var{ptr} and
+copy into the newly allocated buffer the data associated to
+@var{handle}.
+The function also sets @var{count} to the size of the data handle by calling
+@code{starpu_handle_get_size()}.
 @end deftypefun
 
 @deftypefun int starpu_handle_unpack_data (starpu_data_handle_t @var{handle}, {void *}@var{ptr})
@@ -2428,7 +2436,7 @@ This function returns a pointer to device properties for worker @var{workerid}
 (assumed to be a CUDA worker).
 @end deftypefun
 
-@deftypefun size_t starpu_cuda_get_global_mem_size (int @var{devid})
+@deftypefun size_t starpu_cuda_get_global_mem_size (unsigned @var{devid})
 Return the size of the global memory of CUDA device @var{devid}.
 @end deftypefun
 
@@ -2452,7 +2460,7 @@ successfull. It returns 0 if the synchronous copy was successful, or
 fails otherwise.
 @end deftypefun
 
-@deftypefun void starpu_cuda_set_device (int @var{devid})
+@deftypefun void starpu_cuda_set_device (unsigned @var{devid})
 Calls @code{cudaSetDevice(devid)} or @code{cudaGLSetGLDevice(devid)}, according to
 whether @code{devid} is among the @code{cuda_opengl_interoperability} field of
 the @code{starpu_conf} structure.

+ 11 - 9
doc/chapters/configuration.texi

@@ -172,11 +172,6 @@ enabled when the GCC compiler provides a plug-in support.
 Use the @command{mpicc} compiler at @var{path}, for starpumpi
 (@pxref{StarPU MPI support}).
 
-@item --enable-comm-stats
-@anchor{enable-comm-stats}
-Enable communication statistics for starpumpi (@pxref{StarPU MPI
-support}).
-
 @end table
 
 @node Advanced configuration
@@ -425,11 +420,17 @@ THE SOCL test suite is only run when the environment variable
 of the libOpenCL.so file of the OCL ICD implementation.
 
 @item @code{STARPU_COMM_STATS}
+@anchor{STARPU_COMM_STATS}
 Communication statistics for starpumpi (@pxref{StarPU MPI support})
 will be enabled when the environment variable @code{STARPU_COMM_STATS}
-is defined. The statistics can also be enabled by configuring StarPU
-with the option @code{--enable-comm-stats} (@pxref{enable-comm-stats}).
-
+is defined to an value other than 0.
+
+@item @code{STARPU_MPI_CACHE}
+@anchor{STARPU_MPI_CACHE}
+Communication cache for starpumpi (@pxref{StarPU MPI support}) will be
+disabled when the environment variable @code{STARPU_MPI_CACHE} is set
+to 0. It is enabled by default or for any other values of the variable
+@code{STARPU_MPI_CACHE}.
 @end table
 
 @node Misc
@@ -439,7 +440,8 @@ with the option @code{--enable-comm-stats} (@pxref{enable-comm-stats}).
 
 @item @code{STARPU_SILENT}
 This variable allows to disable verbose mode at runtime when StarPU
-has been configured with the option @code{--enable-verbose}.
+has been configured with the option @code{--enable-verbose}. It also
+disables the display of StarPU information and warning messages.
 
 @item @code{STARPU_LOGFILENAME}
 This variable specifies in which file the debugging output should be saved to.

+ 36 - 15
doc/chapters/mpi-support.texi

@@ -44,25 +44,33 @@ Also pass the @code{--static} option if the application is to be linked statical
 
 @subsection Initialisation
 
+@deftypefun int starpu_mpi_init (int *@var{argc}, char ***@var{argv})
+Initializes the starpumpi library. If MPI is not already initialized,
+it will be by calling @code{MPI_Init_Thread(argc, argv, MPI_THREAD_SERIALIZED, ...)}.
+@end deftypefun
+
 @deftypefun int starpu_mpi_initialize (void)
-Initializes the starpumpi library. This must be called between calling
-@code{starpu_init} and other @code{starpu_mpi} functions. This
-function does not call @code{MPI_Init}, it should be called beforehand.
+This function has been made deprecated. One should use instead the
+function @code{starpu_mpi_init()} defined above.
 @end deftypefun
 
 @deftypefun int starpu_mpi_initialize_extended (int *@var{rank}, int *@var{world_size})
-Initializes the starpumpi library. This must be called between calling
-@code{starpu_init} and other @code{starpu_mpi} functions.
-This function calls @code{MPI_Init}, and therefore should be prefered
-to the previous one for MPI implementations which are not thread-safe.
-Returns the current MPI node rank and world size.
+This function has been made deprecated. One should use instead the
+function @code{starpu_mpi_init()} defined above.
 @end deftypefun
 
 @deftypefun int starpu_mpi_shutdown (void)
 Cleans the starpumpi library. This must be called between calling
-@code{starpu_mpi} functions and @code{starpu_shutdown}.
-@code{MPI_Finalize} will be called if StarPU-MPI has been initialized
-by calling @code{starpu_mpi_initialize_extended}.
+@code{starpu_mpi} functions and @code{starpu_shutdown()}.
+@code{MPI_Finalize()} will be called if StarPU-MPI has been initialized
+by @code{starpu_mpi_init()}.
+@end deftypefun
+
+@deftypefun void starpu_mpi_comm_amounts_retrieve (size_t *@var{comm_amounts})
+Retrieve the current amount of communications from the current node in
+the array @code{comm_amounts} which must have a size greater or equal
+to the world size. Communications statistics must be enabled
+(@pxref{STARPU_COMM_STATS}).
 @end deftypefun
 
 @subsection Communication
@@ -375,8 +383,10 @@ latter receives them.
 written data back to their owners.
 @end enumerate
 
-The algorithm also includes a cache mechanism that allows not to send
-data twice to the same MPI node, unless the data has been modified.
+The algorithm also includes a communication cache mechanism that
+allows not to send data twice to the same MPI node, unless the data
+has been modified. The cache can be disabled
+(@pxref{STARPU_MPI_CACHE}).
 
 @end deftypefun
 
@@ -469,21 +479,32 @@ each task, only the MPI node which owns the data being written to (here,
 @code{data_handles[x][y]}) will actually run the task. The other MPI nodes will
 automatically send the required data.
 
+This can be a concern with a growing number of nodes. To avoid this, the
+application can prune the task for loops according to the data distribution,
+so as to only submit tasks on nodes which have to care about them (either to
+execute them, or to send the required data).
+
 @node MPI Collective Operations
 @section MPI Collective Operations
 
-@deftypefun int starpu_mpi_scatter_detached (starpu_data_handle_t *@var{data_handles}, int @var{count}, int @var{root}, MPI_Comm @var{comm})
+@deftypefun int starpu_mpi_scatter_detached (starpu_data_handle_t *@var{data_handles}, int @var{count}, int @var{root}, MPI_Comm @var{comm}, {void (*}@var{scallback})(void *), {void *}@var{sarg}, {void (*}@var{rcallback})(void *), {void *}@var{rarg})
 Scatter data among processes of the communicator based on the ownership of
 the data. For each data of the array @var{data_handles}, the
 process @var{root} sends the data to the process owning this data.
 Processes receiving data must have valid data handles to receive them.
+On completion of the collective communication, the @var{scallback} function is
+called with the argument @var{sarg} on the process @var{root}, the @var{rcallback} function is
+called with the argument @var{rarg} on any other process.
 @end deftypefun
 
-@deftypefun int starpu_mpi_gather_detached (starpu_data_handle_t *@var{data_handles}, int @var{count}, int @var{root}, MPI_Comm @var{comm})
+@deftypefun int starpu_mpi_gather_detached (starpu_data_handle_t *@var{data_handles}, int @var{count}, int @var{root}, MPI_Comm @var{comm}, {void (*}@var{scallback})(void *), {void *}@var{sarg}, {void (*}@var{rcallback})(void *), {void *}@var{rarg})
 Gather data from the different processes of the communicator onto the
 process @var{root}. Each process owning data handle in the array
 @var{data_handles} will send them to the process @var{root}. The
 process @var{root} must have valid data handles to receive the data.
+On completion of the collective communication, the @var{rcallback} function is
+called with the argument @var{rarg} on the process @var{root}, the @var{scallback} function is
+called with the argument @var{sarg} on any other process.
 @end deftypefun
 
 @page

+ 20 - 3
doc/starpu.css

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * Permission is granted to copy, distribute and/or modify this document
  * under the terms of the GNU Free Documentation License, Version 1.3
@@ -11,7 +11,7 @@
  */
 
 body {
-	font-size: 13px;
+	font-family: sans-serif;
 /*	margin-top: 0px; */
 }
 
@@ -39,7 +39,7 @@ div.node hr.node {
 h1 {
 	font: bold normal 2.5em sans-serif ;
 	margin: 0px;
-	color: #0020a0;
+	color: rgb(226,0,38);
 }
 h1.sub {
 	font: bold normal 2em sans-serif ;
@@ -141,3 +141,20 @@ p.updated {
 	font-size: 10px;
 	font-style: italic;
 }
+
+div.contents {
+	margin-top: 12px;
+	margin-bottom: 3px;
+	font-variant: small-caps;
+	padding-left: 1em;
+	padding-right: 1em;
+	padding-top: 1px;
+	padding-bottom: 1px;
+	margin-top: 12px;
+	margin-bottom: 12px;
+	margin-top:0px;
+	margin-left: auto;
+	margin-right: auto;
+	border-top: 4px solid rgb(204,209,222);
+	border-bottom: 4px solid rgb(204,209,222);
+}

+ 1 - 0
include/starpu_config.h.in

@@ -79,6 +79,7 @@
 #undef STARPU_HAVE_LIBNUMA
 
 #undef STARPU_HAVE_WINDOWS
+#undef STARPU_HAVE_UNSETENV
 
 #ifdef _MSC_VER
 typedef long starpu_ssize_t;

+ 3 - 7
include/starpu_data_interfaces.h

@@ -132,7 +132,7 @@ struct starpu_data_interface_ops
 	int is_multiformat;
 	struct starpu_multiformat_data_interface_ops* (*get_mf_ops)(void *data_interface);
 
-	/* Pack the data handle into a contiguous buffer at the address ptr */
+	/* Pack the data handle into a contiguous buffer at the address ptr and store the size of the buffer in count */
 	int (*pack_data)(starpu_data_handle_t handle, uint32_t node, void **ptr);
 	/* Unpack the data handle from the contiguous buffer at the address ptr */
 	int (*unpack_data)(starpu_data_handle_t handle, uint32_t node, void *ptr);
@@ -209,11 +209,7 @@ struct starpu_coo_interface
 	size_t    elemsize;
 };
 
-void
-starpu_coo_data_register(starpu_data_handle_t *handleptr, uint32_t home_node,
-			 uint32_t nx, uint32_t ny, uint32_t n_values,
-			 uint32_t *columns, uint32_t *rows,
-			 uintptr_t values, size_t elemsize);
+void starpu_coo_data_register(starpu_data_handle_t *handleptr, uint32_t home_node, uint32_t nx, uint32_t ny, uint32_t n_values, uint32_t *columns, uint32_t *rows, uintptr_t values, size_t elemsize);
 
 #define STARPU_COO_GET_COLUMNS(interface) \
 	(((struct starpu_coo_interface *)(interface))->columns)
@@ -438,7 +434,7 @@ void starpu_multiformat_data_register(starpu_data_handle_t *handle, uint32_t hom
 
 enum starpu_data_interface_id starpu_handle_get_interface_id(starpu_data_handle_t handle);
 
-int starpu_handle_pack_data(starpu_data_handle_t handle, void **ptr);
+int starpu_handle_pack_data(starpu_data_handle_t handle, void **ptr, size_t *count);
 int starpu_handle_unpack_data(starpu_data_handle_t handle, void *ptr);
 size_t starpu_handle_get_size(starpu_data_handle_t handle);
 

+ 1 - 2
libstarpu.pc.in

@@ -25,5 +25,4 @@ Version: @PACKAGE_VERSION@
 Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@ -DSTARPU_USE_DEPRECATED_API
 Libs: -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_LDFLAGS@ @STARPU_OPENCL_LDFLAGS@ @STARPU_SCHED_CTX_HYPERVISOR@
 Libs.private: @LDFLAGS@ @LIBS@
-Requires: @HWLOC_REQUIRES@
-#Requires.private: @GORDON_REQUIRES@
+Requires: @HWLOC_REQUIRES@

+ 0 - 30
mpi/examples/Makefile.am

@@ -153,36 +153,6 @@ check_PROGRAMS +=					\
 	cholesky/mpi_cholesky_distributed
 endif
 
-########################
-# Scatter Gather       #
-########################
-
-examplebin_PROGRAMS +=		\
-	scatter_gather/mpi_scatter_gather
-
-scatter_gather_mpi_scatter_gather_LDADD =	\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-
-check_PROGRAMS +=		\
-	scatter_gather/mpi_scatter_gather
-
-###################
-# Reduction       #
-###################
-
-examplebin_PROGRAMS +=		\
-	reduction/mpi_reduction
-
-reduction_mpi_reduction_SOURCES =		\
-	reduction/mpi_reduction.c		\
-	reduction/mpi_reduction_kernels.c
-
-reduction_mpi_reduction_LDADD =	\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-
-check_PROGRAMS +=		\
-	reduction/mpi_reduction
-
 ###################
 # complex example #
 ###################

+ 4 - 1
mpi/examples/cholesky/mpi_cholesky.c

@@ -43,7 +43,10 @@ int main(int argc, char **argv)
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	starpu_mpi_initialize_extended(&rank, &nodes);
+	starpu_mpi_init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
+
 	starpu_helper_cublas_init();
 
 	if (dblockx == -1 || dblocky == -1)

+ 4 - 1
mpi/examples/cholesky/mpi_cholesky_distributed.c

@@ -42,7 +42,10 @@ int main(int argc, char **argv)
 
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	starpu_mpi_initialize_extended(&rank, &nodes);
+	ret = starpu_mpi_init(&argc, &argv);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
 	starpu_helper_cublas_init();
 
 	if (dblockx == -1 || dblocky == -1)

+ 4 - 1
mpi/examples/complex/mpi_complex.c

@@ -25,7 +25,10 @@ int main(int argc, char **argv)
 
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	starpu_mpi_initialize_extended(&rank, &nodes);
+	ret = starpu_mpi_init(&argc, &argv);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
 
 	if (nodes < 2)
 	{

+ 2 - 4
mpi/examples/mpi_lu/plu_example.c

@@ -402,7 +402,6 @@ int main(int argc, char **argv)
 	int rank;
 	int world_size;
 
-#if 0
 	/*
 	 *	Initialization
 	 */
@@ -415,10 +414,9 @@ int main(int argc, char **argv)
 		fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
 	if (thread_support < MPI_THREAD_FUNNELED)
 		fprintf(stderr,"Warning: MPI does not have thread support!\n");
-	
+
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
-#endif
 
 	starpu_srand48((long int)time(NULL));
 
@@ -430,7 +428,7 @@ int main(int argc, char **argv)
 	/* We disable sequential consistency in this example */
 	starpu_data_set_default_sequential_consistency_flag(0);
 
-	starpu_mpi_initialize_extended(&rank, &world_size);
+	starpu_mpi_init(NULL, NULL);
 
 	STARPU_ASSERT(p*q == world_size);
 

+ 9 - 2
mpi/examples/stencil/stencil5.c

@@ -37,7 +37,11 @@ struct starpu_codelet stencil5_cl =
 	.modes = {STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
 };
 
-#define NITER_DEF 500
+#ifdef STARPU_QUICK_CHECK
+#  define NITER_DEF	10
+#else
+#  define NITER_DEF	500
+#endif
 #define X         20
 #define Y         20
 
@@ -78,7 +82,10 @@ int main(int argc, char **argv)
 
 	int ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	starpu_mpi_initialize_extended(&my_rank, &size);
+	starpu_mpi_init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
         parse_args(argc, argv);
 
         for(x = 0; x < X; x++)

+ 8 - 4
mpi/include/starpu_mpi.h

@@ -39,8 +39,10 @@ int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int
 int starpu_mpi_wait(starpu_mpi_req *req, MPI_Status *status);
 int starpu_mpi_test(starpu_mpi_req *req, int *flag, MPI_Status *status);
 int starpu_mpi_barrier(MPI_Comm comm);
-int starpu_mpi_initialize(void);
-int starpu_mpi_initialize_extended(int *rank, int *world_size);
+int starpu_mpi_init(int *argc, char ***argv);
+
+int starpu_mpi_initialize(void) STARPU_DEPRECATED;
+int starpu_mpi_initialize_extended(int *rank, int *world_size) STARPU_DEPRECATED;
 int starpu_mpi_shutdown(void);
 
 int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...);
@@ -48,8 +50,8 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg);
 void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle);
 
-int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm);
-int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm);
+int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg);
+int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg);
 
 /* Some helper functions */
 
@@ -62,6 +64,8 @@ int starpu_mpi_irecv_detached_unlock_tag(starpu_data_handle_t data_handle, int s
 int starpu_mpi_isend_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *dest, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag);
 int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *source, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag);
 
+/* retrieve the current amount of communications from the current node */
+void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts);
 #ifdef __cplusplus
 }
 #endif

+ 239 - 210
mpi/src/starpu_mpi.c

@@ -28,8 +28,11 @@
  * configuration) */
 //#define USE_STARPU_ACTIVITY	1
 
-static void submit_mpi_req(void *arg);
-static void handle_request_termination(struct _starpu_mpi_req *req);
+static void _starpu_mpi_submit_new_mpi_request(void *arg);
+static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req);
+#ifdef STARPU_MPI_VERBOSE
+static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type);
+#endif
 
 /* The list of requests that have been newly submitted by the application */
 static struct _starpu_mpi_req_list *new_requests;
@@ -50,52 +53,27 @@ static int running = 0;
 static pthread_mutex_t mutex_posted_requests;
 static int posted_requests = 0, newer_requests, barrier_running = 0;
 
-#define INC_POSTED_REQUESTS(value) { _STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; _STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); }
+#define _STARPU_MPI_INC_POSTED_REQUESTS(value) { _STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; _STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); }
 
-/*
- *	Isend
- */
 
-static void starpu_mpi_isend_func(struct _starpu_mpi_req *req)
+/********************************************************/
+/*                                                      */
+/*  Send/Receive functionalities                        */
+/*                                                      */
+/********************************************************/
+
+static struct _starpu_mpi_req *_starpu_mpi_isend_irecv_common(starpu_data_handle_t data_handle,
+							      int srcdst, int mpi_tag, MPI_Comm comm,
+							      unsigned detached, void (*callback)(void *), void *arg,
+							      enum _starpu_mpi_request_type request_type, void (*func)(struct _starpu_mpi_req *),
+							      enum starpu_access_mode mode)
 {
-	int count;
 
         _STARPU_MPI_LOG_IN();
-
-	req->needs_unpacking = starpu_mpi_handle_to_datatype(req->data_handle, &req->datatype, &count);
-	if (req->needs_unpacking)
-		starpu_handle_pack_data(req->data_handle, &req->ptr);
-	else
-		req->ptr = starpu_handle_get_local_ptr(req->data_handle);
-	STARPU_ASSERT(req->ptr);
-
-        _STARPU_MPI_DEBUG("post MPI isend tag %d dst %d ptr %p datatype %p count %d req %p\n", req->mpi_tag, req->srcdst, req->ptr, req->datatype, count, &req->request);
-
-	_starpu_mpi_comm_amounts_inc(req->comm, req->srcdst, req->datatype, count);
-
-        req->ret = MPI_Isend(req->ptr, count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
-        STARPU_ASSERT(req->ret == MPI_SUCCESS);
-
-	TRACE_MPI_ISEND(req->srcdst, req->mpi_tag, 0);
-
-	/* somebody is perhaps waiting for the MPI request to be posted */
-	_STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
-	req->submitted = 1;
-	_STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
-	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
-        _STARPU_MPI_LOG_OUT();
-}
-
-static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t data_handle,
-							int dest, int mpi_tag, MPI_Comm comm,
-							unsigned detached, void (*callback)(void *), void *arg)
-{
 	struct _starpu_mpi_req *req = calloc(1, sizeof(struct _starpu_mpi_req));
 	STARPU_ASSERT(req);
 
-        _STARPU_MPI_LOG_IN();
-
-        INC_POSTED_REQUESTS(1);
+        _STARPU_MPI_INC_POSTED_REQUESTS(1);
 
 	/* Initialize the request structure */
 	req->submitted = 0;
@@ -103,27 +81,74 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t dat
 	_STARPU_PTHREAD_MUTEX_INIT(&req->req_mutex, NULL);
 	_STARPU_PTHREAD_COND_INIT(&req->req_cond, NULL);
 
-	req->request_type = SEND_REQ;
+	req->request_type = request_type;
 
 	req->data_handle = data_handle;
-	req->srcdst = dest;
+	req->srcdst = srcdst;
 	req->mpi_tag = mpi_tag;
 	req->comm = comm;
-	req->func = starpu_mpi_isend_func;
 
 	req->detached = detached;
 	req->callback = callback;
 	req->callback_arg = arg;
 
+	req->func = func;
+
 	/* Asynchronously request StarPU to fetch the data in main memory: when
-	 * it is available in main memory, submit_mpi_req(req) is called and
+	 * it is available in main memory, _starpu_mpi_submit_new_mpi_request(req) is called and
 	 * the request is actually submitted  */
-	starpu_data_acquire_cb(data_handle, STARPU_R, submit_mpi_req, (void *)req);
+	starpu_data_acquire_cb(data_handle, mode, _starpu_mpi_submit_new_mpi_request, (void *)req);
 
         _STARPU_MPI_LOG_OUT();
 	return req;
 }
 
+/********************************************************/
+/*                                                      */
+/*  Send functionalities                                */
+/*                                                      */
+/********************************************************/
+
+static void _starpu_mpi_isend_func(struct _starpu_mpi_req *req)
+{
+        _STARPU_MPI_LOG_IN();
+
+	req->needs_unpacking = starpu_mpi_handle_to_datatype(req->data_handle, &req->datatype);
+	if (req->needs_unpacking)
+	{
+		starpu_handle_pack_data(req->data_handle, &req->ptr, &req->count);
+	}
+	else
+	{
+		req->count = 1;
+		req->ptr = starpu_handle_get_local_ptr(req->data_handle);
+	}
+	STARPU_ASSERT(req->ptr);
+
+        _STARPU_MPI_DEBUG("post MPI isend tag %d dst %d ptr %p datatype %p count %d req %p\n", req->mpi_tag, req->srcdst, req->ptr, req->datatype, req->count, &req->request);
+
+	_starpu_mpi_comm_amounts_inc(req->comm, req->srcdst, req->datatype, req->count);
+
+        req->ret = MPI_Isend(req->ptr, req->count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
+        STARPU_ASSERT(req->ret == MPI_SUCCESS);
+
+	TRACE_MPI_ISEND(req->srcdst, req->mpi_tag, 0);
+
+	/* somebody is perhaps waiting for the MPI request to be posted */
+	_STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
+	req->submitted = 1;
+	_STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
+        _STARPU_MPI_LOG_OUT();
+}
+
+static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t data_handle,
+							int dest, int mpi_tag, MPI_Comm comm,
+							unsigned detached, void (*callback)(void *), void *arg)
+{
+	return _starpu_mpi_isend_irecv_common(data_handle, dest, mpi_tag, comm, detached, callback, arg, SEND_REQ, _starpu_mpi_isend_func, STARPU_R);
+}
+
 int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, int mpi_tag, MPI_Comm comm)
 {
         _STARPU_MPI_LOG_IN();
@@ -139,12 +164,8 @@ int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *public_re
 	return 0;
 }
 
-/*
- *	Isend (detached)
- */
-
 int starpu_mpi_isend_detached(starpu_data_handle_t data_handle,
-				int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
+			      int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 {
         _STARPU_MPI_LOG_IN();
 	_starpu_mpi_isend_common(data_handle, dest, mpi_tag, comm, 1, callback, arg);
@@ -153,26 +174,47 @@ int starpu_mpi_isend_detached(starpu_data_handle_t data_handle,
 	return 0;
 }
 
-/*
- *	Irecv
- */
-
-static void starpu_mpi_irecv_func(struct _starpu_mpi_req *req)
+int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm)
 {
-	int count;
+	starpu_mpi_req req;
+	MPI_Status status;
 
         _STARPU_MPI_LOG_IN();
+	memset(&status, 0, sizeof(MPI_Status));
 
-	req->needs_unpacking = starpu_mpi_handle_to_datatype(req->data_handle, &req->datatype, &count);
+	starpu_mpi_isend(data_handle, &req, dest, mpi_tag, comm);
+	starpu_mpi_wait(&req, &status);
+
+        _STARPU_MPI_LOG_OUT();
+	return 0;
+}
+
+/********************************************************/
+/*                                                      */
+/*  Receive functionalities                             */
+/*                                                      */
+/********************************************************/
+
+static void _starpu_mpi_irecv_func(struct _starpu_mpi_req *req)
+{
+        _STARPU_MPI_LOG_IN();
+
+	req->needs_unpacking = starpu_mpi_handle_to_datatype(req->data_handle, &req->datatype);
 	if (req->needs_unpacking == 1)
-		req->ptr = malloc(count);
+	{
+		req->count = starpu_handle_get_size(req->data_handle);
+		req->ptr = malloc(req->count);
+	}
 	else
+	{
+		req->count = 1;
 		req->ptr = starpu_handle_get_local_ptr(req->data_handle);
+	}
 	STARPU_ASSERT(req->ptr);
 
 	_STARPU_MPI_DEBUG("post MPI irecv tag %d src %d data %p ptr %p req %p datatype %p\n", req->mpi_tag, req->srcdst, req->data_handle, req->ptr, &req->request, req->datatype);
 
-        req->ret = MPI_Irecv(req->ptr, count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
+        req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
         STARPU_ASSERT(req->ret == MPI_SUCCESS);
 
 	/* somebody is perhaps waiting for the MPI request to be posted */
@@ -185,37 +227,7 @@ static void starpu_mpi_irecv_func(struct _starpu_mpi_req *req)
 
 static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, unsigned detached, void (*callback)(void *), void *arg)
 {
-        _STARPU_MPI_LOG_IN();
-	struct _starpu_mpi_req *req = calloc(1, sizeof(struct _starpu_mpi_req));
-	STARPU_ASSERT(req);
-
-        INC_POSTED_REQUESTS(1);
-
-	/* Initialize the request structure */
-	req->submitted = 0;
-	_STARPU_PTHREAD_MUTEX_INIT(&req->req_mutex, NULL);
-	_STARPU_PTHREAD_COND_INIT(&req->req_cond, NULL);
-
-	req->request_type = RECV_REQ;
-
-	req->data_handle = data_handle;
-	req->srcdst = source;
-	req->mpi_tag = mpi_tag;
-	req->comm = comm;
-
-	req->detached = detached;
-	req->callback = callback;
-	req->callback_arg = arg;
-
-	req->func = starpu_mpi_irecv_func;
-
-	/* Asynchronously request StarPU to fetch the data in main memory: when
-	 * it is available in main memory, submit_mpi_req(req) is called and
-	 * the request is actually submitted  */
-	starpu_data_acquire_cb(data_handle, STARPU_W, submit_mpi_req, (void *)req);
-
-        _STARPU_MPI_LOG_OUT();
-	return req;
+	return _starpu_mpi_isend_irecv_common(data_handle, source, mpi_tag, comm, detached, callback, arg, RECV_REQ, _starpu_mpi_irecv_func, STARPU_W);
 }
 
 int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int source, int mpi_tag, MPI_Comm comm)
@@ -233,24 +245,14 @@ int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_re
 	return 0;
 }
 
-/*
- *	Irecv (detached)
- */
-
 int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 {
         _STARPU_MPI_LOG_IN();
 	_starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 1, callback, arg);
-
         _STARPU_MPI_LOG_OUT();
 	return 0;
 }
 
-
-/*
- *	Recv
- */
-
 int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, MPI_Status *status)
 {
 	starpu_mpi_req req;
@@ -263,30 +265,13 @@ int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, M
 	return 0;
 }
 
-/*
- *	Send
- */
-
-int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm)
-{
-	starpu_mpi_req req;
-	MPI_Status status;
-
-        _STARPU_MPI_LOG_IN();
-	memset(&status, 0, sizeof(MPI_Status));
-
-	starpu_mpi_isend(data_handle, &req, dest, mpi_tag, comm);
-	starpu_mpi_wait(&req, &status);
-
-        _STARPU_MPI_LOG_OUT();
-	return 0;
-}
-
-/*
- *	Wait
- */
+/********************************************************/
+/*                                                      */
+/*  Wait functionalities                                */
+/*                                                      */
+/********************************************************/
 
-static void starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
+static void _starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
 {
         _STARPU_MPI_LOG_IN();
 	/* Which is the mpi request we are waiting for ? */
@@ -295,7 +280,7 @@ static void starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
 	req->ret = MPI_Wait(&req->request, waiting_req->status);
         STARPU_ASSERT(req->ret == MPI_SUCCESS);
 
-	handle_request_termination(req);
+	_starpu_mpi_handle_request_termination(req);
         _STARPU_MPI_LOG_OUT();
 }
 
@@ -307,7 +292,7 @@ int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 	STARPU_ASSERT(waiting_req);
 	struct _starpu_mpi_req *req = *public_req;
 
-        INC_POSTED_REQUESTS(1);
+        _STARPU_MPI_INC_POSTED_REQUESTS(1);
 
 	/* We cannot try to complete a MPI request that was not actually posted
 	 * to MPI yet. */
@@ -321,10 +306,10 @@ int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 	_STARPU_PTHREAD_COND_INIT(&(waiting_req->req_cond), NULL);
 	waiting_req->status = status;
 	waiting_req->other_request = req;
-	waiting_req->func = starpu_mpi_wait_func;
+	waiting_req->func = _starpu_mpi_wait_func;
 	waiting_req->request_type = WAIT_REQ;
 
-	submit_mpi_req(waiting_req);
+	_starpu_mpi_submit_new_mpi_request(waiting_req);
 
 	/* We wait for the MPI request to finish */
 	_STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
@@ -343,24 +328,26 @@ int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 	return ret;
 }
 
-/*
- * 	Test
- */
+/********************************************************/
+/*                                                      */
+/*  Test functionalities                                */
+/*                                                      */
+/********************************************************/
 
-static void starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
+static void _starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 {
         _STARPU_MPI_LOG_IN();
 	/* Which is the mpi request we are testing for ? */
 	struct _starpu_mpi_req *req = testing_req->other_request;
 
-        _STARPU_MPI_DEBUG("Test request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, (req->request_type == RECV_REQ)?"recv : source":"send : dest", req->srcdst);
+        _STARPU_MPI_DEBUG("Test request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, _starpu_mpi_request_type(req->request_type), req->srcdst);
 	req->ret = MPI_Test(&req->request, testing_req->flag, testing_req->status);
         STARPU_ASSERT(req->ret == MPI_SUCCESS);
 
 	if (*testing_req->flag)
 	{
 		testing_req->ret = req->ret;
-		handle_request_termination(req);
+		_starpu_mpi_handle_request_termination(req);
 	}
 
 	_STARPU_PTHREAD_MUTEX_LOCK(&testing_req->req_mutex);
@@ -397,12 +384,12 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 		testing_req->flag = flag;
 		testing_req->status = status;
 		testing_req->other_request = req;
-		testing_req->func = starpu_mpi_test_func;
+		testing_req->func = _starpu_mpi_test_func;
 		testing_req->completed = 0;
                 testing_req->request_type = TEST_REQ;
 
-                INC_POSTED_REQUESTS(1);
-                submit_mpi_req(testing_req);
+                _STARPU_MPI_INC_POSTED_REQUESTS(1);
+                _starpu_mpi_submit_new_mpi_request(testing_req);
 
 		/* We wait for the test request to finish */
 		_STARPU_PTHREAD_MUTEX_LOCK(&(testing_req->req_mutex));
@@ -421,7 +408,8 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 			free(req);
 		}
 	}
-	else {
+	else
+	{
 		*flag = 0;
 	}
 
@@ -429,18 +417,20 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 	return ret;
 }
 
-/*
- *	Barrier
- */
+/********************************************************/
+/*                                                      */
+/*  Barrier functionalities                             */
+/*                                                      */
+/********************************************************/
 
-static void starpu_mpi_barrier_func(struct _starpu_mpi_req *barrier_req)
+static void _starpu_mpi_barrier_func(struct _starpu_mpi_req *barrier_req)
 {
         _STARPU_MPI_LOG_IN();
 
 	barrier_req->ret = MPI_Barrier(barrier_req->comm);
         STARPU_ASSERT(barrier_req->ret == MPI_SUCCESS);
 
-	handle_request_termination(barrier_req);
+	_starpu_mpi_handle_request_termination(barrier_req);
         _STARPU_MPI_LOG_OUT();
 }
 
@@ -457,7 +447,8 @@ int starpu_mpi_barrier(MPI_Comm comm)
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	STARPU_ASSERT_MSG(!barrier_running, "Concurrent starpu_mpi_barrier is not implemented, even on different communicators");
 	barrier_running = 1;
-	do {
+	do
+	{
 		while (posted_requests)
 			/* Wait for all current MPI requests to finish */
 			_STARPU_PTHREAD_COND_WAIT(&cond_finished, &mutex);
@@ -477,12 +468,12 @@ int starpu_mpi_barrier(MPI_Comm comm)
 	/* Initialize the request structure */
 	_STARPU_PTHREAD_MUTEX_INIT(&(barrier_req->req_mutex), NULL);
 	_STARPU_PTHREAD_COND_INIT(&(barrier_req->req_cond), NULL);
-	barrier_req->func = starpu_mpi_barrier_func;
+	barrier_req->func = _starpu_mpi_barrier_func;
 	barrier_req->request_type = BARRIER_REQ;
 	barrier_req->comm = comm;
 
-        INC_POSTED_REQUESTS(1);
-	submit_mpi_req(barrier_req);
+        _STARPU_MPI_INC_POSTED_REQUESTS(1);
+	_starpu_mpi_submit_new_mpi_request(barrier_req);
 
 	/* We wait for the MPI request to finish */
 	_STARPU_PTHREAD_MUTEX_LOCK(&barrier_req->req_mutex);
@@ -497,31 +488,34 @@ int starpu_mpi_barrier(MPI_Comm comm)
 	return ret;
 }
 
-/*
- *	Requests
- */
+/********************************************************/
+/*                                                      */
+/*  Progression                                         */
+/*                                                      */
+/********************************************************/
 
 #ifdef STARPU_MPI_VERBOSE
-static char *starpu_mpi_request_type(unsigned request_type)
+static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type)
 {
         switch (request_type)
                 {
-                case SEND_REQ: return "send";
-                case RECV_REQ: return "recv";
-                case WAIT_REQ: return "wait";
-                case TEST_REQ: return "test";
-                case BARRIER_REQ: return "barrier";
+                case SEND_REQ: return "SEND_REQ";
+                case RECV_REQ: return "RECV_REQ";
+                case WAIT_REQ: return "WAIT_REQ";
+                case TEST_REQ: return "TEST_REQ";
+                case BARRIER_REQ: return "BARRIER_REQ";
                 default: return "unknown request type";
                 }
 }
 #endif
 
-static void handle_request_termination(struct _starpu_mpi_req *req)
+static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 {
         _STARPU_MPI_LOG_IN();
 
-	_STARPU_MPI_DEBUG("complete MPI (%s %d) data %p req %p - tag %d\n", starpu_mpi_request_type(req->request_type), req->srcdst, req->data_handle, &req->request, req->mpi_tag);
-        if (req->request_type != BARRIER_REQ) {
+	_STARPU_MPI_DEBUG("complete MPI (%s %d) data %p req %p - tag %d\n", _starpu_mpi_request_type(req->request_type), req->srcdst, req->data_handle, &req->request, req->mpi_tag);
+        if (req->request_type != BARRIER_REQ)
+	{
 		if (req->needs_unpacking)
 			starpu_handle_unpack_data(req->data_handle, req->ptr);
 		else
@@ -547,28 +541,24 @@ static void handle_request_termination(struct _starpu_mpi_req *req)
         _STARPU_MPI_LOG_OUT();
 }
 
-static void submit_mpi_req(void *arg)
+static void _starpu_mpi_submit_new_mpi_request(void *arg)
 {
         _STARPU_MPI_LOG_IN();
 	struct _starpu_mpi_req *req = arg;
 
-        INC_POSTED_REQUESTS(-1);
+        _STARPU_MPI_INC_POSTED_REQUESTS(-1);
 
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	_starpu_mpi_req_list_push_front(new_requests, req);
 	newer_requests = 1;
-        _STARPU_MPI_DEBUG("Pushing new request type %d\n", req->request_type);
+        _STARPU_MPI_DEBUG("Pushing new request type %s\n", _starpu_mpi_request_type(req->request_type));
 	_STARPU_PTHREAD_COND_BROADCAST(&cond_progression);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
         _STARPU_MPI_LOG_OUT();
 }
 
-/*
- *	Scheduler hook
- */
-
 #ifdef USE_STARPU_ACTIVITY
-static unsigned progression_hook_func(void *arg __attribute__((unused)))
+static unsigned _starpu_mpi_progression_hook_func(void *arg __attribute__((unused)))
 {
 	unsigned may_block = 1;
 
@@ -584,11 +574,7 @@ static unsigned progression_hook_func(void *arg __attribute__((unused)))
 }
 #endif
 
-/*
- *	Progression loop
- */
-
-static void test_detached_requests(void)
+static void _starpu_mpi_test_detached_requests(void)
 {
         _STARPU_MPI_LOG_IN();
 	int flag;
@@ -605,13 +591,13 @@ static void test_detached_requests(void)
 
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 
-                //_STARPU_MPI_DEBUG("Test detached request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, (req->request_type == RECV_REQ)?"recv : source":"send : dest", req->srcdst);
+                //_STARPU_MPI_DEBUG("Test detached request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, _starpu_mpi_request_type(req->request_type), req->srcdst);
 		req->ret = MPI_Test(&req->request, &flag, &status);
 		STARPU_ASSERT(req->ret == MPI_SUCCESS);
 
 		if (flag)
 		{
-			handle_request_termination(req);
+			_starpu_mpi_handle_request_termination(req);
 		}
 
 		_STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
@@ -631,13 +617,13 @@ static void test_detached_requests(void)
         _STARPU_MPI_LOG_OUT();
 }
 
-static void handle_new_request(struct _starpu_mpi_req *req)
+static void _starpu_mpi_handle_new_request(struct _starpu_mpi_req *req)
 {
         _STARPU_MPI_LOG_IN();
 	STARPU_ASSERT(req);
 
 	/* submit the request to MPI */
-        _STARPU_MPI_DEBUG("Handling new request type %d\n", req->request_type);
+        _STARPU_MPI_DEBUG("Handling new request type %s\n", _starpu_mpi_request_type(req->request_type));
 	req->func(req);
 
 	if (req->detached)
@@ -657,29 +643,57 @@ static void handle_new_request(struct _starpu_mpi_req *req)
         _STARPU_MPI_LOG_OUT();
 }
 
-static void *progress_thread_func(void *arg)
+struct _starpu_mpi_argc_argv
 {
-        int initialize_mpi = *((int *) arg);
+	int *argc;
+	char ***argv;
+};
 
-        _STARPU_DEBUG("Initialize mpi: %d\n", initialize_mpi);
+static void _starpu_mpi_print_thread_level_support(int thread_level, char *msg)
+{
+     switch (thread_level)
+     {
+     case MPI_THREAD_SERIALIZED:
+     {
+	  _STARPU_DISP("MPI%s MPI_THREAD_SERIALIZED; Multiple threads may make MPI calls, but only one at a time.\n", msg);
+	  break;
+     }
+     case MPI_THREAD_FUNNELED:
+     {
+	  _STARPU_DISP("MPI%s MPI_THREAD_FUNNELED; The application can safely make calls to StarPU-MPI functions, but should not call directly MPI communication functions.\n", msg);
+	  break;
+     }
+     case MPI_THREAD_SINGLE:
+     {
+	  _STARPU_DISP("MPI%s MPI_THREAD_SINGLE; MPI does not have multi-thread support, this might cause problems. The application can make calls to StarPU-MPI functions, but not call directly MPI Communication functions.\n", msg);
+	  break;
+     }
+     }
+}
 
-        if (initialize_mpi) {
-#ifdef STARPU_DEVEL
-#warning get real argc and argv from the application
-#endif
-                int argc = 0;
-                char **argv = NULL;
-                int thread_support;
+static void *_starpu_mpi_progress_thread_func(void *arg)
+{
+	struct _starpu_mpi_argc_argv *argc_argv = (struct _starpu_mpi_argc_argv *) arg;
+	int flag;
+
+	MPI_Initialized(&flag);
+	_STARPU_DEBUG("MPI_Initialized %d\n", flag);
+	if (flag == 0)
+	{
+		int thread_support;
                 _STARPU_DEBUG("Calling MPI_Init_thread\n");
-                if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS) {
-                        fprintf(stderr,"MPI_Init_thread failed\n");
-                        exit(1);
+		if (MPI_Init_thread(argc_argv->argc, argc_argv->argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS)
+		{
+			_STARPU_ERROR("MPI_Init_thread failed\n");
                 }
-                if (thread_support == MPI_THREAD_FUNNELED)
-                        fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
-                if (thread_support < MPI_THREAD_FUNNELED)
-                        fprintf(stderr,"Warning: MPI does not have thread support!\n");
+		_starpu_mpi_print_thread_level_support(thread_support, "_Init_thread level =");
         }
+	else
+	{
+	     int provided;
+	     MPI_Query_thread(&provided);
+	     _starpu_mpi_print_thread_level_support(provided, " has been initialized with");
+	}
 
 	/* notify the main thread that the progression thread is ready */
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
@@ -688,7 +702,8 @@ static void *progress_thread_func(void *arg)
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
-	while (running || posted_requests || !(_starpu_mpi_req_list_empty(new_requests)) || !(_starpu_mpi_req_list_empty(detached_requests))) {
+	while (running || posted_requests || !(_starpu_mpi_req_list_empty(new_requests)) || !(_starpu_mpi_req_list_empty(detached_requests)))
+	{
 		/* shall we block ? */
 		unsigned block = _starpu_mpi_req_list_empty(new_requests);
 
@@ -707,7 +722,7 @@ static void *progress_thread_func(void *arg)
 
 		/* test whether there are some terminated "detached request" */
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-		test_detached_requests();
+		_starpu_mpi_test_detached_requests();
 		_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
 		/* get one request */
@@ -721,7 +736,7 @@ static void *progress_thread_func(void *arg)
 			 * application submit requests in the meantime, so we
 			 * release the lock.  */
 			_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-			handle_new_request(req);
+			_starpu_mpi_handle_new_request(req);
 			_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 		}
 	}
@@ -730,19 +745,23 @@ static void *progress_thread_func(void *arg)
 	STARPU_ASSERT(_starpu_mpi_req_list_empty(new_requests));
         STARPU_ASSERT(posted_requests == 0);
 
-        if (initialize_mpi) {
+        if (flag == 0)
+	{
                 _STARPU_MPI_DEBUG("Calling MPI_Finalize()\n");
                 MPI_Finalize();
         }
 
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
+	free(argc_argv);
 	return NULL;
 }
 
-/*
- *	(De)Initialization methods
- */
+/********************************************************/
+/*                                                      */
+/*  (De)Initialization methods                          */
+/*                                                      */
+/********************************************************/
 
 #ifdef USE_STARPU_ACTIVITY
 static int hookid = - 1;
@@ -780,7 +799,7 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 }
 
 static
-int _starpu_mpi_initialize(int initialize_mpi, int *rank, int *world_size)
+int _starpu_mpi_initialize(int *argc, char ***argv)
 {
 	_STARPU_PTHREAD_MUTEX_INIT(&mutex, NULL);
 	_STARPU_PTHREAD_COND_INIT(&cond_progression, NULL);
@@ -792,20 +811,16 @@ int _starpu_mpi_initialize(int initialize_mpi, int *rank, int *world_size)
 
         _STARPU_PTHREAD_MUTEX_INIT(&mutex_posted_requests, NULL);
 
-	_STARPU_PTHREAD_CREATE(&progress_thread, NULL,
-			       progress_thread_func, (void *)&initialize_mpi);
+	struct _starpu_mpi_argc_argv *argc_argv = malloc(sizeof(struct _starpu_mpi_argc_argv));
+	argc_argv->argc = argc;
+	argc_argv->argv = argv;
+	_STARPU_PTHREAD_CREATE(&progress_thread, NULL, _starpu_mpi_progress_thread_func, argc_argv);
 
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	while (!running)
 		_STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
-        if (rank && world_size) {
-                _STARPU_DEBUG("Calling MPI_Comm_rank\n");
-                MPI_Comm_rank(MPI_COMM_WORLD, rank);
-                MPI_Comm_size(MPI_COMM_WORLD, world_size);
-        }
-
 #ifdef STARPU_USE_FXT
 	int prank;
 	MPI_Comm_rank(MPI_COMM_WORLD, &prank);
@@ -823,14 +838,28 @@ int _starpu_mpi_initialize(int initialize_mpi, int *rank, int *world_size)
 	return 0;
 }
 
+int starpu_mpi_init(int *argc, char ***argv)
+{
+        return _starpu_mpi_initialize(argc, argv);
+}
+
 int starpu_mpi_initialize(void)
 {
-        return _starpu_mpi_initialize(0, NULL, NULL);
+        return _starpu_mpi_initialize(NULL, NULL);
 }
 
 int starpu_mpi_initialize_extended(int *rank, int *world_size)
 {
-        return _starpu_mpi_initialize(1, rank, world_size);
+	int ret;
+
+        ret = _starpu_mpi_initialize(NULL, NULL);
+	if (ret == 0)
+	{
+		_STARPU_DEBUG("Calling MPI_Comm_rank\n");
+		MPI_Comm_rank(MPI_COMM_WORLD, rank);
+		MPI_Comm_size(MPI_COMM_WORLD, world_size);
+	}
+	return ret;
 }
 
 int starpu_mpi_shutdown(void)

+ 104 - 7
mpi/src/starpu_mpi_collective.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,14 +17,72 @@
 #include <mpi.h>
 #include <starpu.h>
 #include <starpu_mpi.h>
+#include <starpu_mpi_private.h>
 
-int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm)
+struct _callback_arg
+{
+	void (*callback)(void *);
+	void *arg;
+	int nb;
+	int count;
+};
+
+void _callback_collective(void *arg)
+{
+	struct _callback_arg *callback_arg = arg;
+	callback_arg->nb ++;
+	if (callback_arg->nb == callback_arg->count)
+	{
+		callback_arg->callback(callback_arg->arg);
+	}
+}
+
+int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
 {
 	int rank;
 	int x;
+	struct _callback_arg *callback_arg;
+	void (*callback_func)(void *);
 
 	MPI_Comm_rank(comm, &rank);
 
+#ifdef STARPU_DEVEL
+#warning callback_arg needs to be free-ed
+#endif
+	callback_func = _callback_collective;
+	callback_arg = malloc(sizeof(struct _callback_arg));
+	callback_arg->count = 0;
+	callback_arg->nb = 0;
+	callback_arg->callback = (rank == root) ? scallback : rcallback;
+	callback_arg->arg = (rank == root) ? sarg : rarg;
+	if (callback_arg->callback == NULL)
+	{
+		free(callback_arg);
+		callback_arg = NULL;
+		callback_func = NULL;
+	}
+
+	if (callback_arg)
+	{
+		for(x = 0; x < count ;  x++)
+		{
+			if (data_handles[x])
+			{
+				int owner = starpu_data_get_rank(data_handles[x]);
+				int mpi_tag = starpu_data_get_tag(data_handles[x]);
+				STARPU_ASSERT(mpi_tag >= 0);
+				if ((rank == root) && (owner != root))
+				{
+					callback_arg->count ++;
+				}
+				if ((rank != root) && (owner == rank))
+				{
+					callback_arg->count ++;
+				}
+			}
+		}
+	}
+
 	for(x = 0; x < count ;  x++)
 	{
 		if (data_handles[x])
@@ -35,25 +93,64 @@ int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, i
 			if ((rank == root) && (owner != root))
 			{
 				//fprintf(stderr, "[%d] Sending data[%d] to %d\n", rank, x, owner);
-				starpu_mpi_isend_detached(data_handles[x], owner, mpi_tag, comm, NULL, NULL);
+				starpu_mpi_isend_detached(data_handles[x], owner, mpi_tag, comm, callback_func, callback_arg);
 			}
 			if ((rank != root) && (owner == rank))
 			{
 				//fprintf(stderr, "[%d] Receiving data[%d] from %d\n", rank, x, root);
-				starpu_mpi_irecv_detached(data_handles[x], root, mpi_tag, comm, NULL, NULL);
+				starpu_mpi_irecv_detached(data_handles[x], root, mpi_tag, comm, callback_func, callback_arg);
 			}
 		}
 	}
 	return 0;
 }
 
-int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm)
+int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
 {
 	int rank;
 	int x;
+	struct _callback_arg *callback_arg;
+	void (*callback_func)(void *);
 
 	MPI_Comm_rank(comm, &rank);
 
+#ifdef STARPU_DEVEL
+#warning callback_arg needs to be free-ed
+#endif
+	callback_func = _callback_collective;
+	callback_arg = malloc(sizeof(struct _callback_arg));
+	callback_arg->count = 0;
+	callback_arg->nb = 0;
+	callback_arg->callback = (rank == root) ? scallback : rcallback;
+	callback_arg->arg = (rank == root) ? sarg : rarg;
+	if (callback_arg->callback == NULL)
+	{
+		free(callback_arg);
+		callback_arg = NULL;
+		callback_func = NULL;
+	}
+
+	if (callback_arg)
+	{
+		for(x = 0; x < count ;  x++)
+		{
+			if (data_handles[x])
+			{
+				int owner = starpu_data_get_rank(data_handles[x]);
+				int mpi_tag = starpu_data_get_tag(data_handles[x]);
+				STARPU_ASSERT(mpi_tag >= 0);
+				if ((rank == root) && (owner != root))
+				{
+					callback_arg->count ++;
+				}
+				if ((rank != root) && (owner == rank))
+				{
+					callback_arg->count ++;
+				}
+			}
+		}
+	}
+
 	for(x = 0; x < count ;  x++)
 	{
 		if (data_handles[x])
@@ -64,12 +161,12 @@ int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, in
 			if ((rank == root) && (owner != root))
 			{
 				//fprintf(stderr, "[%d] Receiving data[%d] from %d\n", rank, x, owner);
-				starpu_mpi_irecv_detached(data_handles[x], owner, mpi_tag, comm, NULL, NULL);
+				starpu_mpi_irecv_detached(data_handles[x], owner, mpi_tag, comm, callback_func, callback_arg);
 			}
 			if ((rank != root) && (owner == rank))
 			{
 				//fprintf(stderr, "[%d] Sending data[%d] to %d\n", rank, x, root);
-				starpu_mpi_isend_detached(data_handles[x], root, mpi_tag, comm, NULL, NULL);
+				starpu_mpi_isend_detached(data_handles[x], root, mpi_tag, comm, callback_func, callback_arg);
 			}
 		}
 	}

+ 1 - 3
mpi/src/starpu_mpi_datatype.c

@@ -127,7 +127,7 @@ static handle_to_datatype_func handle_to_datatype_funcs[STARPU_MAX_INTERFACE_ID]
 	[STARPU_MULTIFORMAT_INTERFACE_ID] = NULL,
 };
 
-int starpu_mpi_handle_to_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype, int *count)
+int starpu_mpi_handle_to_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
 {
 	enum starpu_data_interface_id id = starpu_handle_get_interface_id(data_handle);
 
@@ -136,13 +136,11 @@ int starpu_mpi_handle_to_datatype(starpu_data_handle_t data_handle, MPI_Datatype
 		handle_to_datatype_func func = handle_to_datatype_funcs[id];
 		STARPU_ASSERT(func);
 		func(data_handle, datatype);
-		*count = 1;
 		return 0;
 	}
 	else
 	{
 		/* The datatype is not predefined by StarPU */
-		*count = starpu_handle_get_size(data_handle);
 		*datatype = MPI_BYTE;
 		return 1;
 	}

+ 1 - 1
mpi/src/starpu_mpi_datatype.h

@@ -24,7 +24,7 @@
 extern "C" {
 #endif
 
-int starpu_mpi_handle_to_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype, int *count);
+int starpu_mpi_handle_to_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype);
 
 #ifdef __cplusplus
 }

+ 5 - 4
mpi/src/starpu_mpi_helper.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -31,7 +31,7 @@ int starpu_mpi_isend_detached_unlock_tag(starpu_data_handle_t data_handle,
 {
 	starpu_tag_t *tagptr = malloc(sizeof(starpu_tag_t));
 	*tagptr = tag;
-	
+
 	return starpu_mpi_isend_detached(data_handle, dest, mpi_tag, comm,
 						starpu_mpi_unlock_tag_callback, tagptr);
 }
@@ -41,12 +41,13 @@ int starpu_mpi_irecv_detached_unlock_tag(starpu_data_handle_t data_handle, int s
 {
 	starpu_tag_t *tagptr = malloc(sizeof(starpu_tag_t));
 	*tagptr = tag;
-	
+
 	return starpu_mpi_irecv_detached(data_handle, source, mpi_tag, comm,
 						starpu_mpi_unlock_tag_callback, tagptr);
 }
 
-struct arg_array {
+struct arg_array
+{
 	int array_size;
 	starpu_tag_t tag;
 };

+ 230 - 132
mpi/src/starpu_mpi_insert_task.c

@@ -28,7 +28,6 @@
 //#define STARPU_MPI_VERBOSE 1
 #include <starpu_mpi_private.h>
 
-#ifdef STARPU_MPI_CACHE
 /* Whether we are allowed to keep copies of remote data. */
 struct _starpu_data_entry
 {
@@ -36,32 +35,41 @@ struct _starpu_data_entry
 	void *data;
 };
 
-struct _starpu_data_entry **sent_data = NULL;
-struct _starpu_data_entry **received_data = NULL;
-#endif /* STARPU_MPI_CACHE */
+static struct _starpu_data_entry **sent_data = NULL;
+static struct _starpu_data_entry **received_data = NULL;
+static int cache_enabled=1;
 
 void _starpu_mpi_tables_init(MPI_Comm comm)
 {
-#ifdef STARPU_MPI_CACHE
 	int nb_nodes;
 	int i;
 
+	cache_enabled = starpu_get_env_number("STARPU_MPI_CACHE");
+	if (cache_enabled == -1)
+	{
+		cache_enabled = 1;
+	}
+
+	if (cache_enabled == 0)
+	{
+		if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU MPI Communication cache is disabled\n");
+		return;
+	}
+
 	MPI_Comm_size(comm, &nb_nodes);
 	_STARPU_MPI_DEBUG("Initialising htable for cache\n");
 	sent_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
 	for(i=0 ; i<nb_nodes ; i++) sent_data[i] = NULL;
 	received_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
 	for(i=0 ; i<nb_nodes ; i++) received_data[i] = NULL;
-#else
-	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --disable-mpi-cache\n");
-#endif /* STARPU_MPI_CACHE */
 }
 
 void _starpu_mpi_tables_free(int world_size)
 {
-#ifdef STARPU_MPI_CACHE
 	int i;
 
+	if (cache_enabled == 0) return;
+
 	_STARPU_MPI_DEBUG("Clearing htable for cache\n");
 
 	for(i=0 ; i<world_size ; i++)
@@ -80,13 +88,13 @@ void _starpu_mpi_tables_free(int world_size)
 	}
 	free(sent_data);
 	free(received_data);
-#endif
 }
 
 static
 int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int *do_execute, int *inconsistent_execute, int *dest, size_t *size_on_nodes)
 {
-	if (data && mode & STARPU_R) {
+	if (data && mode & STARPU_R)
+	{
 		struct starpu_data_interface_ops *ops;
 		int rank = starpu_data_get_rank(data);
 
@@ -94,8 +102,10 @@ int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_access
 		size_on_nodes[rank] += ops->get_size(data);
 	}
 
-	if (mode & STARPU_W) {
-		if (!data) {
+	if (mode & STARPU_W)
+	{
+		if (!data)
+		{
 			/* We don't have anything allocated for this.
 			 * The application knows we won't do anything
 			 * about this task */
@@ -107,25 +117,32 @@ int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_access
 			return -EINVAL;
 		}
 		int mpi_rank = starpu_data_get_rank(data);
-		if (mpi_rank == me) {
-			if (*do_execute == 0) {
+		if (mpi_rank == me)
+		{
+			if (*do_execute == 0)
+			{
 				*inconsistent_execute = 1;
 			}
-			else {
+			else
+			{
 				*do_execute = 1;
 			}
 		}
-		else if (mpi_rank != -1) {
-			if (*do_execute == 1) {
+		else if (mpi_rank != -1)
+		{
+			if (*do_execute == 1)
+			{
 				*inconsistent_execute = 1;
 			}
-			else {
+			else
+			{
 				*do_execute = 0;
 				*dest = mpi_rank;
 				/* That's the rank which needs the data to be sent to */
 			}
 		}
-		else {
+		else
+		{
 			_STARPU_ERROR("rank invalid\n");
 		}
 	}
@@ -135,60 +152,64 @@ int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_access
 static
 void *_starpu_mpi_already_received(starpu_data_handle_t data, int mpi_rank)
 {
-#ifdef STARPU_MPI_CACHE
+	if (cache_enabled == 0) return NULL;
+
 	struct _starpu_data_entry *already_received;
 	HASH_FIND_PTR(received_data[mpi_rank], &data, already_received);
-	if (already_received == NULL) {
+	if (already_received == NULL)
+	{
 		struct _starpu_data_entry *entry = (struct _starpu_data_entry *)malloc(sizeof(*entry));
 		entry->data = data;
 		HASH_ADD_PTR(received_data[mpi_rank], data, entry);
 	}
-	else {
+	else
+	{
 		_STARPU_MPI_DEBUG("Do not receive data %p from node %d as it is already available\n", data, mpi_rank);
 	}
 	return already_received;
-#else
-	return NULL;
-#endif
 }
 
 static
 void *_starpu_mpi_already_sent(starpu_data_handle_t data, int dest)
 {
-#ifdef STARPU_MPI_CACHE
+	if (cache_enabled == 0) return NULL;
+
 	struct _starpu_data_entry *already_sent;
 	HASH_FIND_PTR(sent_data[dest], &data, already_sent);
-	if (already_sent == NULL) {
+	if (already_sent == NULL)
+	{
 		struct _starpu_data_entry *entry = (struct _starpu_data_entry *)malloc(sizeof(*entry));
 		entry->data = data;
 		HASH_ADD_PTR(sent_data[dest], data, entry);
 		_STARPU_MPI_DEBUG("Noting that data %p has already been sent to %d\n", data, dest);
 	}
-	else {
+	else
+	{
 		_STARPU_MPI_DEBUG("Do not send data %p to node %d as it has already been sent\n", data, dest);
 	}
 	return already_sent;
-#else
-	return NULL;
-#endif
 }
 
 static
 void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int dest, int do_execute, MPI_Comm comm)
 {
-	if (data && mode & STARPU_R) {
+	if (data && mode & STARPU_R)
+	{
 		int mpi_rank = starpu_data_get_rank(data);
 		int mpi_tag = starpu_data_get_tag(data);
-		if(mpi_rank == -1) {
+		if(mpi_rank == -1)
+		{
 			fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
 			STARPU_ABORT();
 		}
-		if(mpi_tag == -1) {
+		if(mpi_tag == -1)
+		{
 			fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
 			STARPU_ABORT();
 		}
 		/* The task needs to read this data */
-		if (do_execute && mpi_rank != me && mpi_rank != -1) {
+		if (do_execute && mpi_rank != me && mpi_rank != -1)
+		{
 			/* I will have to execute but I don't have the data, receive */
 			void *already_received = _starpu_mpi_already_received(data, mpi_rank);
 			if (already_received == NULL)
@@ -197,7 +218,8 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 				starpu_mpi_irecv_detached(data, mpi_rank, mpi_tag, comm, NULL, NULL);
 			}
 		}
-		if (!do_execute && mpi_rank == me) {
+		if (!do_execute && mpi_rank == me)
+		{
 			/* Somebody else will execute it, and I have the data, send it. */
 			void *already_sent = _starpu_mpi_already_sent(data, dest);
 			if (already_sent == NULL)
@@ -212,24 +234,30 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 static
 void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int xrank, int dest, int do_execute, MPI_Comm comm)
 {
-	if (mode & STARPU_W) {
+	if (mode & STARPU_W)
+	{
 		int mpi_rank = starpu_data_get_rank(data);
 		int mpi_tag = starpu_data_get_tag(data);
-		if(mpi_rank == -1) {
+		if(mpi_rank == -1)
+		{
 			fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
 			STARPU_ABORT();
 		}
-		if(mpi_tag == -1) {
+		if(mpi_tag == -1)
+		{
 			fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
 			STARPU_ABORT();
 		}
-		if (mpi_rank == me) {
-			if (xrank != -1 && me != xrank) {
+		if (mpi_rank == me)
+		{
+			if (xrank != -1 && me != xrank)
+			{
 				_STARPU_MPI_DEBUG("Receive data %p back from the task %d which executed the codelet ...\n", data, dest);
 				starpu_mpi_irecv_detached(data, dest, mpi_tag, comm, NULL, NULL);
 			}
 		}
-		else if (do_execute) {
+		else if (do_execute)
+		{
 			_STARPU_MPI_DEBUG("Send data %p back to its owner %d...\n", data, mpi_rank);
 			starpu_mpi_isend_detached(data, mpi_rank, mpi_tag, comm, NULL, NULL);
 		}
@@ -238,43 +266,54 @@ void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum s
 
 void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int do_execute, MPI_Comm comm)
 {
-#ifdef STARPU_MPI_CACHE
-	if (mode & STARPU_W) {
-		if (do_execute) {
-			/* Note that all copies I've sent to neighbours are now invalid */
-			int n, size;
-			MPI_Comm_size(comm, &size);
-			for(n=0 ; n<size ; n++) {
-				struct _starpu_data_entry *already_sent;
-				HASH_FIND_PTR(sent_data[n], &data, already_sent);
-				if (already_sent) {
-					_STARPU_MPI_DEBUG("Clearing send cache for data %p\n", data);
-					HASH_DEL(sent_data[n], already_sent);
+	if (cache_enabled)
+	{
+		if (mode & STARPU_W)
+		{
+			if (do_execute)
+			{
+				/* Note that all copies I've sent to neighbours are now invalid */
+				int n, size;
+				MPI_Comm_size(comm, &size);
+				for(n=0 ; n<size ; n++)
+				{
+					struct _starpu_data_entry *already_sent;
+					HASH_FIND_PTR(sent_data[n], &data, already_sent);
+					if (already_sent)
+					{
+						_STARPU_MPI_DEBUG("Clearing send cache for data %p\n", data);
+						HASH_DEL(sent_data[n], already_sent);
+					}
+				}
+			}
+			else
+			{
+				int mpi_rank = starpu_data_get_rank(data);
+				struct _starpu_data_entry *already_received;
+				HASH_FIND_PTR(received_data[mpi_rank], &data, already_received);
+				if (already_received)
+				{
+					/* Somebody else will write to the data, so discard our cached copy if any */
+					/* TODO: starpu_mpi could just remember itself. */
+					_STARPU_MPI_DEBUG("Clearing receive cache for data %p\n", data);
+					HASH_DEL(received_data[mpi_rank], already_received);
+					starpu_data_invalidate_submit(data);
 				}
 			}
 		}
-		else {
+	}
+	else
+	{
+		/* We allocated a temporary buffer for the received data, now drop it */
+		if ((mode & STARPU_R) && do_execute)
+		{
 			int mpi_rank = starpu_data_get_rank(data);
-			struct _starpu_data_entry *already_received;
-			HASH_FIND_PTR(received_data[mpi_rank], &data, already_received);
-			if (already_received) {
-				/* Somebody else will write to the data, so discard our cached copy if any */
-				/* TODO: starpu_mpi could just remember itself. */
-				_STARPU_MPI_DEBUG("Clearing receive cache for data %p\n", data);
-				HASH_DEL(received_data[mpi_rank], already_received);
+			if (mpi_rank != me && mpi_rank != -1)
+			{
 				starpu_data_invalidate_submit(data);
 			}
 		}
 	}
-#else
-	/* We allocated a temporary buffer for the received data, now drop it */
-	if ((mode & STARPU_R) && do_execute) {
-		int mpi_rank = starpu_data_get_rank(data);
-		if (mpi_rank != me && mpi_rank != -1) {
-			starpu_data_invalidate_submit(data);
-		}
-	}
-#endif
 }
 
 int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
@@ -307,20 +346,24 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 	do_execute = -1;
 	xrank = -1;
 	va_start(varg_list, codelet);
-	while ((arg_type = va_arg(varg_list, int)) != 0) {
-		if (arg_type==STARPU_EXECUTE_ON_NODE) {
+	while ((arg_type = va_arg(varg_list, int)) != 0)
+	{
+		if (arg_type==STARPU_EXECUTE_ON_NODE)
+		{
 			xrank = va_arg(varg_list, int);
 			_STARPU_MPI_DEBUG("Executing on node %d\n", xrank);
 			do_execute = 1;
 		}
-		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
+		else if (arg_type==STARPU_EXECUTE_ON_DATA)
+		{
 			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
 			xrank = starpu_data_get_rank(data);
 			_STARPU_MPI_DEBUG("Executing on data node %d\n", xrank);
 			STARPU_ASSERT(xrank <= nb_nodes);
 			do_execute = 1;
 		}
-		else if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type==STARPU_SCRATCH || arg_type==STARPU_REDUX) {
+		else if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type==STARPU_SCRATCH || arg_type==STARPU_REDUX)
+		{
 			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
 			enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
 			int ret = _starpu_mpi_find_executee_node(data, mode, me, &do_execute, &inconsistent_execute, &dest, size_on_nodes);
@@ -348,36 +391,45 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 				current_data ++;
 			}
 		}
-		else if (arg_type==STARPU_VALUE) {
+		else if (arg_type==STARPU_VALUE)
+		{
 			va_arg(varg_list, void *);
 			va_arg(varg_list, size_t);
 		}
-		else if (arg_type==STARPU_CALLBACK) {
+		else if (arg_type==STARPU_CALLBACK)
+		{
 			va_arg(varg_list, void (*)(void *));
 		}
-		else if (arg_type==STARPU_CALLBACK_WITH_ARG) {
+		else if (arg_type==STARPU_CALLBACK_WITH_ARG)
+		{
 			va_arg(varg_list, void (*)(void *));
 			va_arg(varg_list, void *);
 		}
-		else if (arg_type==STARPU_CALLBACK_ARG) {
+		else if (arg_type==STARPU_CALLBACK_ARG)
+		{
 			va_arg(varg_list, void *);
 		}
-		else if (arg_type==STARPU_PRIORITY) {
+		else if (arg_type==STARPU_PRIORITY)
+		{
 			va_arg(varg_list, int);
 		}
-		else if (arg_type==STARPU_EXECUTE_ON_NODE) {
+		else if (arg_type==STARPU_EXECUTE_ON_NODE)
+		{
 			va_arg(varg_list, int);
 		}
-		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
+		else if (arg_type==STARPU_EXECUTE_ON_DATA)
+		{
 			va_arg(varg_list, int);
 		}
 	}
 	va_end(varg_list);
 
-	if (do_execute == -1) {
+	if (do_execute == -1)
+	{
 		int i;
 		size_t max_size = 0;
-		for(i=0 ; i<nb_nodes ; i++) {
+		for(i=0 ; i<nb_nodes ; i++)
+		{
 			if (size_on_nodes[i] > max_size)
 			{
 				max_size = size_on_nodes[i];
@@ -385,7 +437,8 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 			}
 		}
 		free(size_on_nodes);
-		if (xrank != -1) {
+		if (xrank != -1)
+		{
 			_STARPU_MPI_DEBUG("Node %d is having the most R data\n", xrank);
 			do_execute = 1;
 		}
@@ -393,18 +446,22 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 
 	STARPU_ASSERT_MSG(do_execute != -1, "StarPU needs to see a W or a REDUX data which will tell it where to execute the task");
 
-	if (inconsistent_execute == 1) {
-		if (xrank == -1) {
+	if (inconsistent_execute == 1)
+	{
+		if (xrank == -1)
+		{
 			_STARPU_MPI_DEBUG("Different tasks are owning W data. Needs to specify which one is to execute the codelet, using STARPU_EXECUTE_ON_NODE or STARPU_EXECUTE_ON_DATA\n");
 			free(size_on_nodes);
 			return -EINVAL;
 		}
-		else {
+		else
+		{
 			do_execute = (me == xrank);
 			dest = xrank;
 		}
 	}
-	else if (xrank != -1) {
+	else if (xrank != -1)
+	{
 		do_execute = (me == xrank);
 		dest = xrank;
 	}
@@ -412,8 +469,10 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 	/* Send and receive data as requested */
 	va_start(varg_list, codelet);
 	current_data = 0;
-	while ((arg_type = va_arg(varg_list, int)) != 0) {
-		if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type==STARPU_SCRATCH || arg_type==STARPU_REDUX) {
+	while ((arg_type = va_arg(varg_list, int)) != 0)
+	{
+		if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type==STARPU_SCRATCH || arg_type==STARPU_REDUX)
+		{
 			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
 			enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
 
@@ -433,33 +492,41 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 				current_data++;
 			}
 		}
-		else if (arg_type==STARPU_VALUE) {
+		else if (arg_type==STARPU_VALUE)
+		{
 			va_arg(varg_list, void *);
 			va_arg(varg_list, size_t);
 		}
-		else if (arg_type==STARPU_CALLBACK) {
+		else if (arg_type==STARPU_CALLBACK)
+		{
 			va_arg(varg_list, void (*)(void *));
 		}
-		else if (arg_type==STARPU_CALLBACK_WITH_ARG) {
+		else if (arg_type==STARPU_CALLBACK_WITH_ARG)
+		{
 			va_arg(varg_list, void (*)(void *));
 			va_arg(varg_list, void *);
 		}
-		else if (arg_type==STARPU_CALLBACK_ARG) {
+		else if (arg_type==STARPU_CALLBACK_ARG)
+		{
 			va_arg(varg_list, void *);
 		}
-		else if (arg_type==STARPU_PRIORITY) {
+		else if (arg_type==STARPU_PRIORITY)
+		{
 			va_arg(varg_list, int);
 		}
-		else if (arg_type==STARPU_EXECUTE_ON_NODE) {
+		else if (arg_type==STARPU_EXECUTE_ON_NODE)
+		{
 			va_arg(varg_list, int);
 		}
-		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
+		else if (arg_type==STARPU_EXECUTE_ON_DATA)
+		{
 			va_arg(varg_list, starpu_data_handle_t);
 		}
 	}
 	va_end(varg_list);
 
-	if (do_execute) {
+	if (do_execute)
+	{
 		_STARPU_MPI_DEBUG("Execution of the codelet %p (%s)\n", codelet, codelet->name);
 		va_start(varg_list, codelet);
 		struct starpu_task *task = starpu_task_create();
@@ -468,11 +535,14 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 		STARPU_ASSERT(ret==0);
 	}
 
-	if (inconsistent_execute) {
+	if (inconsistent_execute)
+	{
 		va_start(varg_list, codelet);
 		current_data = 0;
-		while ((arg_type = va_arg(varg_list, int)) != 0) {
-			if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type==STARPU_SCRATCH || arg_type==STARPU_REDUX) {
+		while ((arg_type = va_arg(varg_list, int)) != 0)
+		{
+			if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type==STARPU_SCRATCH || arg_type==STARPU_REDUX)
+			{
 				starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
 				enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
 
@@ -491,27 +561,34 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 					current_data++;
 				}
 			}
-			else if (arg_type==STARPU_VALUE) {
+			else if (arg_type==STARPU_VALUE)
+			{
 				va_arg(varg_list, void *);
 				va_arg(varg_list, size_t);
 			}
-			else if (arg_type==STARPU_CALLBACK) {
+			else if (arg_type==STARPU_CALLBACK)
+			{
 				va_arg(varg_list, void (*)(void *));
 			}
-			else if (arg_type==STARPU_CALLBACK_WITH_ARG) {
+			else if (arg_type==STARPU_CALLBACK_WITH_ARG)
+			{
 				va_arg(varg_list, void (*)(void *));
 				va_arg(varg_list, void *);
 			}
-			else if (arg_type==STARPU_CALLBACK_ARG) {
+			else if (arg_type==STARPU_CALLBACK_ARG)
+			{
 				va_arg(varg_list, void *);
 			}
-			else if (arg_type==STARPU_PRIORITY) {
+			else if (arg_type==STARPU_PRIORITY)
+			{
 				va_arg(varg_list, int);
 			}
-			else if (arg_type==STARPU_EXECUTE_ON_NODE) {
+			else if (arg_type==STARPU_EXECUTE_ON_NODE)
+			{
 				va_arg(varg_list, int);
 			}
-			else if (arg_type==STARPU_EXECUTE_ON_DATA) {
+			else if (arg_type==STARPU_EXECUTE_ON_DATA)
+			{
 				va_arg(varg_list, starpu_data_handle_t);
 			}
 		}
@@ -520,8 +597,10 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 
 	va_start(varg_list, codelet);
 	current_data = 0;
-	while ((arg_type = va_arg(varg_list, int)) != 0) {
-		if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type == STARPU_SCRATCH || arg_type == STARPU_REDUX) {
+	while ((arg_type = va_arg(varg_list, int)) != 0)
+	{
+		if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type == STARPU_SCRATCH || arg_type == STARPU_REDUX)
+		{
 			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
 			enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
 
@@ -540,27 +619,34 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 				current_data++;
 			}
 		}
-		else if (arg_type==STARPU_VALUE) {
+		else if (arg_type==STARPU_VALUE)
+		{
 			va_arg(varg_list, void *);
 			va_arg(varg_list, size_t);
 		}
-		else if (arg_type==STARPU_CALLBACK) {
+		else if (arg_type==STARPU_CALLBACK)
+		{
 			va_arg(varg_list, void (*)(void *));
 		}
-		else if (arg_type==STARPU_CALLBACK_WITH_ARG) {
+		else if (arg_type==STARPU_CALLBACK_WITH_ARG)
+		{
 			va_arg(varg_list, void (*)(void *));
 			va_arg(varg_list, void *);
 		}
-		else if (arg_type==STARPU_CALLBACK_ARG) {
+		else if (arg_type==STARPU_CALLBACK_ARG)
+		{
 			va_arg(varg_list, void *);
 		}
-		else if (arg_type==STARPU_PRIORITY) {
+		else if (arg_type==STARPU_PRIORITY)
+		{
 			va_arg(varg_list, int);
 		}
-		else if (arg_type==STARPU_EXECUTE_ON_NODE) {
+		else if (arg_type==STARPU_EXECUTE_ON_NODE)
+		{
 			va_arg(varg_list, int);
 		}
-		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
+		else if (arg_type==STARPU_EXECUTE_ON_DATA)
+		{
 			va_arg(varg_list, starpu_data_handle_t);
 		}
 	}
@@ -576,11 +662,13 @@ void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t da
 
 	rank = starpu_data_get_rank(data_handle);
 	tag = starpu_data_get_tag(data_handle);
-	if(rank == -1) {
+	if (rank == -1)
+	{
 		fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
 		STARPU_ABORT();
 	}
-	if(tag == -1) {
+	if (tag == -1)
+	{
 		fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
 		STARPU_ABORT();
 	}
@@ -604,11 +692,13 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 
 	rank = starpu_data_get_rank(data_handle);
 	tag = starpu_data_get_tag(data_handle);
-	if(rank == -1) {
+	if (rank == -1)
+	{
 		fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
 		STARPU_ABORT();
 	}
-	if(tag == -1) {
+	if (tag == -1)
+	{
 		fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
 		STARPU_ABORT();
 	}
@@ -633,11 +723,13 @@ void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
 
 	rank = starpu_data_get_rank(data_handle);
 	tag = starpu_data_get_tag(data_handle);
-	if(rank == -1) {
+	if (rank == -1)
+	{
 		fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
 		STARPU_ABORT();
 	}
-	if(tag == -1) {
+	if (tag == -1)
+	{
 		fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
 		STARPU_ABORT();
 	}
@@ -648,11 +740,14 @@ void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
 	_STARPU_MPI_DEBUG("Doing reduction for data %p on node %d with %d nodes ...\n", data_handle, rank, nb_nodes);
 
 	// need to count how many nodes have the data in redux mode
-	if (me == rank) {
+	if (me == rank)
+	{
 		int i;
 
-		for(i=0 ; i<nb_nodes ; i++) {
-			if (i != rank) {
+		for(i=0 ; i<nb_nodes ; i++)
+		{
+			if (i != rank)
+			{
 				starpu_data_handle_t new_handle;
 
 				starpu_data_register_same(&new_handle, data_handle);
@@ -661,14 +756,17 @@ void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
 
 				starpu_mpi_irecv_detached(new_handle, i, tag, comm, NULL, NULL);
 				starpu_insert_task(data_handle->redux_cl,
-						STARPU_RW, data_handle,
-						STARPU_R, new_handle,
-						0);
+						   STARPU_RW, data_handle,
+						   STARPU_R, new_handle,
+						   0);
+				starpu_data_unregister_submit(new_handle);
 			}
 		}
 	}
-	else {
+	else
+	{
 		_STARPU_MPI_DEBUG("Sending redux handle to %d ...\n", rank);
 		starpu_mpi_isend_detached(data_handle, rank, tag, comm, NULL, NULL);
+		starpu_insert_task(data_handle->init_cl, STARPU_W, data_handle, 0);
 	}
 }

+ 15 - 6
mpi/src/starpu_mpi_private.h

@@ -41,6 +41,11 @@ extern "C" {
 #  define _STARPU_MPI_DEBUG(fmt, args ...)
 #endif
 
+#define _STARPU_MPI_DISP(fmt, args ...) do { if (!getenv("STARPU_SILENT")) { \
+    						int _debug_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_debug_rank);       \
+                                                fprintf(stderr, "%*s[%d][starpu_mpi][%s] " fmt , (_debug_rank+1)*4, "", _debug_rank, __func__ ,##args); \
+                                                fflush(stderr); }} while(0);
+
 #ifdef STARPU_MPI_VERBOSE0
 #  define _STARPU_MPI_LOG_IN()             do { if (!getenv("STARPU_SILENT")) { \
                                                int _debug_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_debug_rank);                        \
@@ -55,11 +60,14 @@ extern "C" {
 #  define _STARPU_MPI_LOG_OUT()
 #endif
 
-#define SEND_REQ	0
-#define RECV_REQ	1
-#define WAIT_REQ        2
-#define TEST_REQ        3
-#define BARRIER_REQ     4
+enum _starpu_mpi_request_type
+{
+	SEND_REQ=0,
+	RECV_REQ=1,
+	WAIT_REQ=2,
+	TEST_REQ=3,
+	BARRIER_REQ=4
+};
 
 LIST_TYPE(_starpu_mpi_req,
 	/* description of the data at StarPU level */
@@ -68,6 +76,7 @@ LIST_TYPE(_starpu_mpi_req,
 	/* description of the data to be sent/received */
 	MPI_Datatype datatype;
 	void *ptr;
+	size_t count;
 	int needs_unpacking;
 
 	/* who are we talking to ? */
@@ -85,7 +94,7 @@ LIST_TYPE(_starpu_mpi_req,
 	pthread_mutex_t req_mutex;
 	pthread_cond_t req_cond;
 
-	unsigned request_type; /* 0 send, 1 recv */
+	enum _starpu_mpi_request_type request_type; /* 0 send, 1 recv */
 
 	unsigned submitted;
 	unsigned completed;

+ 7 - 5
mpi/src/starpu_mpi_stats.c

@@ -27,19 +27,15 @@ static int stats_enabled=0;
 
 void _starpu_mpi_comm_amounts_init(MPI_Comm comm)
 {
-#ifdef STARPU_COMM_STATS
-	stats_enabled = 1;
-#else
 	stats_enabled = starpu_get_env_number("STARPU_COMM_STATS");
 	if (stats_enabled == -1)
 	{
 		stats_enabled = 0;
 	}
-#endif /* STARPU_COMM_STATS */
 
 	if (stats_enabled == 0) return;
 
-	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --enable-comm-stats or is executed with STARPU_COMM_STATS=1, which slows down a bit\n");
+	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU is executed with STARPU_COMM_STATS=1, which slows down a bit\n");
 
 	MPI_Comm_size(comm, &world_size);
 	_STARPU_MPI_DEBUG("allocating for %d nodes\n", world_size);
@@ -67,12 +63,18 @@ void _starpu_mpi_comm_amounts_inc(MPI_Comm comm, unsigned dst, MPI_Datatype data
 	comm_amount[dst] += count*size;
 }
 
+void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts)
+{
+	memcpy(comm_amounts, comm_amount, world_size * sizeof(size_t));
+}
+
 void _starpu_mpi_comm_amounts_display(int node)
 {
 	unsigned dst;
 	size_t sum = 0;
 
 	if (stats_enabled == 0) return;
+	if (getenv("STARPU_SILENT")) return;
 
 	for (dst = 0; dst < world_size; dst++)
 	{

+ 14 - 4
mpi/tests/Makefile.am

@@ -19,9 +19,9 @@ CCLD=$(MPICC)
 
 if STARPU_MPI_CHECK
 if STARPU_HAVE_AM111
-LOG_COMPILER	 	=	$(MPIEXEC) -np 2
+LOG_COMPILER	 	=	$(MPIEXEC) -np 4
 else
-TESTS_ENVIRONMENT 	=	$(MPIEXEC) -np 2
+TESTS_ENVIRONMENT 	=	$(MPIEXEC) -np 4
 endif
 TESTS			=	$(check_PROGRAMS)
 endif
@@ -78,7 +78,9 @@ check_PROGRAMS +=				\
 	insert_task_owner			\
 	insert_task_owner2			\
 	insert_task_owner_data			\
-	multiple_send
+	multiple_send				\
+	mpi_scatter_gather			\
+	mpi_reduction
 
 noinst_PROGRAMS =				\
 	pingpong				\
@@ -99,7 +101,9 @@ noinst_PROGRAMS =				\
 	insert_task_owner			\
 	insert_task_owner2			\
 	insert_task_owner_data			\
-	multiple_send
+	multiple_send				\
+	mpi_scatter_gather			\
+	mpi_reduction
 
 mpi_isend_LDADD =					\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
@@ -139,6 +143,10 @@ insert_task_owner_data_LDADD =			\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 multiple_send_LDADD =				\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_scatter_gather_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_reduction_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 
 ring_SOURCES = ring.c
 ring_async_SOURCES = ring_async.c
@@ -148,6 +156,8 @@ ring_SOURCES += ring_kernel.cu
 ring_async_SOURCES += ring_kernel.cu
 ring_async_implicit_SOURCES += ring_kernel.cu
 endif
+mpi_reduction_SOURCES = mpi_reduction.c
+mpi_reduction_SOURCES += mpi_reduction_kernels.c
 
 showcheck:
 	-cat $(TEST_LOGS) /dev/null

+ 6 - 13
mpi/tests/block_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -35,23 +35,16 @@ int main(int argc, char **argv)
 	if (size < 2)
 	{
 		if (rank == 0)
-			FPRINTF(stderr, "We need at least processes.\n");
+			FPRINTF(stderr, "We need at least 2 processes.\n");
 
 		MPI_Finalize();
 		return STARPU_TEST_SKIPPED;
 	}
 
-	/* We only use 2 nodes for that test */
-	if (rank >= 2)
-	{
-		MPI_Finalize();
-		return STARPU_TEST_SKIPPED;
-	}
-
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize();
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+	ret = starpu_mpi_init(&argc, &argv);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 	/* Node 0 will allocate a big block and only register an inner part of
 	 * it as the block data, Node 1 will allocate a block of small size and
@@ -79,7 +72,7 @@ int main(int argc, char **argv)
 			(uintptr_t)block, BIGSIZE, BIGSIZE*BIGSIZE,
 			SIZE, SIZE, SIZE, sizeof(float));
 	}
-	else /* rank == 1 */
+	else if (rank == 1)
 	{
 		block = calloc(SIZE*SIZE*SIZE, sizeof(float));
 		assert(block);
@@ -112,7 +105,7 @@ int main(int argc, char **argv)
 		starpu_data_release(block_handle);
 
 	}
-	else /* rank == 1 */
+	else if (rank == 1)
 	{
 		MPI_Status status;
 		ret = starpu_mpi_recv(block_handle, 0, 0x42, MPI_COMM_WORLD, &status);

+ 6 - 13
mpi/tests/block_interface_pinned.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -35,23 +35,16 @@ int main(int argc, char **argv)
 	if (size < 2)
 	{
 		if (rank == 0)
-			FPRINTF(stderr, "We need at least processes.\n");
+			FPRINTF(stderr, "We need at least 2 processes.\n");
 
 		MPI_Finalize();
 		return STARPU_TEST_SKIPPED;
 	}
 
-	/* We only use 2 nodes for that test */
-	if (rank >= 2)
-	{
-		MPI_Finalize();
-		return STARPU_TEST_SKIPPED;
-	}
-
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize();
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+	ret = starpu_mpi_init(&argc, &argv);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 	/* Node 0 will allocate a big block and only register an inner part of
 	 * it as the block data, Node 1 will allocate a block of small size and
@@ -80,7 +73,7 @@ int main(int argc, char **argv)
 			(uintptr_t)block, BIGSIZE, BIGSIZE*BIGSIZE,
 			SIZE, SIZE, SIZE, sizeof(float));
 	}
-	else /* rank == 1 */
+	else if (rank == 1)
 	{
 		starpu_malloc((void **)&block,
 			SIZE*SIZE*SIZE*sizeof(float));
@@ -113,7 +106,7 @@ int main(int argc, char **argv)
 		starpu_data_release(block_handle);
 
 	}
-	else /* rank == 1 */
+	else if (rank == 1)
 	{
 		MPI_Status status;
 

+ 4 - 0
mpi/tests/helper.h

@@ -19,4 +19,8 @@
 #define STARPU_TEST_SKIPPED 77
 
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+#define FPRINTF_MPI(fmt, args ...) do { if (!getenv("STARPU_SILENT")) { \
+    						int _disp_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_disp_rank);       \
+                                                fprintf(stderr, "[%d][starpu_mpi][%s] " fmt , _disp_rank, __func__ ,##args); \
+                                                fflush(stderr); }} while(0);
 

+ 5 - 8
mpi/tests/insert_task.c

@@ -54,8 +54,10 @@ int main(int argc, char **argv)
 
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize_extended(&rank, &size);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+	ret = starpu_mpi_init(&argc, &argv);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
         for(x = 0; x < X; x++)
 	{
@@ -85,17 +87,12 @@ int main(int argc, char **argv)
                                 //FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
                                 starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
                         }
-                        else if (rank == mpi_rank+1 || rank == mpi_rank-1)
+                        else
 			{
                                 /* I don't own that index, but will need it for my computations */
                                 //FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
                                 starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
                         }
-                        else
-			{
-                                /* I know it's useless to allocate anything for this */
-                                data_handles[x][y] = NULL;
-                        }
                         if (data_handles[x][y])
 			{
                                 starpu_data_set_rank(data_handles[x][y], mpi_rank);

+ 4 - 7
mpi/tests/insert_task_block.c

@@ -71,8 +71,10 @@ int main(int argc, char **argv)
 
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize_extended(&rank, &size);
+	ret = starpu_mpi_init(&argc, &argv);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
         for(x = 0; x < SIZE; x++)
 	{
@@ -103,18 +105,13 @@ int main(int argc, char **argv)
                                 starpu_matrix_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[((SIZE/BLOCKS)*x) + ((SIZE/BLOCKS)*y) * SIZE]),
                                                             SIZE, SIZE/BLOCKS, SIZE/BLOCKS, sizeof(unsigned));
                         }
-                        else if (rank == mpi_rank+1 || rank == mpi_rank-1)
+                        else
 			{
                                 /* I don't own that index, but will need it for my computations */
                                 //FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
                                 starpu_matrix_data_register(&data_handles[x][y], -1, (uintptr_t)&(matrix[((SIZE/BLOCKS)*x) + ((SIZE/BLOCKS)*y) * SIZE]),
                                                             SIZE, SIZE/BLOCKS, SIZE/BLOCKS, sizeof(unsigned));
                         }
-                        else
-			{
-                                /* I know it's useless to allocate anything for this */
-                                data_handles[x][y] = NULL;
-                        }
                         if (data_handles[x][y])
 			{
                                 starpu_data_set_rank(data_handles[x][y], mpi_rank);

+ 84 - 93
mpi/tests/insert_task_cache.c

@@ -14,17 +14,21 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <starpu.h>
 #include <starpu_mpi.h>
 #include <math.h>
 #include "helper.h"
 
-void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
+#if !defined(STARPU_HAVE_UNSETENV)
+#warning unsetenv is not defined. Skipping test
+int main(int argc, char **argv)
 {
-	unsigned *x = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
-	unsigned *y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
+	return STARPU_TEST_SKIPPED;
+}
+#else
 
-        FPRINTF(stdout, "VALUES: %u %u\n", *x, *y);
-        *x = (*x + *y) / 2;
+void func_cpu(__attribute__ ((unused)) void *descr[], __attribute__ ((unused)) void *_args)
+{
 }
 
 struct starpu_codelet mycodelet =
@@ -35,118 +39,105 @@ struct starpu_codelet mycodelet =
 	.modes = {STARPU_RW, STARPU_R}
 };
 
-#define X     4
-#define Y     5
+#define N     1000
 
 /* Returns the MPI node number where data indexes index is */
-int my_distrib(int x, int y, int nb_nodes)
+int my_distrib(int x)
 {
-        return x % nb_nodes;
+        return x;
 }
 
-
-int main(int argc, char **argv)
+void test_cache(int rank, int size, int enabled, size_t *comm_amount)
 {
-        int rank, size, x, y;
-        int ret,value=0;
-        unsigned matrix[X][Y];
-        starpu_data_handle_t data_handles[X][Y];
+        int i;
+        int ret;
+	unsigned v[2][N];
+        starpu_data_handle_t data_handles[2];
+	char *string;
+
+	string = malloc(50);
+	sprintf(string, "STARPU_MPI_CACHE=%d", enabled);
+	putenv(string);
 
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize_extended(&rank, &size);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+	ret = starpu_mpi_init(NULL, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
-        for(x = 0; x < X; x++)
+        for(i = 0; i < 2; i++)
 	{
-                for (y = 0; y < Y; y++)
+		int mpi_rank = my_distrib(i);
+		if (mpi_rank == rank)
 		{
-                        matrix[x][y] = (rank+1)*10 + value;
-                        value++;
-                }
-        }
-#if 0
-        for(x = 0; x < X; x++)
-	{
-                FPRINTF(stdout, "[%d] ", rank);
-                for (y = 0; y < Y; y++)
+			//FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+			starpu_vector_data_register(&data_handles[i], 0, (uintptr_t)&(v[i]), N, sizeof(unsigned));
+		}
+		else
 		{
-                        FPRINTF(stdout, "%3u ", matrix[x][y]);
-                }
-                FPRINTF(stdout, "\n");
+			/* I don't own that index, but will need it for my computations */
+			//FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+			starpu_vector_data_register(&data_handles[i], -1, (uintptr_t)NULL, N, sizeof(unsigned));
+		}
+		starpu_data_set_rank(data_handles[i], mpi_rank);
+		starpu_data_set_tag(data_handles[i], i);
         }
-#endif
 
-        for(x = 0; x < X; x++)
+        for(i = 0; i < 5; i++)
 	{
-                for (y = 0; y < Y; y++)
-		{
-                        int mpi_rank = my_distrib(x, y, size);
-                        if (mpi_rank == rank)
-			{
-                                //FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
-                                starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
-                        }
-                        else if (rank == mpi_rank+1 || rank == mpi_rank-1)
-			{
-                                /* I don't own that index, but will need it for my computations */
-                                //FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
-                                starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
-                        }
-                        else
-			{
-                                /* I know it's useless to allocate anything for this */
-                                data_handles[x][y] = NULL;
-                        }
-                        if (data_handles[x][y])
-			{
-                                starpu_data_set_rank(data_handles[x][y], mpi_rank);
-                                starpu_data_set_tag(data_handles[x][y], (y*X)+x);
-			}
-                }
-        }
-
-	mycodelet.name = "codelet1";
-        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1][1], STARPU_R, data_handles[0][1], 0);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0], STARPU_R, data_handles[1], 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	}
 
-	mycodelet.name = "codelet2";
-        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
-
-	mycodelet.name = "codelet3";
-        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0][1], STARPU_R, data_handles[0][0], 0);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
-
-	mycodelet.name = "codelet4";
-        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+        for(i = 0; i < 5; i++)
+	{
+		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1], STARPU_R, data_handles[0], 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	}
 
-        FPRINTF(stderr, "Waiting ...\n");
         starpu_task_wait_for_all();
 
-        for(x = 0; x < X; x++)
+        for(i = 0; i < 2; i++)
 	{
-                for (y = 0; y < Y; y++)
-		{
-                        if (data_handles[x][y])
-                                starpu_data_unregister(data_handles[x][y]);
-                }
+		starpu_data_unregister(data_handles[i]);
         }
+
+	starpu_mpi_comm_amounts_retrieve(comm_amount);
 	starpu_mpi_shutdown();
 	starpu_shutdown();
+}
 
-#if 0
-        for(x = 0; x < X; x++)
-	{
-                FPRINTF(stdout, "[%d] ", rank);
-                for (y = 0; y < Y; y++)
-		{
-                        FPRINTF(stdout, "%3u ", matrix[x][y]);
-                }
-                FPRINTF(stdout, "\n");
-        }
-#endif
+int main(int argc, char **argv)
+{
+	int dst, rank, size;
+	int result=0;
+	size_t *comm_amount_with_cache;
+	size_t *comm_amount_without_cache;
+	char *string;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	string = malloc(50);
+	sprintf(string, "STARPU_COMM_STATS=1");
+	putenv(string);
+
+	comm_amount_with_cache = malloc(size * sizeof(size_t));
+	comm_amount_without_cache = malloc(size * sizeof(size_t));
 
-	return 0;
+	test_cache(rank, size, 0, comm_amount_with_cache);
+	test_cache(rank, size, 1, comm_amount_without_cache);
+
+	if (rank == 0 || rank == 1)
+	{
+		dst = (rank == 0) ? 1 : 0;
+		result = (comm_amount_with_cache[dst] == comm_amount_without_cache[dst] * 5);
+		fprintf(stderr, "Communication cache mechanism is %sworking\n", result?"":"NOT ");
+	}
+	else
+		result = 1;
+
+	MPI_Finalize();
+	return !result;
 }
+#endif

+ 7 - 10
mpi/tests/insert_task_owner.c

@@ -79,16 +79,10 @@ int main(int argc, char **argv)
 
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize_extended(&rank, &size);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
-
-        if (size != 2)
-	{
-		if (rank == 0) FPRINTF(stderr, "We need exactly 2 processes.\n");
-                starpu_mpi_shutdown();
-                starpu_shutdown();
-                return STARPU_TEST_SKIPPED;
-        }
+	ret = starpu_mpi_init(&argc, &argv);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
         if (rank == 0)
 	{
@@ -109,6 +103,8 @@ int main(int argc, char **argv)
 		starpu_data_set_tag(data_handlesx0, 0);
         }
 
+	if (rank != 0 && rank != 1) goto end;
+
 	node = starpu_data_get_rank(data_handlesx1);
         err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_w,
 				     STARPU_VALUE, &node, sizeof(node),
@@ -170,6 +166,7 @@ int main(int argc, char **argv)
 				     0);
         assert(err == 0);
 
+end:
 	fprintf(stderr, "Waiting ...\n");
         starpu_task_wait_for_all();
 	starpu_mpi_shutdown();

+ 4 - 2
mpi/tests/insert_task_owner2.c

@@ -55,8 +55,10 @@ int main(int argc, char **argv)
 
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize_extended(&rank, &size);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+	ret = starpu_mpi_init(&argc, &argv);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
         if (rank == 0)
 	{

+ 4 - 2
mpi/tests/insert_task_owner_data.c

@@ -45,8 +45,10 @@ int main(int argc, char **argv)
 
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize_extended(&rank, &size);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+	ret = starpu_mpi_init(&argc, &argv);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
         if (rank == 0)
 	{

+ 7 - 8
mpi/tests/mpi_detached_tag.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -32,10 +32,10 @@ int main(int argc, char **argv)
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
-	if (size != 2)
+	if (size%2 != 0)
 	{
 		if (rank == 0)
-			FPRINTF(stderr, "We need exactly 2 processes.\n");
+			FPRINTF(stderr, "We need a even number of processes.\n");
 
 		MPI_Finalize();
 		return STARPU_TEST_SKIPPED;
@@ -43,8 +43,8 @@ int main(int argc, char **argv)
 
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize();
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+	ret = starpu_mpi_init(NULL, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 	tab = malloc(SIZE*sizeof(float));
 
@@ -52,14 +52,13 @@ int main(int argc, char **argv)
 
 	unsigned nloops = NITER;
 	unsigned loop;
-
-	int other_rank = (rank + 1)%2;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 	for (loop = 0; loop < nloops; loop++)
 	{
 		starpu_tag_t tag = (starpu_tag_t)loop;
 
-		if ((loop % 2) == rank)
+		if ((loop % 2) == (rank%2))
 		{
 			starpu_mpi_isend_detached_unlock_tag(tab_handle, other_rank, loop, MPI_COMM_WORLD, tag);
 		}

+ 7 - 8
mpi/tests/mpi_irecv.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -32,10 +32,10 @@ int main(int argc, char **argv)
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
-	if (size != 2)
+	if (size%2 != 0)
 	{
 		if (rank == 0)
-			FPRINTF(stderr, "We need exactly 2 processes.\n");
+			FPRINTF(stderr, "We need a even number of processes.\n");
 
 		MPI_Finalize();
 		return STARPU_TEST_SKIPPED;
@@ -43,8 +43,8 @@ int main(int argc, char **argv)
 
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize();
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+	ret = starpu_mpi_init(NULL, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 	tab = malloc(SIZE*sizeof(float));
 
@@ -52,12 +52,11 @@ int main(int argc, char **argv)
 
 	unsigned nloops = NITER;
 	unsigned loop;
-
-	int other_rank = (rank + 1)%2;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 	for (loop = 0; loop < nloops; loop++)
 	{
-		if ((loop % 2) == rank)
+		if ((loop % 2) == (rank%2))
 		{
 			starpu_mpi_send(tab_handle, other_rank, loop, MPI_COMM_WORLD);
 		}

+ 7 - 8
mpi/tests/mpi_irecv_detached.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -47,10 +47,10 @@ int main(int argc, char **argv)
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
-	if (size != 2)
+	if (size%2 != 0)
 	{
 		if (rank == 0)
-			FPRINTF(stderr, "We need exactly 2 processes.\n");
+			FPRINTF(stderr, "We need a even number of processes.\n");
 
 		MPI_Finalize();
 		return STARPU_TEST_SKIPPED;
@@ -58,8 +58,8 @@ int main(int argc, char **argv)
 
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize();
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+	ret = starpu_mpi_init(NULL, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 	tab = malloc(SIZE*sizeof(float));
 
@@ -67,12 +67,11 @@ int main(int argc, char **argv)
 
 	unsigned nloops = NITER;
 	unsigned loop;
-
-	int other_rank = (rank + 1)%2;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 	for (loop = 0; loop < nloops; loop++)
 	{
-		if (rank == 0)
+		if ((loop % 2) == (rank%2))
 		{
 			starpu_mpi_send(tab_handle, other_rank, loop, MPI_COMM_WORLD);
 		}

+ 7 - 8
mpi/tests/mpi_isend.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -32,10 +32,10 @@ int main(int argc, char **argv)
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
-	if (size != 2)
+	if (size%2 != 0)
 	{
 		if (rank == 0)
-			FPRINTF(stderr, "We need exactly 2 processes.\n");
+			FPRINTF(stderr, "We need a even number of processes.\n");
 
 		MPI_Finalize();
 		return STARPU_TEST_SKIPPED;
@@ -43,8 +43,8 @@ int main(int argc, char **argv)
 
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize();
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+	ret = starpu_mpi_init(NULL, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 	tab = malloc(SIZE*sizeof(float));
 
@@ -52,12 +52,11 @@ int main(int argc, char **argv)
 
 	unsigned nloops = NITER;
 	unsigned loop;
-
-	int other_rank = (rank + 1)%2;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 	for (loop = 0; loop < nloops; loop++)
 	{
-		if ((loop % 2) == rank)
+		if ((loop % 2) == (rank%2))
 		{
 			MPI_Status status;
 			starpu_mpi_req req;

+ 7 - 8
mpi/tests/mpi_isend_detached.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -47,10 +47,10 @@ int main(int argc, char **argv)
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
-	if (size != 2)
+	if (size%2 != 0)
 	{
 		if (rank == 0)
-			FPRINTF(stderr, "We need exactly 2 processes.\n");
+			FPRINTF(stderr, "We need a even number of processes.\n");
 
 		MPI_Finalize();
 		return STARPU_TEST_SKIPPED;
@@ -58,8 +58,8 @@ int main(int argc, char **argv)
 
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize();
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+	ret = starpu_mpi_init(NULL, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 	tab = malloc(SIZE*sizeof(float));
 
@@ -67,12 +67,11 @@ int main(int argc, char **argv)
 
 	unsigned nloops = NITER;
 	unsigned loop;
-
-	int other_rank = (rank + 1)%2;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 	for (loop = 0; loop < nloops; loop++)
 	{
-		if (rank == 0)
+		if ((loop % 2) == (rank%2))
 		{
 			int sent = 0;
 			starpu_mpi_isend_detached(tab_handle, other_rank, loop, MPI_COMM_WORLD, callback, &sent);

+ 30 - 11
mpi/examples/reduction/mpi_reduction.c

@@ -20,6 +20,7 @@
 extern void init_cpu_func(void *descr[], void *cl_arg);
 extern void redux_cpu_func(void *descr[], void *cl_arg);
 extern void dot_cpu_func(void *descr[], void *cl_arg);
+extern void display_cpu_func(void *descr[], void *cl_arg);
 
 static struct starpu_codelet init_codelet =
 {
@@ -46,6 +47,15 @@ static struct starpu_codelet dot_codelet =
 	.name = "dot_codelet"
 };
 
+static struct starpu_codelet display_codelet =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {display_cpu_func, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_R},
+	.name = "display_codelet"
+};
+
 /* Returns the MPI node number where data indexes index is */
 int my_distrib(int x, int nb_nodes)
 {
@@ -54,20 +64,24 @@ int my_distrib(int x, int nb_nodes)
 
 int main(int argc, char **argv)
 {
-        int my_rank, size, x, y;
+	int my_rank, size, x, y, i;
         long int *vector;
 	long int dot, sum=0;
         starpu_data_handle_t *handles;
 	starpu_data_handle_t dot_handle;
 
-	int nb_elements, step;
+	int nb_elements, step, loops;
 
 	int ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	starpu_mpi_initialize_extended(&my_rank, &size);
+	ret = starpu_mpi_init(&argc, &argv);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 	nb_elements = size*8000;
 	step = 4;
+	loops = 5;
 
 	vector = (long int *) malloc(nb_elements*sizeof(vector[0]));
         for(x = 0; x < nb_elements; x+=step)
@@ -84,7 +98,8 @@ int main(int argc, char **argv)
 	if (my_rank == 0) {
 		dot = 14;
 		sum = (nb_elements * (nb_elements + 1)) / 2;
-		sum+= dot;
+		sum *= loops;
+		sum += dot;
 		starpu_variable_data_register(&dot_handle, 0, (uintptr_t)&dot, sizeof(dot));
 	}
 	else
@@ -117,15 +132,19 @@ int main(int argc, char **argv)
 	starpu_data_set_tag(dot_handle, nb_elements+1);
 	starpu_data_set_reduction_methods(dot_handle, &redux_codelet, &init_codelet);
 
-	for (x = 0; x < nb_elements; x+=step)
+	for (i = 0; i < loops; i++)
 	{
-		starpu_mpi_insert_task(MPI_COMM_WORLD,
-				       &dot_codelet,
-				       STARPU_R, handles[x],
-				       STARPU_REDUX, dot_handle,
-				       0);
+		for (x = 0; x < nb_elements; x+=step)
+		{
+			starpu_mpi_insert_task(MPI_COMM_WORLD,
+					       &dot_codelet,
+					       STARPU_R, handles[x],
+					       STARPU_REDUX, dot_handle,
+					       0);
+		}
+		starpu_mpi_redux_data(MPI_COMM_WORLD, dot_handle);
+		starpu_mpi_insert_task(MPI_COMM_WORLD, &display_codelet, STARPU_R, dot_handle, 0);
 	}
-	starpu_mpi_redux_data(MPI_COMM_WORLD, dot_handle);
 
         fprintf(stderr, "Waiting ...\n");
         starpu_task_wait_for_all();

+ 10 - 0
mpi/examples/reduction/mpi_reduction_kernels.c

@@ -64,3 +64,13 @@ void dot_cpu_func(void *descr[], void *cl_arg)
 //	_DISPLAY("After dot=%ld\n", *dot);
 }
 
+/*
+ *	Display codelet
+ */
+void display_cpu_func(void *descr[], void *cl_arg)
+{
+	long int *local_x = (long int *)STARPU_VECTOR_GET_PTR(descr[0]);
+
+	_DISPLAY("Local=%ld\n", *local_x);
+}
+

+ 18 - 3
mpi/examples/scatter_gather/mpi_scatter_gather.c

@@ -54,6 +54,18 @@ static struct starpu_codelet cl =
 	.modes = {STARPU_RW},
 };
 
+void scallback(void *arg __attribute__((unused)))
+{
+	char *msg = arg;
+	fprintf(stderr, "Sending completed for <%s>\n", msg);
+}
+
+void rcallback(void *arg __attribute__((unused)))
+{
+	char *msg = arg;
+	fprintf(stderr, "Reception completed for <%s>\n", msg);
+}
+
 int main(int argc, char **argv)
 {
         int rank, nodes;
@@ -69,7 +81,10 @@ int main(int argc, char **argv)
 
 	int ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	starpu_mpi_initialize_extended(&rank, &nodes);
+	ret = starpu_mpi_init(&argc, &argv);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
 
 	if (rank == 0)
 	{
@@ -152,7 +167,7 @@ int main(int argc, char **argv)
         }
 
 	/* Scatter the matrix among the nodes */
-	starpu_mpi_scatter_detached(data_handles, nblocks*nblocks, 0, MPI_COMM_WORLD);
+	starpu_mpi_scatter_detached(data_handles, nblocks*nblocks, 0, MPI_COMM_WORLD, scallback, "scatter", NULL, NULL);
 
 	/* Calculation */
 	for(x = 0; x < nblocks*nblocks ;  x++)
@@ -172,7 +187,7 @@ int main(int argc, char **argv)
 	}
 
 	/* Gather the matrix on main node */
-	starpu_mpi_gather_detached(data_handles, nblocks*nblocks, 0, MPI_COMM_WORLD);
+	starpu_mpi_gather_detached(data_handles, nblocks*nblocks, 0, MPI_COMM_WORLD, scallback, "gather", rcallback, "gather");
 
 	/* Unregister matrix from StarPU */
 	for(x=0 ; x<nblocks*nblocks ; x++)

+ 7 - 8
mpi/tests/mpi_test.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -32,10 +32,10 @@ int main(int argc, char **argv)
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
-	if (size != 2)
+	if (size%2 != 0)
 	{
 		if (rank == 0)
-			FPRINTF(stderr, "We need exactly 2 processes.\n");
+			FPRINTF(stderr, "We need a even number of processes.\n");
 
 		MPI_Finalize();
 		return STARPU_TEST_SKIPPED;
@@ -43,8 +43,8 @@ int main(int argc, char **argv)
 
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize();
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+	ret = starpu_mpi_init(NULL, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 	tab = malloc(SIZE*sizeof(float));
 
@@ -52,14 +52,13 @@ int main(int argc, char **argv)
 
 	unsigned nloops = NITER;
 	unsigned loop;
-
-	int other_rank = (rank + 1)%2;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 	for (loop = 0; loop < nloops; loop++)
 	{
 		starpu_mpi_req req;
 
-		if ((loop % 2) == rank)
+		if ((loop % 2) == (rank%2))
 		{
                         starpu_mpi_isend(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
 		}

+ 5 - 3
mpi/tests/multiple_send.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -30,8 +30,10 @@ int main(int argc, char **argv)
 
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize_extended(&rank, &size);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+	ret = starpu_mpi_init(&argc, &argv);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 	if (size < 2)
 	{

+ 9 - 7
mpi/tests/pingpong.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -32,10 +32,10 @@ int main(int argc, char **argv)
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
-	if (size != 2)
+	if (size%2 != 0)
 	{
 		if (rank == 0)
-			FPRINTF(stderr, "We need exactly 2 processes.\n");
+			FPRINTF(stderr, "We need a even number of processes.\n");
 
 		MPI_Finalize();
 		return STARPU_TEST_SKIPPED;
@@ -43,8 +43,8 @@ int main(int argc, char **argv)
 
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize();
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+	ret = starpu_mpi_init(NULL, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 	tab = malloc(SIZE*sizeof(float));
 
@@ -52,17 +52,19 @@ int main(int argc, char **argv)
 
 	unsigned nloops = NITER;
 	unsigned loop;
-	int other_rank = (rank + 1)%2;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
 	for (loop = 0; loop < nloops; loop++)
 	{
-		if ((loop % 2) == rank)
+		if ((loop % 2) == (rank%2))
 		{
+			//FPRINTF_MPI("Sending to %d\n", other_rank);
 			starpu_mpi_send(tab_handle, other_rank, loop, MPI_COMM_WORLD);
 		}
 		else
 		{
 			MPI_Status status;
+			//FPRINTF_MPI("Receiving from %d\n", other_rank);
 			starpu_mpi_recv(tab_handle, other_rank, loop, MPI_COMM_WORLD, &status);
 		}
 	}

+ 7 - 3
mpi/tests/ring.c

@@ -18,7 +18,11 @@
 #include <starpu_mpi.h>
 #include "helper.h"
 
-#define NITER	2048
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	32
+#else
+#  define NITER	2048
+#endif
 
 unsigned token = 42;
 starpu_data_handle_t token_handle;
@@ -75,8 +79,8 @@ int main(int argc, char **argv)
 
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize();
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+	ret = starpu_mpi_init(NULL, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
 

+ 7 - 3
mpi/tests/ring_async.c

@@ -18,7 +18,11 @@
 #include <starpu_mpi.h>
 #include "helper.h"
 
-#define NITER	2048
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	32
+#else
+#  define NITER	2048
+#endif
 
 unsigned token = 42;
 starpu_data_handle_t token_handle;
@@ -75,8 +79,8 @@ int main(int argc, char **argv)
 
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize();
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+	ret = starpu_mpi_init(NULL, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
 

+ 9 - 11
mpi/tests/ring_async_implicit.c

@@ -18,7 +18,11 @@
 #include <starpu_mpi.h>
 #include "helper.h"
 
-#define NITER	2048
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	32
+#else
+#  define NITER	2048
+#endif
 
 unsigned token = 42;
 starpu_data_handle_t token_handle;
@@ -59,16 +63,12 @@ int main(int argc, char **argv)
 {
 	int ret, rank, size;
 
-#if 0
-	MPI_Init(NULL, NULL);
-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-	MPI_Comm_size(MPI_COMM_WORLD, &size);
-#endif
-
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_initialize_extended(&rank, &size);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+	ret = starpu_mpi_init(NULL, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 	if (size < 2)
 	{
@@ -121,8 +121,6 @@ int main(int argc, char **argv)
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 
-        //MPI_Finalize();
-
 	if (rank == last_rank)
 	{
                 FPRINTF(stderr, "[%d] token = %u == %u * %d ?\n", rank, token, nloops, size);

+ 1 - 0
src/Makefile.am

@@ -198,6 +198,7 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 						\
 	util/execute_on_all.c					\
 	util/starpu_create_sync_task.c				\
 	util/file.c						\
+	util/misc.c						\
 	util/starpu_data_cpy.c					\
 	util/starpu_insert_task.c				\
 	util/starpu_insert_task_utils.c				\

+ 4 - 3
src/common/fxt.h

@@ -29,6 +29,7 @@
 #include <sys/types.h>
 #include <stdlib.h>
 #include <common/config.h>
+#include <common/utils.h>
 #include <starpu.h>
 
 /* some key to identify the worker kind */
@@ -201,7 +202,7 @@ do {									\
 
 #define _STARPU_TRACE_START_CODELET_BODY(job)				\
 do {									\
-        const char *model_name = _starpu_get_model_name((job));         \
+        const char *model_name = _starpu_job_get_model_name((job));         \
 	if (model_name)                                                 \
 	{								\
 		/* we include the symbol name */			\
@@ -261,7 +262,7 @@ do {									\
 #define _STARPU_TRACE_TASK_DONE(job)						\
 do {										\
 	unsigned exclude_from_dag = (job)->exclude_from_dag;			\
-        const char *model_name = _starpu_get_model_name((job));                       \
+        const char *model_name = _starpu_job_get_model_name((job));                       \
 	if (model_name)					                        \
 	{									\
 		_STARPU_FUT_DO_PROBE4STR(_STARPU_FUT_TASK_DONE, (job)->job_id, _starpu_gettid(), (long unsigned)exclude_from_dag, 1, model_name);\
@@ -274,7 +275,7 @@ do {										\
 #define _STARPU_TRACE_TAG_DONE(tag)						\
 do {										\
         struct _starpu_job *job = (tag)->job;                                  \
-        const char *model_name = _starpu_get_model_name((job));                       \
+        const char *model_name = _starpu_job_get_model_name((job));                       \
 	if (model_name)                                                         \
 	{									\
           _STARPU_FUT_DO_PROBE3STR(_STARPU_FUT_TAG_DONE, (tag)->id, _starpu_gettid(), 1, model_name); \

+ 192 - 27
src/common/utils.h

@@ -43,7 +43,7 @@
 #  define _STARPU_LOG_OUT_TAG(outtag)
 #endif
 
-#define _STARPU_DISP(fmt, args ...) fprintf(stderr, "[starpu][%s] " fmt ,__func__ ,##args)
+#define _STARPU_DISP(fmt, args ...) do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, "[starpu][%s] " fmt ,__func__ ,##args); }} while(0)
 #define _STARPU_ERROR(fmt, args ...)                                                  \
 	do {                                                                          \
                 fprintf(stderr, "[starpu][%s] Error: " fmt ,__func__ ,##args);        \
@@ -61,6 +61,14 @@ char *_starpu_get_home_path(void);
 /* If FILE is currently on a comment line, eat it.  */
 void _starpu_drop_comments(FILE *f);
 
+struct _starpu_job;
+/* Returns the symbol associated to that job if any. */
+const char *_starpu_job_get_model_name(struct _starpu_job *j);
+
+struct starpu_codelet;
+/* Returns the symbol associated to that job if any. */
+const char *_starpu_codelet_get_model_name(struct starpu_codelet *cl);
+
 #define _STARPU_PTHREAD_CREATE(thread, attr, routine, arg) do {                \
 	int p_ret = pthread_create((thread), (attr), (routine), (arg));	       \
 	if (STARPU_UNLIKELY(p_ret != 0)) {                                     \
@@ -70,7 +78,10 @@ void _starpu_drop_comments(FILE *f);
 	}                                                                      \
 } while (0)
 
-#define _STARPU_PTHREAD_MUTEX_INIT(mutex, attr) {                              \
+/*
+ * Encapsulation of the pthread_mutex_* functions.
+ */
+#define _STARPU_PTHREAD_MUTEX_INIT(mutex, attr) do {                           \
 	int p_ret = pthread_mutex_init((mutex), (attr));                       \
 	if (STARPU_UNLIKELY(p_ret)) {                                          \
 		fprintf(stderr,                                                \
@@ -78,8 +89,9 @@ void _starpu_drop_comments(FILE *f);
 			__FILE__, __LINE__, strerror(p_ret));                  \
 		STARPU_ABORT();                                                \
 	}                                                                      \
-}
-#define _STARPU_PTHREAD_MUTEX_DESTROY(mutex) {                                 \
+} while (0)
+
+#define _STARPU_PTHREAD_MUTEX_DESTROY(mutex) do {                              \
 	int p_ret = pthread_mutex_destroy(mutex);                              \
 	if (STARPU_UNLIKELY(p_ret)) {                                          \
 		fprintf(stderr,                                                \
@@ -87,45 +99,198 @@ void _starpu_drop_comments(FILE *f);
 			__FILE__, __LINE__, strerror(p_ret));                  \
 		STARPU_ABORT();                                                \
 	}                                                                      \
-}
-#define _STARPU_PTHREAD_MUTEX_LOCK(mutex) {                                    \
+} while(0)
+
+#define _STARPU_PTHREAD_MUTEX_LOCK(mutex) do {                                 \
 	int p_ret = pthread_mutex_lock(mutex);                                 \
 	if (STARPU_UNLIKELY(p_ret)) {                                          \
 		fprintf(stderr,                                                \
-			"%s:%d pthread_mutex_lock : %s\n",                     \
+			"%s:%d pthread_mutex_lock: %s\n",                      \
 			__FILE__, __LINE__, strerror(p_ret));                  \
 		STARPU_ABORT();                                                \
 	}                                                                      \
-}
+} while (0)
 
-#define _STARPU_PTHREAD_MUTEX_UNLOCK(mutex) {                                  \
+#define _STARPU_PTHREAD_MUTEX_UNLOCK(mutex) do {                               \
 	int p_ret = pthread_mutex_unlock(mutex);                               \
 	if (STARPU_UNLIKELY(p_ret)) {                                          \
 		fprintf(stderr,                                                \
-			"%s:%d pthread_mutex_unlock : %s\n",                   \
+			"%s:%d pthread_mutex_unlock: %s\n",                    \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
+
+/*
+ * Encapsulation of the pthread_rwlock_* functions.
+ */
+#define _STARPU_PTHREAD_RWLOCK_INIT(rwlock, attr) do {                         \
+	int p_ret = pthread_rwlock_init((rwlock), (attr));                     \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_rwlock_init: %s\n",                     \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
+
+#define _STARPU_PTHREAD_RWLOCK_RDLOCK(rwlock) do {                             \
+	int p_ret = pthread_rwlock_rdlock(rwlock);                             \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_rwlock_rdlock: %s\n",                   \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
+
+#define _STARPU_PTHREAD_RWLOCK_WRLOCK(rwlock) do {                             \
+	int p_ret = pthread_rwlock_wrlock(rwlock);                             \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_rwlock_wrlock: %s\n",                   \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
+
+#define _STARPU_PTHREAD_RWLOCK_UNLOCK(rwlock) do {                             \
+	int p_ret = pthread_rwlock_unlock(rwlock);                             \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_rwlock_unlock: %s\n",                   \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
+
+#define _STARPU_PTHREAD_RWLOCK_DESTROY(rwlock) do {                            \
+	int p_ret = pthread_rwlock_destroy(rwlock);                            \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_rwlock_destroy: %s\n",                  \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
+
+/*
+ * Encapsulation of the pthread_cond_* functions.
+ */
+#define _STARPU_PTHREAD_COND_INIT(cond, attr) do {                             \
+	int p_ret = pthread_cond_init((cond), (attr));                         \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_cond_init: %s\n",                       \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
+
+#define _STARPU_PTHREAD_COND_DESTROY(cond) do {                                \
+	int p_ret = pthread_cond_destroy(cond);                                \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_cond_destroy: %s\n",                    \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+			STARPU_ABORT();                                        \
+	}                                                                      \
+} while (0)
+
+#define _STARPU_PTHREAD_COND_SIGNAL(cond) do {                                 \
+	int p_ret = pthread_cond_signal(cond);                                 \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_cond_signal: %s\n",                     \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
+
+#define _STARPU_PTHREAD_COND_BROADCAST(cond) do {                              \
+	int p_ret = pthread_cond_broadcast(cond);                              \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_cond_broadcast: %s\n",                  \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
+
+#define _STARPU_PTHREAD_COND_WAIT(cond, mutex) do {                            \
+	int p_ret = pthread_cond_wait((cond), (mutex));                        \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_cond_wait: %s\n",                       \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
+
+/*
+ * Encapsulation of the pthread_barrier_* functions.
+ */
+#define _STARPU_PTHREAD_BARRIER_INIT(barrier, attr, count) do {                \
+	int p_ret = pthread_barrier_init((barrier), (attr), (count));          \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_barrier_init: %s\n",                    \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
+
+#define _STARPU_PTHREAD_BARRIER_DESTROY(barrier) do {                          \
+	int p_ret = pthread_barrier_destroy((barrier));                        \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_barrier_destroy: %s\n",                 \
 			__FILE__, __LINE__, strerror(p_ret));                  \
 		STARPU_ABORT();                                                \
 	}                                                                      \
-}
+} while (0)
 
-#define _STARPU_PTHREAD_RWLOCK_INIT(rwlock, attr) { int p_ret = pthread_rwlock_init((rwlock), (attr)); if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_rwlock_init : %s\n", strerror(p_ret)); STARPU_ABORT();}}
-#define _STARPU_PTHREAD_RWLOCK_RDLOCK(rwlock) { int p_ret = pthread_rwlock_rdlock(rwlock); if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_rwlock_rdlock : %s\n", strerror(p_ret)); STARPU_ABORT();}}
-#define _STARPU_PTHREAD_RWLOCK_WRLOCK(rwlock) { int p_ret = pthread_rwlock_wrlock(rwlock); if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_rwlock_wrlock : %s\n", strerror(p_ret)); STARPU_ABORT();}}
-#define _STARPU_PTHREAD_RWLOCK_UNLOCK(rwlock) { int p_ret = pthread_rwlock_unlock(rwlock); if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_rwlock_unlock : %s\n", strerror(p_ret)); STARPU_ABORT();}}
-#define _STARPU_PTHREAD_RWLOCK_DESTROY(rwlock) { int p_ret = pthread_rwlock_destroy(rwlock); if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_rwlock_destroy : %s\n", strerror(p_ret)); STARPU_ABORT();}}
+#define _STARPU_PTHREAD_BARRIER_WAIT(barrier) do {                             \
+	int p_ret = pthread_barrier_wait(barrier);                             \
+	if (STARPU_UNLIKELY(!((p_ret == 0) || (p_ret == PTHREAD_BARRIER_SERIAL_THREAD)))) { \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_barrier_wait: %s\n",                    \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+			STARPU_ABORT();                                        \
+	}                                                                      \
+} while (0)
 
-#define _STARPU_PTHREAD_COND_INIT(cond, attr) { int p_ret = pthread_cond_init((cond), (attr)); if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_cond_init : %s\n", strerror(p_ret)); STARPU_ABORT();}}
-#define _STARPU_PTHREAD_COND_DESTROY(cond) { int p_ret = pthread_cond_destroy(cond); if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_cond_destroy : %s\n", strerror(p_ret)); STARPU_ABORT();}}
-#define _STARPU_PTHREAD_COND_SIGNAL(cond) { int p_ret = pthread_cond_signal(cond); if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_cond_signal : %s\n", strerror(p_ret)); STARPU_ABORT();}}
-#define _STARPU_PTHREAD_COND_BROADCAST(cond) { int p_ret = pthread_cond_broadcast(cond); if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_cond_broadcast : %s\n", strerror(p_ret)); STARPU_ABORT();}}
-#define _STARPU_PTHREAD_COND_WAIT(cond, mutex) { int p_ret = pthread_cond_wait((cond), (mutex)); if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_cond_wait : %s\n", strerror(p_ret)); STARPU_ABORT();}}
+/*
+ * Encapsulation of the pthread_spin_* functions.
+ */
+#define _STARPU_PTHREAD_SPIN_DESTROY(lock) do {                                \
+	int p_ret = pthread_spin_destroy(lock);                                \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_spin_destroy: %s\n",                    \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
 
-#define _STARPU_PTHREAD_BARRIER_INIT(barrier, attr, count) { int p_ret = pthread_barrier_init((barrier), (attr), (count)); if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_barrier_init : %s\n", strerror(p_ret)); STARPU_ABORT();}}
-#define _STARPU_PTHREAD_BARRIER_DESTROY(barrier) { int p_ret = pthread_barrier_destroy((barrier)); if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_barrier_destroy : %s\n", strerror(p_ret)); STARPU_ABORT();}}
-#define _STARPU_PTHREAD_BARRIER_WAIT(barrier) { int p_ret = pthread_barrier_wait(barrier); if (STARPU_UNLIKELY(!((p_ret == 0) || (p_ret == PTHREAD_BARRIER_SERIAL_THREAD)))) { fprintf(stderr, "pthread_barrier_wait : %s\n", strerror(p_ret)); STARPU_ABORT();}}
+#define _STARPU_PTHREAD_SPIN_LOCK(lock) do {                                   \
+	int p_ret = pthread_spin_lock(lock);                                   \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_spin_lock: %s\n",                       \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
 
-#define _STARPU_PTHREAD_SPIN_DESTROY(lock) { int p_ret = pthread_spin_destroy(lock); if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_spin_destroy : %s\n", strerror(p_ret)); STARPU_ABORT();}}
-#define _STARPU_PTHREAD_SPIN_LOCK(lock) { int p_ret = pthread_spin_lock(lock);  if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_spin_lock : %s\n", strerror(p_ret)); STARPU_ABORT();}}
-#define _STARPU_PTHREAD_SPIN_UNLOCK(lock) { int p_ret = pthread_spin_unlock(lock);  if (STARPU_UNLIKELY(p_ret)) { fprintf(stderr, "pthread_spin_unlock : %s\n", strerror(p_ret)); STARPU_ABORT();}}
+#define _STARPU_PTHREAD_SPIN_UNLOCK(lock) do {                                 \
+	int p_ret = pthread_spin_unlock(lock);                                 \
+	if (STARPU_UNLIKELY(p_ret)) {                                          \
+		fprintf(stderr,                                                \
+			"%s:%d pthread_spin_unlock: %s\n",                     \
+			__FILE__, __LINE__, strerror(p_ret));                  \
+		STARPU_ABORT();                                                \
+	}                                                                      \
+} while (0)
 
 #endif // __COMMON_UTILS_H__

+ 0 - 43
src/core/jobs.c

@@ -29,28 +29,6 @@
 #include <starpu_top.h>
 #include <top/starpu_top_core.h>
 
-size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, struct _starpu_job *j)
-{
-	struct starpu_task *task = j->task;
-
-	if (model && model->per_arch[arch][nimpl].size_base) {
-		return model->per_arch[arch][nimpl].size_base(task, arch, nimpl);
-	} else if (model && model->size_base) {
-		return model->size_base(task, nimpl);
-	} else {
-		unsigned nbuffers = task->cl->nbuffers;
-		size_t size = 0;
-
-		unsigned buffer;
-		for (buffer = 0; buffer < nbuffers; buffer++)
-		{
-			starpu_data_handle_t handle = task->handles[buffer];
-			size += _starpu_data_get_size(handle);
-		}
-		return size;
-	}
-}
-
 /* we need to identify each task to generate the DAG. */
 static unsigned job_cnt = 0;
 
@@ -420,24 +398,3 @@ int _starpu_push_local_task(struct _starpu_worker *worker, struct starpu_task *t
 
 	return 0;
 }
-
-const char *_starpu_get_model_name(struct _starpu_job *j)
-{
-	if (!j)
-		return NULL;
-
-	struct starpu_task *task = j->task;
-        if (task && task->cl) {
-            if (task->cl->model && task->cl->model->symbol)
-                return task->cl->model->symbol;
-	    else
-		return task->cl->name;
-	} else
-	{
-#ifdef STARPU_USE_FXT
-                return j->model_name;
-#else
-                return NULL;
-#endif
-        }
-}

+ 0 - 3
src/core/jobs.h

@@ -171,7 +171,4 @@ struct starpu_task *_starpu_pop_local_task(struct _starpu_worker *worker);
  * enforce a FIFO ordering. */
 int _starpu_push_local_task(struct _starpu_worker *worker, struct starpu_task *task, int back);
 
-/* Returns the symbol associated to that job if any. */
-const char *_starpu_get_model_name(struct _starpu_job *j);
-
 #endif // __JOBS_H__

+ 7 - 7
src/core/perfmodel/perfmodel_bus.c

@@ -1010,7 +1010,7 @@ static int load_bus_bandwidth_file_content(void)
 			n = fscanf(f, "%lf", &bandwidth);
 			if (n != 1)
 			{
-				fprintf(stderr,"Error while reading sampling file <%s>. Expected a number\n", path);
+				_STARPU_DISP("Error while reading sampling file <%s>. Expected a number\n", path);
 				fclose(f);
 				return 0;
 			}
@@ -1267,21 +1267,21 @@ static void check_bus_config_file(void)
                 // Checking if both configurations match
                 if (read_cpus != ncpus)
 		{
-			fprintf(stderr, "Current configuration does not match the bus performance model (CPUS: (stored) %u != (current) %u), recalibrating...", read_cpus, ncpus);
+			_STARPU_DISP("Current configuration does not match the bus performance model (CPUS: (stored) %u != (current) %u), recalibrating...\n", read_cpus, ncpus);
                         starpu_force_bus_sampling();
-			fprintf(stderr, "done\n");
+			_STARPU_DISP("... done\n");
                 }
                 else if (read_cuda != ncuda)
 		{
-                        fprintf(stderr, "Current configuration does not match the bus performance model (CUDA: (stored) %d != (current) %d), recalibrating...", read_cuda, ncuda);
+                        _STARPU_DISP("Current configuration does not match the bus performance model (CUDA: (stored) %d != (current) %d), recalibrating...\n", read_cuda, ncuda);
                         starpu_force_bus_sampling();
-			fprintf(stderr, "done\n");
+			_STARPU_DISP("... done\n");
                 }
                 else if (read_opencl != nopencl)
 		{
-                        fprintf(stderr, "Current configuration does not match the bus performance model (OpenCL: (stored) %d != (current) %d), recalibrating...", read_opencl, nopencl);
+                        _STARPU_DISP("Current configuration does not match the bus performance model (OpenCL: (stored) %d != (current) %d), recalibrating...\n", read_opencl, nopencl);
                         starpu_force_bus_sampling();
-			fprintf(stderr, "done\n");
+			_STARPU_DISP("... done\n");
                 }
         }
 }

+ 26 - 1
src/core/perfmodel/perfmodel_history.c

@@ -53,6 +53,28 @@ struct starpu_perfmodel_history_table
 static pthread_rwlock_t registered_models_rwlock;
 static struct _starpu_perfmodel_list *registered_models = NULL;
 
+size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, struct _starpu_job *j)
+{
+	struct starpu_task *task = j->task;
+
+	if (model && model->per_arch[arch][nimpl].size_base) {
+		return model->per_arch[arch][nimpl].size_base(task, arch, nimpl);
+	} else if (model && model->size_base) {
+		return model->size_base(task, nimpl);
+	} else {
+		unsigned nbuffers = task->cl->nbuffers;
+		size_t size = 0;
+
+		unsigned buffer;
+		for (buffer = 0; buffer < nbuffers; buffer++)
+		{
+			starpu_data_handle_t handle = task->handles[buffer];
+			size += _starpu_data_get_size(handle);
+		}
+		return size;
+	}
+}
+
 /*
  * History based model
  */
@@ -1067,7 +1089,10 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 			exp = entry->history_entry->mean;
 		else if (!model->benchmarking)
 		{
-			_STARPU_DISP("Warning: model %s is not calibrated enough, forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol);
+			char archname[32];
+
+			starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
+			_STARPU_DISP("Warning: model %s is not calibrated enough for %s, forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname);
 			_starpu_set_calibrate_flag(1);
 			model->benchmarking = 1;
 		}

+ 1 - 2
src/core/sched_policy.c

@@ -426,8 +426,7 @@ struct starpu_task *_starpu_create_conversion_task(starpu_data_handle_t handle,
 		}
 #endif
 		default:
-			fprintf(stderr, "Oops : %u\n", handle->mf_node);
-			STARPU_ABORT();
+			_STARPU_ERROR("Oops : %u\n", handle->mf_node);
 		}
 		break;
 #ifdef STARPU_USE_CUDA

+ 29 - 4
src/core/task.c

@@ -262,7 +262,7 @@ void _starpu_codelet_check_deprecated_fields(struct starpu_codelet *cl)
 	/* CPU */
 	if (cl->cpu_func && cl->cpu_func != STARPU_MULTIPLE_CPU_IMPLEMENTATIONS && cl->cpu_funcs[0])
 	{
-		fprintf(stderr, "[warning] [struct starpu_codelet] both cpu_func and cpu_funcs are set. Ignoring cpu_func.\n");
+		_STARPU_DISP("[warning] [struct starpu_codelet] both cpu_func and cpu_funcs are set. Ignoring cpu_func.\n");
 		cl->cpu_func = STARPU_MULTIPLE_CPU_IMPLEMENTATIONS;
 	}
 	if (cl->cpu_func && cl->cpu_func != STARPU_MULTIPLE_CPU_IMPLEMENTATIONS)
@@ -282,7 +282,7 @@ void _starpu_codelet_check_deprecated_fields(struct starpu_codelet *cl)
 	/* CUDA */
 	if (cl->cuda_func && cl->cuda_func != STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS && cl->cuda_funcs[0])
 	{
-		fprintf(stderr, "[warning] [struct starpu_codelet] both cuda_func and cuda_funcs are set. Ignoring cuda_func.\n");
+		_STARPU_DISP("[warning] [struct starpu_codelet] both cuda_func and cuda_funcs are set. Ignoring cuda_func.\n");
 		cl->cuda_func = STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS;
 	}
 	if (cl->cuda_func && cl->cuda_func != STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS)
@@ -302,7 +302,7 @@ void _starpu_codelet_check_deprecated_fields(struct starpu_codelet *cl)
 	/* OpenCL */
 	if (cl->opencl_func && cl->opencl_func != STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS && cl->opencl_funcs[0])
 	{
-		fprintf(stderr, "[warning] [struct starpu_codelet] both opencl_func and opencl_funcs are set. Ignoring opencl_func.\n");
+		_STARPU_DISP("[warning] [struct starpu_codelet] both opencl_func and opencl_funcs are set. Ignoring opencl_func.\n");
 		cl->opencl_func = STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS;
 	}
 	if (cl->opencl_func && cl->opencl_func != STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS)
@@ -344,7 +344,7 @@ void _starpu_task_check_deprecated_fields(struct starpu_task *task)
 		{
 			if (task->buffers[i].handle && task->handles[i])
 			{
-				fprintf(stderr, "[warning][struct starpu_task] task->buffers[%u] and task->handles[%u] both set. Ignoring task->buffers[%u] ?\n", i, i, i);
+				_STARPU_DISP("[warning][struct starpu_task] task->buffers[%u] and task->handles[%u] both set. Ignoring task->buffers[%u] ?\n", i, i, i);
 				STARPU_ASSERT(task->buffers[i].mode == task->cl->modes[i]);
 				STARPU_ABORT();
 			}
@@ -370,6 +370,7 @@ int starpu_task_submit(struct starpu_task *task)
 //	   0 : starpu_get_sched_ctx());
 	int ret;
 	unsigned is_sync = task->synchronous;
+	starpu_task_bundle_t bundle = task->bundle;
         _STARPU_LOG_IN();
 
 	if (is_sync)
@@ -449,6 +450,30 @@ int starpu_task_submit(struct starpu_task *task)
 		}
 	}
 
+	if (bundle)
+	{
+		/* We need to make sure that models for other tasks of the
+		 * bundle are also loaded, so the scheduler can estimate the
+		 * duration of the whole bundle */
+		_STARPU_PTHREAD_MUTEX_LOCK(&bundle->mutex);
+
+		struct _starpu_task_bundle_entry *entry;
+		entry = bundle->list;
+
+		while (entry)
+		{
+			if (entry->task->cl->model && entry->task->cl->model->symbol)
+				_starpu_load_perfmodel(entry->task->cl->model);
+
+			if (entry->task->cl->power_model && entry->task->cl->power_model->symbol)
+				_starpu_load_perfmodel(entry->task->cl->power_model);
+
+			entry = entry->next;
+		}
+
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
+	}
+
 	/* If profiling is activated, we allocate a structure to store the
 	 * appropriate info. */
 	struct starpu_task_profiling_info *info;

+ 7 - 37
src/core/topology.c

@@ -471,10 +471,7 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 			{
 				/* The user requires more CUDA devices than
 				 * there is available */
-				fprintf(stderr,
-					"# Warning: %d CUDA devices "
-					"requested. Only %d available.\n",
-					ncuda, nb_devices);
+				_STARPU_DISP("Warning: %d CUDA devices requested. Only %d available.\n", ncuda, nb_devices);
 				ncuda = nb_devices;
 			}
 		}
@@ -527,14 +524,7 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 			nopencl = nb_devices;
 			if (nopencl > STARPU_MAXOPENCLDEVS)
 			{
-				fprintf(stderr,
-					"# Warning: %d OpenCL devices "
-					"available. Only %d enabled. "
-					"Use configure option "
-					"--enable-maxopencldadev=xxx to "
-					"update the maximum value of "
-					"supported OpenCL devices.\n",
-					nb_devices, STARPU_MAXOPENCLDEVS);
+				_STARPU_DISP("Warning: %d OpenCL devices available. Only %d enabled. Use configure option --enable-maxopencldadev=xxx to update the maximum value of supported OpenCL devices.\n", nb_devices, STARPU_MAXOPENCLDEVS);
 				nopencl = STARPU_MAXOPENCLDEVS;
 			}
 		}
@@ -545,23 +535,13 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 			{
 				/* The user requires more OpenCL devices than
 				 * there is available */
-				fprintf(stderr,
-					"# Warning: %d OpenCL devices "
-					"requested. Only %d available.\n",
-					nopencl, nb_devices);
+				_STARPU_DISP("Warning: %d OpenCL devices requested. Only %d available.\n", nopencl, nb_devices);
 				nopencl = nb_devices;
 			}
 			/* Let's make sure this value is OK. */
 			if (nopencl > STARPU_MAXOPENCLDEVS)
 			{
-				fprintf(stderr,
-					"# Warning: %d OpenCL devices "
-					"requested. Only %d enabled. Use "
-					"configure option "
-					"--enable-maxopencldev=xxx to update "
-					"the maximum value of supported "
-					"OpenCL devices.\n",
-					nopencl, STARPU_MAXOPENCLDEVS);
+				_STARPU_DISP("Warning: %d OpenCL devices requested. Only %d enabled. Use configure option --enable-maxopencldev=xxx to update the maximum value of supported OpenCL devices.\n", nopencl, STARPU_MAXOPENCLDEVS);
 				nopencl = STARPU_MAXOPENCLDEVS;
 			}
 		}
@@ -610,10 +590,7 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 			STARPU_ASSERT(ngordon <= NMAXGORDONSPUS);
 			if (ngordon > STARPU_MAXGORDONSPUS);
 			{
-				fprintf(stderr,
-					"# Warning: %d Gordon CPUs devices "
-					"requested. Only %d supported\n",
-					ngordon, NMAXGORDONSPUS);
+				_STARPU_DISP("Warning: %d Gordon CPUs devices requested. Only %d supported\n", ngordon, NMAXGORDONSPUS);
 				ngordon = NMAXGORDONSPUS;
 			}
 		}
@@ -658,13 +635,7 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 		{
 			if (ncpu > STARPU_MAXCPUS)
 			{
-				fprintf(stderr,
-					"# Warning: %d CPU devices requested."
-					" Only %d enabled. Use configure "
-					"option --enable-maxcpus=xxx to "
-					"update the maximum value of "
-					"supported CPU devices.\n",
-					ncpu, STARPU_MAXCPUS);
+				_STARPU_DISP("Warning: %d CPU devices requested. Only %d enabled. Use configure option --enable-maxcpus=xxx to update the maximum value of supported CPU devices.\n", ncpu, STARPU_MAXCPUS);
 				ncpu = STARPU_MAXCPUS;
 			}
 		}
@@ -757,8 +728,7 @@ _starpu_bind_thread_on_cpu (
 	DWORD mask = 1 << cpuid;
 	if (!SetThreadAffinityMask(GetCurrentThread(), mask))
 	{
-		fprintf(stderr,"SetThreadMaskAffinity(%lx) failed\n", mask);
-		STARPU_ABORT();
+		_STARPU_ERROR("SetThreadMaskAffinity(%lx) failed\n", mask);
 	}
 #else
 #warning no CPU binding support

+ 9 - 9
src/core/workers.c

@@ -652,31 +652,31 @@ int starpu_init(struct starpu_conf *user_conf)
 
 #ifdef __GNUC__
 #ifndef __OPTIMIZE__
-	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --enable-debug (-O0), and is thus not optimized\n");
+	_STARPU_DISP("Warning: StarPU was configured with --enable-debug (-O0), and is thus not optimized\n");
 #endif
 #endif
 #if 0
 #ifndef STARPU_NO_ASSERT
-	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured without --enable-fast\n");
+	_STARPU_DISP("Warning: StarPU was configured without --enable-fast\n");
 #endif
 #endif
 #ifdef STARPU_MEMORY_STATUS
-	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --enable-memory-status, which slows down a bit\n");
+	_STARPU_DISP("Warning: StarPU was configured with --enable-memory-status, which slows down a bit\n");
 #endif
 #ifdef STARPU_VERBOSE
-	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --enable-verbose, which slows down a bit\n");
+	_STARPU_DISP("Warning: StarPU was configured with --enable-verbose, which slows down a bit\n");
 #endif
 #ifdef STARPU_USE_FXT
-	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --with-fxt, which slows down a bit\n");
+	_STARPU_DISP("Warning: StarPU was configured with --with-fxt, which slows down a bit\n");
 #endif
 #ifdef STARPU_PERF_DEBUG
-	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --enable-perf-debug, which slows down a bit\n");
+	_STARPU_DISP("Warning: StarPU was configured with --enable-perf-debug, which slows down a bit\n");
 #endif
 #ifdef STARPU_MODEL_DEBUG
-	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --enable-model-debug, which slows down a bit\n");
+	_STARPU_DISP("Warning: StarPU was configured with --enable-model-debug, which slows down a bit\n");
 #endif
 #ifdef STARPU_DATA_STATS
-	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --enable-stats, which slows down a bit\n");
+	_STARPU_DISP("Warning: StarPU was configured with --enable-stats, which slows down a bit\n");
 #endif
 
 	_STARPU_PTHREAD_MUTEX_LOCK(&init_mutex);
@@ -725,7 +725,7 @@ int starpu_init(struct starpu_conf *user_conf)
 	else
 	{
 	     if (user_conf->magic != 42) {
-		fprintf(stderr, "starpu_conf structure needs to be initialized with starpu_conf_init\n");
+		_STARPU_DISP("starpu_conf structure needs to be initialized with starpu_conf_init\n");
 		return -EINVAL;
 	     }
 	     config.conf = user_conf;

+ 1 - 1
src/datawizard/interfaces/bcsr_interface.c

@@ -389,9 +389,9 @@ fail_colind:
 		{
 			cudaError_t err;
 			err = cudaFree((void*)addr_nzval);
-			break;
 			if (STARPU_UNLIKELY(err != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(err);
+			break;
 		}
 #endif
 #ifdef STARPU_USE_OPENCL

+ 6 - 0
src/datawizard/interfaces/coo_interface.c

@@ -368,8 +368,14 @@ allocate_coo_buffer_on_node(void *data_interface, uint32_t dst_node)
 	case STARPU_CPU_RAM:
 	{
 		addr_columns = malloc(n_values * sizeof(coo_interface->columns[0]));
+		if (STARPU_UNLIKELY(addr_columns == NULL))
+			goto fail_columns;
 		addr_rows = malloc(n_values * sizeof(coo_interface->rows[0]));
+		if (STARPU_UNLIKELY(addr_rows == NULL))
+			goto fail_rows;
 		addr_values = (uintptr_t) malloc(n_values * elemsize);
+		if (STARPU_UNLIKELY(addr_values == NULL))
+			goto fail_values;
 		break;
 	}
 #ifdef STARPU_USE_CUDA

+ 2 - 1
src/datawizard/interfaces/data_interface.c

@@ -681,9 +681,10 @@ int starpu_data_interface_get_next_id(void)
 	return _data_interface_number-1;
 }
 
-int starpu_handle_pack_data(starpu_data_handle_t handle, void **ptr)
+int starpu_handle_pack_data(starpu_data_handle_t handle, void **ptr, size_t *count)
 {
 	STARPU_ASSERT(handle->ops->pack_data);
+	*count = starpu_handle_get_size(handle);
 	return handle->ops->pack_data(handle, _starpu_get_local_memory_node(), ptr);
 }
 

+ 1 - 1
src/drivers/driver_common/driver_common.c

@@ -78,7 +78,7 @@ void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j,
 
 	_STARPU_TRACE_END_CODELET_BODY(j, j->nimpl, perf_arch);
 
-	if (cl->model && cl->model->benchmarking)
+	if (cl && cl->model && cl->model->benchmarking)
 		calibrate_model = 1;
 
 	if (rank == 0)

+ 24 - 10
src/profiling/bound.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
- * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -175,7 +175,8 @@ static int good_job(struct _starpu_job *j)
 	if (!j->task->cl->model)
 		return 0;
 	/* Only support history based */
-	if (j->task->cl->model->type != STARPU_HISTORY_BASED)
+	if (j->task->cl->model->type != STARPU_HISTORY_BASED
+	 && j->task->cl->model->type != STARPU_NL_REGRESSION_BASED)
 		return 0;
 	return 1;
 }
@@ -400,7 +401,7 @@ void starpu_bound_print_dot(FILE *output)
 	fprintf(output, "strict digraph bounddeps {\n");
 	for (t = tasks; t; t = t->next)
 	{
-		fprintf(output, "\"t%lu\" [label=\"%lu: %s\"]\n", t->id, t->id, t->cl->name);
+		fprintf(output, "\"t%lu\" [label=\"%lu: %s\"]\n", t->id, t->id, _starpu_codelet_get_model_name(t->cl));
 		for (i = 0; i < t->depsn; i++)
 			fprintf(output, "\"t%lu\" -> \"t%lu\"\n", t->deps[i]->id, t->id);
 	}
@@ -433,6 +434,11 @@ void starpu_bound_print_lp(FILE *output)
 		nt = 0;
 		for (t1 = tasks; t1; t1 = t1->next)
 		{
+			if (t1->cl->model->type != STARPU_HISTORY_BASED &&
+			    t1->cl->model->type != STARPU_NL_REGRESSION_BASED)
+				/* TODO: */
+				fprintf(stderr, "Warning: task %s uses a perf model which is neither history nor non-linear regression-based, support for such model is not implemented yet, system will not be solvable.\n", _starpu_codelet_get_model_name(t1->cl));
+
 			struct _starpu_job j =
 			{
 				.footprint = t1->footprint,
@@ -478,7 +484,7 @@ void starpu_bound_print_lp(FILE *output)
 		fprintf(output, "/* According to where the task is indeed executed */\n");
 		for (t1 = tasks; t1; t1 = t1->next)
 		{
-			fprintf(output, "/* %s %x */\tc%lu = s%lu", t1->cl->name, (unsigned) t1->footprint, t1->id, t1->id);
+			fprintf(output, "/* %s %x */\tc%lu = s%lu", _starpu_codelet_get_model_name(t1->cl), (unsigned) t1->footprint, t1->id, t1->id);
 			for (w = 0; w < nw; w++)
 			{
 				enum starpu_perf_archtype arch = starpu_worker_get_perf_archtype(w);
@@ -673,11 +679,19 @@ void starpu_bound_print_lp(FILE *output)
 			fprintf(output, "/* And we have to have computed exactly all tasks */\n");
 			for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
 			{
-				fprintf(output, "/* task %s key %x */\n0", tp->cl->name, (unsigned) tp->footprint);
-				for (w = 0; w < nw; w++)
-					if (!isnan(times[w*nt+t]))
+				int got_one = 0;
+				fprintf(output, "/* task %s key %x */\n0", _starpu_codelet_get_model_name(tp->cl), (unsigned) tp->footprint);
+				for (w = 0; w < nw; w++) {
+					if (isnan(times[w*nt+t]))
+						fprintf(stderr, "Warning: task %s has no performance measurement for worker %d.\n", _starpu_codelet_get_model_name(tp->cl), w);
+					else {
+						got_one = 1;
 						fprintf(output, "\t+w%dt%dn", w, t);
+					}
+				}
 				fprintf(output, " = %lu;\n", tp->n);
+				if (!got_one)
+					fprintf(stderr, "Warning: task %s has no performance measurement for any worker, system will not be solvable!\n", _starpu_codelet_get_model_name(tp->cl));
 				/* Show actual values */
 				fprintf(output, "/*");
 				for (w = 0; w < nw; w++)
@@ -750,7 +764,7 @@ void starpu_bound_print_mps(FILE *output)
 		fprintf(output, "\n* And we have to have computed exactly all tasks\n");
 		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
 		{
-			fprintf(output, "* task %s key %x\n", tp->cl->name, (unsigned) tp->footprint);
+			fprintf(output, "* task %s key %x\n", _starpu_codelet_get_model_name(tp->cl), (unsigned) tp->footprint);
 			fprintf(output, " E  T%d\n", t);
 		}
 
@@ -881,7 +895,7 @@ static glp_prob *_starpu_bound_glp_resolve(int integer)
 		{
 			char name[32], title[64];
 			starpu_worker_get_name(w, name, sizeof(name));
-			snprintf(title, sizeof(title), "task %s key %x", tp->cl->name, (unsigned) tp->footprint);
+			snprintf(title, sizeof(title), "task %s key %x", _starpu_codelet_get_model_name(tp->cl), (unsigned) tp->footprint);
 			glp_set_row_name(lp, nw+t+1, title);
 			for (w = 0; w < nw; w++)
 			{
@@ -949,7 +963,7 @@ void starpu_bound_print(FILE *output, int integer __attribute__ ((unused)))
 
 		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
 		{
-			fprintf(output, "%s key %x\n", tp->cl->name, (unsigned) tp->footprint);
+			fprintf(output, "%s key %x\n", _starpu_codelet_get_model_name(tp->cl), (unsigned) tp->footprint);
 			for (w = 0; w < nw; w++)
 				if (integer)
 					fprintf(output, "\tw%dt%dn %f", w, t, glp_mip_col_val(lp, colnum(w, t)));

+ 48 - 0
src/util/misc.c

@@ -0,0 +1,48 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <common/utils.h>
+#include <core/jobs.h>
+
+const char *_starpu_codelet_get_model_name(struct starpu_codelet *cl)
+{
+	if (!cl)
+		return NULL;
+
+	if (cl->model && cl->model->symbol && cl->model->symbol[0])
+		return cl->model->symbol;
+	else
+		return cl->name;
+}
+
+const char *_starpu_job_get_model_name(struct _starpu_job *j)
+{
+	const char *ret = NULL;
+
+	if (!j)
+		return NULL;
+
+	struct starpu_task *task = j->task;
+	if (task)
+		ret = _starpu_codelet_get_model_name(task->cl);
+
+#ifdef STARPU_USE_FXT
+	if (!ret)
+                ret = j->model_name;
+#endif
+        return ret;
+}

+ 1 - 2
starpu-1.0.pc.in

@@ -33,5 +33,4 @@ Libs: -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@
 @STARPU_CUDA_LDFLAGS@ @STARPU_OPENCL_LDFLAGS@
 @STARPU_SCHED_CTX_HYPERVISOR@
 Libs.private: @LDFLAGS@ @LIBS@ @LIBSTARPU_LDFLAGS@
-Requires: @HWLOC_REQUIRES@
-#Requires.private: @GORDON_REQUIRES@
+Requires: @HWLOC_REQUIRES@

+ 1 - 1
starpufft/starpufft-double.h

@@ -15,10 +15,10 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include <complex.h>
 #include <starpu_config.h>
 
 #if defined(STARPU_HAVE_FFTW) && !defined(__CUDACC__) 
+#include <complex.h>
 #include <fftw3.h>
 #endif
 

+ 1 - 1
starpufft/starpufft-float.h

@@ -15,10 +15,10 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include <complex.h>
 #include <starpu_config.h>
 
 #if defined(STARPU_HAVE_FFTW) && !defined(__CUDACC__) 
+#include <complex.h>
 #include <fftw3.h>
 #endif
 

+ 2 - 0
tests/microbenchs/tasks_size_overhead.gp

@@ -16,6 +16,8 @@ gnuplot << EOF
 set terminal eps
 set output "tasks_size_overhead.eps"
 set key top left
+set xlabel "number of cores"
+set ylabel "speedup"
 plot \
 	"$OUTPUT" using 1:($VAL1)/(\$3) with linespoints title columnheader(2), \
 	"$OUTPUT" using 1:($VAL2)/(\$5) with linespoints title columnheader(4), \