Browse Source

new merge with trunk, heft & dmda important changes

Andra Hugo 13 years ago
parent
commit
cab880bfd5
100 changed files with 3267 additions and 1420 deletions
  1. 3 1
      .gitignore
  2. 9 11
      AUTHORS
  3. 6 1
      ChangeLog
  4. 24 0
      Makefile.am
  5. 5 3
      README
  6. 159 0
      build-aux/pmccabe.css
  7. 907 0
      build-aux/pmccabe2html
  8. 81 32
      configure.ac
  9. 2 1
      doc/Makefile.am
  10. 7 8
      doc/chapters/advanced-api.texi
  11. 18 10
      doc/chapters/advanced-examples.texi
  12. 42 14
      doc/chapters/basic-api.texi
  13. 16 16
      doc/chapters/basic-examples.texi
  14. 1 1
      doc/chapters/benchmarks.texi
  15. 234 109
      doc/chapters/configuration.texi
  16. 0 1
      doc/chapters/fdl-1.3.texi
  17. 10 10
      doc/chapters/installing.texi
  18. 2 2
      doc/chapters/introduction.texi
  19. 39 10
      doc/chapters/mpi-support.texi
  20. 131 5
      doc/chapters/perf-feedback.texi
  21. 83 5
      doc/chapters/perf-optimization.texi
  22. 6 7
      doc/chapters/scaling-vector-example.texi
  23. 2 3
      doc/chapters/using.texi
  24. 1 1
      doc/chapters/vector_scal_cpu.texi
  25. 7 2
      doc/starpu.texi
  26. 2 3
      examples/basic_examples/vector_scal.c
  27. 1 1
      examples/basic_examples/vector_scal_c.c
  28. 1 1
      examples/basic_examples/vector_scal_cpu.c
  29. 1 1
      examples/basic_examples/vector_scal_cpu_icc.icc
  30. 1 1
      examples/basic_examples/vector_scal_cpu_template.h
  31. 1 1
      examples/basic_examples/vector_scal_cuda.cu
  32. 1 1
      examples/basic_examples/vector_scal_opencl.c
  33. 16 4
      examples/cholesky/cholesky.h
  34. 10 6
      examples/cholesky/cholesky_grain_tag.c
  35. 18 10
      examples/cholesky/cholesky_implicit.c
  36. 10 6
      examples/cholesky/cholesky_tag.c
  37. 8 6
      examples/cholesky/cholesky_tile_tag.c
  38. 37 107
      examples/filters/custom_mf/custom_interface.c
  39. 34 20
      examples/interface/complex.c
  40. 10 6
      examples/interface/complex_codelet.h
  41. 21 73
      examples/interface/complex_interface.c
  42. 19 11
      examples/matvecmult/matvecmult.c
  43. 7 5
      examples/mult/xgemm.c
  44. 1 1
      examples/openmp/vector_scal.c
  45. 1 1
      examples/spmd/vector_scal_spmd.c
  46. 5 3
      examples/stencil/stencil.c
  47. 6 2
      include/starpu.h
  48. 4 0
      include/starpu_config.h.in
  49. 1 0
      include/starpu_data.h
  50. 5 0
      include/starpu_data_interfaces.h
  51. 1 1
      include/starpu_perfmodel.h
  52. 5 5
      include/starpu_scheduler.h
  53. 3 3
      include/starpu_util.h
  54. 21 9
      mpi/examples/Makefile.am
  55. 14 14
      mpi/examples/cholesky/mpi_cholesky.c
  56. 7 7
      mpi/examples/cholesky/mpi_cholesky.h
  57. 2 2
      mpi/examples/cholesky/mpi_cholesky_codelets.c
  58. 13 13
      mpi/examples/cholesky/mpi_cholesky_distributed.c
  59. 1 1
      mpi/examples/cholesky/mpi_cholesky_kernels.c
  60. 44 10
      mpi/examples/complex/mpi_complex.c
  61. 13 13
      mpi/examples/mpi_lu/plu_example.c
  62. 54 55
      mpi/examples/mpi_lu/plu_solve.c
  63. 1 1
      mpi/examples/mpi_lu/pxlu.c
  64. 12 14
      mpi/examples/mpi_lu/pxlu_kernels.c
  65. 61 61
      mpi/examples/stencil/stencil5.c
  66. 5 1
      mpi/include/starpu_mpi.h
  67. 181 152
      mpi/src/starpu_mpi.c
  68. 5 7
      mpi/src/starpu_mpi_collective.c
  69. 65 17
      mpi/src/starpu_mpi_datatype.c
  70. 2 1
      mpi/src/starpu_mpi_datatype.h
  71. 118 76
      mpi/src/starpu_mpi_insert_task.c
  72. 2 2
      mpi/src/starpu_mpi_insert_task.h
  73. 3 3
      mpi/src/starpu_mpi_private.h
  74. 32 9
      mpi/tests/Makefile.am
  75. 6 1
      mpi/tests/block_interface.c
  76. 7 1
      mpi/tests/block_interface_pinned.c
  77. 60 58
      mpi/tests/insert_task.c
  78. 72 70
      mpi/tests/insert_task_block.c
  79. 18 13
      mpi/tests/insert_task_cache.c
  80. 52 49
      mpi/tests/insert_task_owner.c
  81. 48 47
      mpi/tests/insert_task_owner2.c
  82. 26 21
      mpi/tests/insert_task_owner_data.c
  83. 6 2
      mpi/tests/mpi_detached_tag.c
  84. 6 2
      mpi/tests/mpi_irecv.c
  85. 9 5
      mpi/tests/mpi_irecv_detached.c
  86. 6 2
      mpi/tests/mpi_isend.c
  87. 21 13
      mpi/tests/mpi_isend_detached.c
  88. 12 12
      mpi/tests/mpi_reduction.c
  89. 15 15
      mpi/tests/mpi_scatter_gather.c
  90. 8 3
      mpi/tests/mpi_test.c
  91. 41 36
      mpi/tests/multiple_send.c
  92. 10 3
      mpi/tests/pingpong.c
  93. 2 1
      mpi/tests/ring.c
  94. 2 1
      mpi/tests/ring_async.c
  95. 4 3
      mpi/tests/ring_async_implicit.c
  96. 97 0
      mpi/tests/user_defined_datatype.c
  97. 1 0
      socl/examples/Makefile.am
  98. 3 3
      socl/examples/clinfo/clinfo.c
  99. 54 59
      socl/src/cl_createbuffer.c
  100. 0 0
      socl/src/cl_enqueuendrangekernel.c

+ 3 - 1
.gitignore

@@ -4,6 +4,7 @@
 /autom4te.cache
 /autom4te.cache
 /libtool
 /libtool
 /aclocal.m4
 /aclocal.m4
+/build
 /build-aux
 /build-aux
 /GPATH
 /GPATH
 /GRTAGS
 /GRTAGS
@@ -28,7 +29,7 @@ starpu.log
 /tests/datawizard/handle_to_pointer
 /tests/datawizard/handle_to_pointer
 /tests/datawizard/data_lookup
 /tests/datawizard/data_lookup
 /doc/stamp-vti
 /doc/stamp-vti
-/doc/version.texi
+/doc/chapters/version.texi
 /examples/basic_examples/block
 /examples/basic_examples/block
 /examples/basic_examples/hello_world
 /examples/basic_examples/hello_world
 /examples/basic_examples/mult
 /examples/basic_examples/mult
@@ -289,3 +290,4 @@ starpu.log
 /gcc-plugin/tests/opencl
 /gcc-plugin/tests/opencl
 /gcc-plugin/tests/registered
 /gcc-plugin/tests/registered
 /gcc-plugin/tests/warn-unregistered
 /gcc-plugin/tests/warn-unregistered
+/cyclomatic-complexity.html

+ 9 - 11
AUTHORS

@@ -1,19 +1,17 @@
 Cédric Augonnet <cedric.augonnet@inria.fr>
 Cédric Augonnet <cedric.augonnet@inria.fr>
-Nicolas Collin <nicolas.collin@inria.fr>
+William Braik <wbraik@gmail.com>
 Jérôme Clet-Ortega <jerome.clet-ortega@labri.fr>
 Jérôme Clet-Ortega <jerome.clet-ortega@labri.fr>
 Nicolas Collin <nicolas.collin@inria.fr>
 Nicolas Collin <nicolas.collin@inria.fr>
+Yann Courtois <yann.courtois33@gmail.com>
+Jean-Marie Couteyen <jm.couteyen@gmail.com>
 Nathalie Furmento <nathalie.furmento@labri.fr>
 Nathalie Furmento <nathalie.furmento@labri.fr>
+David Gómez <david_gomez1380@yahoo.com.mx>
 Sylvain Henry <sylvain.henry@inria.fr>
 Sylvain Henry <sylvain.henry@inria.fr>
+Mehdi Juhoor <mjuhoor@gmail.com>
+Antoine Lucas <antoine.lucas.33@gmail.com>
+Nguyen Quôc-Dinh <nguyen.quocdinh@gmail.com>
 Cyril Roelandt <cyril.roelandt@inria.fr>
 Cyril Roelandt <cyril.roelandt@inria.fr>
+Anthony Roy <theanthony33@gmail.com>
 François Tessier <francois.tessier@inria.fr>
 François Tessier <francois.tessier@inria.fr>
 Samuel Thibault <samuel.thibault@labri.fr>
 Samuel Thibault <samuel.thibault@labri.fr>
-Pierre André Wacrenier <wacrenier@labri.fr>
-William Braik <wbraik@gmail.com>
-Yann Courtois <yann.courtois33@gmail.com>
-Jean-Marie Couteyen <jm.couteyen@gmail.com>
-Mehdi Juhoor <mjuhoor@gmail.com>
-Anthony Roy <theanthony33@gmail.com>
-David Gómez <david_gomez1380@yahoo.com.mx>
-Nguyen Quôc Dinh <nguyen.quocdinh@gmail.com>
-Antoine Lucas <antoine.lucas.33@gmail.com>
-
+Pierre-André Wacrenier <wacrenier@labri.fr>

+ 6 - 1
ChangeLog

@@ -56,6 +56,11 @@ New features:
         - When exchanging user-defined data interfaces, the size of
         - When exchanging user-defined data interfaces, the size of
 	  the data is the size returned by the pack operation, i.e
 	  the data is the size returned by the pack operation, i.e
 	  data with dynamic size can now be exchanged with StarPU-MPI.
 	  data with dynamic size can now be exchanged with StarPU-MPI.
+  * Add experimental simgrid support, to simulate execution with various
+    number of CPUs, GPUs, amount of memory, etc.
+  * Add support for OpenCL simulators (which provide simulated execution time)
+  * Add support for Temanejo, a task graph debugger
+  * Theoretical bound lp output now includes data transfer time.
 
 
 Changes:
 Changes:
   * Fix the block filter functions.
   * Fix the block filter functions.
@@ -80,6 +85,7 @@ Changes:
   * Cell:
   * Cell:
     - It is no longer possible to enable the cell support via the
     - It is no longer possible to enable the cell support via the
       gordon driver
       gordon driver
+  * Fix data transfer arrows in paje traces
 
 
 Small changes:
 Small changes:
   * STARPU_NCPU should now be used instead of STARPU_NCPUS. STARPU_NCPUS is
   * STARPU_NCPU should now be used instead of STARPU_NCPUS. STARPU_NCPUS is
@@ -91,7 +97,6 @@ Small changes:
   * Fix forcing calibration of never-calibrated archs.
   * Fix forcing calibration of never-calibrated archs.
   * CUDA applications are no longer compiled with the "-arch sm_13"
   * CUDA applications are no longer compiled with the "-arch sm_13"
     option. It is specifically added to applications which need it.
     option. It is specifically added to applications which need it.
-  * Documentation is not built if necessary tools are missing
 
 
 StarPU 1.0.3 (svn revision 7379)
 StarPU 1.0.3 (svn revision 7379)
 ==============================================
 ==============================================

+ 24 - 0
Makefile.am

@@ -124,3 +124,27 @@ showcheck:
 
 
 ctags-local:
 ctags-local:
 	$(CTAGS) -R -I LIST_TYPE
 	$(CTAGS) -R -I LIST_TYPE
+
+
+# Cyclomatic complexity reports.
+
+# The pmccabe tool, see <http://www.parisc-linux.org/~bame/pmccabe/>.
+PMCCABE = pmccabe
+
+VC_URL = "https://gforge.inria.fr/scm/viewvc.php/trunk/%FILENAME%?view=markup&root=starpu"
+
+# Generate a cyclomatic complexity report.  Note that examples and tests are
+# excluded because they're not particularly relevant, and more importantly
+# they all have a function called `main', which clobbers the report.
+cyclomatic-complexity.html:
+	$(PMCCABE)								\
+	  `find \( -name examples -o -name tests -o -path ./tools/dev/experimental \) -prune -o -name \*.c` \
+	  | sort -nr								\
+	  | $(AWK) -f ${top_srcdir}/build-aux/pmccabe2html			\
+		   -v lang=html -v name="$(PACKAGE_NAME)"			\
+		   -v vcurl=$(VC_URL)						\
+		   -v url="$(PACKAGE_URL)"					\
+		   -v css=${top_srcdir}/build-aux/pmccabe.css			\
+		   -v cut_dir=${top_srcdir}/					\
+		   > $@-tmp
+	mv $@-tmp $@

+ 5 - 3
README

@@ -31,7 +31,7 @@ executed as efficiently as possible.
 +------------------------
 +------------------------
 | I.b. What StarPU is not
 | I.b. What StarPU is not
 
 
-StarPU is not a new language, and it does not extends existing languages either.
+StarPU is not a new language, and it does not extend existing languages either.
 StarPU does not help to write computation kernels.
 StarPU does not help to write computation kernels.
 
 
 +---------------------------------
 +---------------------------------
@@ -76,11 +76,13 @@ advantage of their specificities in a portable fashion.
    units according to the machine topology. For more details on hwloc, see
    units according to the machine topology. For more details on hwloc, see
    http://www.open-mpi.org/projects/hwloc/ .
    http://www.open-mpi.org/projects/hwloc/ .
 
 
- * To build the StarPU-Top tool the following are also required:
-   * libqt4 >= 4.7
+ * To build the StarPU-Top tool the following packages (along with
+   their development files) are also required:
+   * libqt4-dev >= 4.7
    * libqt4-network
    * libqt4-network
    * libqt4-opengl
    * libqt4-opengl
    * libqt4-sql
    * libqt4-sql
+   * qt4-qmake
 
 
 ++=====================++
 ++=====================++
 || III. Getting StarPU ||
 || III. Getting StarPU ||

+ 159 - 0
build-aux/pmccabe.css

@@ -0,0 +1,159 @@
+body {
+    font-family: Helvetica, sans-serif;
+}
+
+.page_title {
+    font: 18pt Georgia, serif;
+    color: darkred;
+}
+
+.section_title {
+    font: 14pt Georgia, serif;
+    color: darkred;
+}
+
+.report_timestamp {
+    color: darkred;
+    font-weight: bold;
+}
+
+.function_src {
+    text-align: left;
+    background: white;
+}
+
+.resume_table {
+}
+
+.resume_header_entry {
+    color: black;
+}
+
+.resume_number_entry {
+    color: darkred;
+    font-weight: bold;
+    text-align: right;
+}
+
+.ranges_table {
+    border-spacing: 0px;
+    border-bottom: solid 2px black;
+    border-top: solid 2px black;
+    border-left: solid 2px black;
+    border-right: solid 2px black;
+}
+
+.ranges_header_entry {
+    padding: 5px;
+    border-bottom: solid 1px black;
+    font-size: 1em;
+    font-weight: bold;
+    color: darkred;
+    text-align: left;
+}
+
+.ranges_entry {
+}
+
+.ranges_entry_simple {
+    background: #87ff75;
+}
+
+.ranges_entry_moderate {
+    background: #fffc60;
+}
+
+.ranges_entry_high {
+    background: #ff5a5d;
+}
+
+.ranges_entry_untestable {
+    background: #993300
+}
+
+
+.function_table {
+    border-spacing: 0px;
+    border-bottom: solid 2px black;
+    border-top: solid 2px black;
+    border-left: solid 2px black;
+    border-right: solid 2px black;
+}
+
+.function_table_caption {
+    font-size: 1.1em;
+    font-weight: bold;
+    color: black;
+    padding: 5px;
+}
+
+.function_table_header {
+}
+
+
+.function_table_header_entry {
+    padding: 5px;
+    border-bottom: solid 1px black;
+    font-size: 1em;
+    font-weight: bold;
+    color: darkred;
+    text-align: left;
+}
+
+.function_entry {
+}
+
+
+.function_entry_simple {
+    background: #87ff75;
+}
+
+.function_entry_moderate {
+    background: #fffc60;
+}
+
+.function_entry_high {
+    background: #ff5a5d;
+}
+
+.function_entry_untestable {
+    background: #993300
+}
+
+
+.function_entry_name {
+    font-size: 1em;
+    text-align: left;
+    font-weight: bold;
+    text-valign: top;
+
+    border-top: solid 1px black;
+    padding: 3px;
+}
+
+.function_entry_cyclo {
+    font-size: 1em;
+    text-align: right;
+    text-valign: top;
+
+    border-top: solid 1px black;
+    padding: 3px;
+}
+
+.function_entry_number {
+    font-size: 1em;
+    text-align: right;
+    text-valign: top;
+
+    border-top: solid 1px black;
+    padding: 3px;
+}
+
+.function_entry_filename {
+    font-size: 1em;
+    text-align: left;
+    text-valign: top;
+
+    border-top: solid 1px black;
+    padding: 3px;
+}

+ 907 - 0
build-aux/pmccabe2html

@@ -0,0 +1,907 @@
+# pmccabe2html - AWK script to convert pmccabe output to html       -*- awk -*-
+
+# Copyright (C) 2007-2012 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# Written by Jose E. Marchesi <jemarch@gnu.org>.
+# Adapted for gnulib by Simon Josefsson <simon@josefsson.org>.
+# Added support for C++ by Giuseppe Scrivano <gscrivano@gnu.org>.
+
+# Typical Invocation is from a Makefile.am:
+#
+# CYCLO_SOURCES = ${top_srcdir}/src/*.[ch]
+#
+# cyclo-$(PACKAGE).html: $(CYCLO_SOURCES)
+# 	$(PMCCABE) $(CYCLO_SOURCES) \
+# 		| sort -nr \
+# 		| $(AWK) -f ${top_srcdir}/build-aux/pmccabe2html \
+# 			-v lang=html -v name="$(PACKAGE_NAME)" \
+# 			-v vcurl="http://git.savannah.gnu.org/gitweb/?p=$(PACKAGE).git;a=blob;f=%FILENAME%;hb=HEAD" \
+# 			-v url="http://www.gnu.org/software/$(PACKAGE)/" \
+# 			-v css=${top_srcdir}/build-aux/pmccabe.css \
+# 			-v cut_dir=${top_srcdir}/ \
+# 			> $@-tmp
+# 	mv $@-tmp $@
+#
+# The variables available are:
+#   lang     output language, either 'html' or 'wiki'
+#   name     project name
+#   url      link to project's home page
+#   vcurl    URL to version controlled source code browser,
+#            a %FILENAME% in the string is replaced with the relative
+#            source filename
+#   css      CSS stylesheet filename, included verbatim in HTML output
+#   css_url  link to CSS stylesheet, an URL
+
+# Prologue & configuration
+BEGIN {
+    section_global_stats_p = 1
+    section_function_cyclo_p = 1
+
+    # "html" or "wiki"
+    package_name = name
+    output_lang = lang
+
+    # General Options
+    cyclo_simple_max = 10
+    cyclo_moderate_max = 20
+    cyclo_high_max = 50
+    source_file_link_tmpl = vcurl
+
+    # HTML options
+    if (url != "")
+    {
+	html_prolog = "<a href=\"" url "\">Back to " package_name " Homepage</a><br/><br/>"
+    }
+    html_epilog = "<hr color=\"black\" size=\"2\"/> \
+Copyright (c) 2007, 2008 Free Software Foundation, Inc."
+    html_doctype = "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \
+\"http://www.w3.org/TR/html401/loose.dtd\">"
+    html_comment = "<!-- Generated by gnulib's pmccabe2html at " systime() " -->"
+    html_title = "Cyclomatic Complexity report for " package_name
+
+    # Wiki options
+    wiki_prolog = "{{Note|This page has been automatically generated}}"
+    wiki_epilog = ""
+
+    # Internal variables
+    nfuncs = 0;
+}
+
+# Functions
+
+function build_stats()
+{
+    # Maximum modified cyclo
+    for (fcn in mcyclo)
+    {
+        num_of_functions++
+        if (mcyclo[fcn] > max_mcyclo)
+        {
+            max_mcyclo = mcyclo[fcn]
+        }
+
+        if (mcyclo[fcn] > cyclo_high_max)
+        {
+            num_of_untestable_functions++
+        }
+        else if (mcyclo[fcn] > cyclo_moderate_max)
+        {
+            num_of_high_functions++
+        }
+        else if (mcyclo[fcn] > cyclo_simple_max)
+        {
+            num_of_moderate_functions++
+        }
+        else
+        {
+            num_of_simple_functions++
+        }
+    }
+}
+
+function html_fnc_table_complete (caption)
+{
+    html_fnc_table(caption, 1, 1, 0, 1, 1, 0, 1)
+}
+
+function html_fnc_table_abbrev (caption)
+{
+    html_fnc_table(caption, 1, 1, 0, 0, 1, 0, 0)
+}
+
+
+function html_fnc_table (caption,
+                         fname_p,
+                         mcyclo_p,
+                         cyclo_p,
+                         num_statements_p,
+                         num_lines_p,
+                         first_line_p,
+                         file_p)
+{
+    print "<table width=\"90%\" class=\"function_table\" cellpadding=\"0\" cellspacing=\"0\">"
+    if (caption != "")
+    {
+        print "<caption class=\"function_table_caption\">" caption "</caption>"
+    }
+    html_fnc_header(fname_p,
+                    mcyclo_p,
+                    cyclo_p,
+                    num_statements_p,
+                    num_lines_p,
+                    first_line_p,
+                    file_p)
+    for (nfnc = 1; nfnc <= nfuncs; nfnc++)
+    {
+        html_fnc(nfnc,
+                 fname_p,
+                 mcyclo_p,
+                 cyclo_p,
+                 num_statements_p,
+                 num_lines_p,
+                 first_line_p,
+                 file_p)
+    }
+    print "</table>"
+}
+
+function html_header ()
+{
+    print html_doctype
+    print "<html>"
+    print html_comment
+    print "<head>"
+    print "<title>" html_title "</title>"
+    print ""
+    print "<meta name=\"description\" content=\"" html_title "\">"
+    print "<meta name=\"keywords\" content=\"" html_title "\">"
+    print "<meta name=\"resource-type\" content=\"document\">"
+    print "<meta name=\"distribution\" content=\"global\">"
+    print "<meta name=\"Generator\" content=\"pmccabe2html\">"
+    print "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">"
+    print "<script language=\"javascript\" type=\"text/javascript\">"
+    print "function show_hide(idCapa, idButton, fuerzaVisibilidad)\
+{\
+        var button = document.getElementById(idButton);\
+	var capa = document.getElementById(idCapa);\
+	if (capa)\
+	{\
+		if (fuerzaVisibilidad && fuerzaVisibilidad!=\"\") {\
+			if (fuerzaVisibilidad==\"visible\") capa.style.display=\"\";\
+			else capa.style.display=\"none\";\
+		}\
+		else\
+		{\
+			if (capa.style.display == \"none\") {\
+				capa.style.display = \"\";\
+                                button.innerHTML = \"&uarr;\";\
+			} else {\
+				capa.style.display = \"none\";\
+                                button.innerHTML = \"&darr;\";     \
+			}\
+		}\
+	}\
+}"
+    print "</script>"
+
+
+    if (css_url != "")
+    {
+        print "<link rel=\"stylesheet\" href=\"" css_url "\" type =\"text/css\" media=\"screen\"/>"
+    }
+    if (css != "")
+    {
+        print "<style type =\"text/css\" media=\"screen\">"
+	print "<!--"
+        while ((getline cssline < css) > 0)
+        {
+	    print cssline
+	}
+        print "-->"
+	print "</style />"
+	close(css)
+    }
+    print "</head>"
+    print "<body lang=\"en\" bgcolor=\"#FFFFFF\" text=\"#000000\" link=\"#0000FF\" \
+vlink=\"#800080\" alink=\"#FF0000\">"
+}
+
+function html_footer ()
+{
+    print "</body>"
+    print "</html>"
+}
+
+function html_fnc_header (fname_p,
+                          mcyclo_p,
+                          cyclo_p,
+                          num_statements_p,
+                          num_lines_p,
+                          first_line_p,
+                          file_p)
+{
+    print "<tr class=\"function_table_header\">"
+    if (fname_p)
+    {
+        # Function name
+        print "<td class=\"function_table_header_entry\">"
+        print ""
+        print "</td>"
+
+        print "<td class=\"function_table_header_entry\">"
+        print "Function Name"
+        print "</td>"
+    }
+    if (mcyclo_p)
+    {
+        # Modified cyclo
+        print "<td class=\"function_table_header_entry\">"
+        print "Modified Cyclo"
+        print "</td>"
+    }
+    if (cyclo_p)
+    {
+        # Cyclo
+        print "<td class=\"function_table_header_entry\">"
+        print "Cyclomatic"
+        print "<br/>"
+        print "Complexity"
+        print "</td>"
+    }
+    if (num_statements_p)
+    {
+        print "<td class=\"function_table_header_entry\">"
+        print "Number of"
+        print "<br/>"
+        print "Statements"
+        print "</td>"
+    }
+    if (num_lines_p)
+    {
+        print "<td class=\"function_table_header_entry\">"
+        print "Number of"
+        print "<br/>"
+        print "Lines"
+        print "</td>"
+    }
+    if (first_line_p)
+    {
+        print "<td class=\"function_table_header_entry\">"
+        print "First Line"
+        print "</td>"
+    }
+    if (file_p)
+    {
+        print "<td class=\"function_table_header_entry\">"
+        print "Source File"
+        print "</td>"
+
+    }
+    print "</tr>"
+}
+
+function html_fnc (nfun,
+                   fname_p,
+                   mcyclo_p,
+                   cyclo_p,
+                   num_statements_p,
+                   num_lines_p,
+                   first_line_p,
+                   file_p)
+{
+    fname = fnames[nfun]
+
+    # Function name
+    trclass = "function_entry_simple"
+    if (mcyclo[nfun] > cyclo_high_max)
+    {
+        trclass="function_entry_untestable"
+    }
+    else if (mcyclo[nfun] > cyclo_moderate_max)
+    {
+        trclass="function_entry_high"
+    }
+    else if (mcyclo[nfun] > cyclo_simple_max)
+    {
+        trclass="function_entry_moderate"
+    }
+
+    print "<tr class=\"" trclass "\">"
+    if (fname_p)
+    {
+        print "<td class=\"function_entry_filename\">"
+        if (file_p && mcyclo[nfun] > cyclo_simple_max)
+        {
+            print "<a href=\"javascript:void(0);\" title=\"show/hide function source\" onClick=\"javascript:show_hide('" fname "_src', '" fname "_button')\">\
+<span id=\"" fname "_button\">&darr;</span></a>"
+        }
+        else
+        {
+            print "&nbsp;"
+        }
+        print "</td>"
+
+        print "<td class=\"function_entry_name\">"
+        print fname
+        print "</td>"
+    }
+    if (mcyclo_p)
+    {
+        # Modified cyclo
+        print "<td class=\"function_entry_cyclo\">"
+        print mcyclo[nfun]
+        print "</td>"
+    }
+    if (cyclo_p)
+    {
+        # Cyclo
+        print "<td class=\"function_entry_cyclo\">"
+        print cyclo[nfun]
+        print "</td>"
+    }
+    if (num_statements_p)
+    {
+        # Number of statements
+        print "<td class=\"function_entry_number\">"
+        print num_statements[nfun]
+        print "</td>"
+    }
+    if (num_lines_p)
+    {
+        # Number of lines
+        print "<td class=\"function_entry_number\">"
+        print num_lines[nfun]
+        print "</td>"
+    }
+    if (first_line_p)
+    {
+        # First line
+        print "<td class=\"function_entry_number\">"
+        print first_line[nfun]
+        print "</td>"
+    }
+    if (file_p)
+    {
+        href = ""
+        if (source_file_link_tmpl != "")
+        {
+            # Get href target
+            href = source_file_link_tmpl
+            sub(/%FILENAME%/, file[nfun], href)
+        }
+
+        # Source file
+        print "<td class=\"function_entry_filename\">"
+        if (href != "")
+        {
+            print "<a href=\"" href "\">" file[nfun] "</a>"
+        }
+        else
+        {
+            print file[nfun]
+        }
+
+        print "</td>"
+
+
+        print "</tr>"
+
+        if (mcyclo[nfun] > cyclo_simple_max)
+        {
+            print "<tr>"
+
+            num_columns = 1;
+            if (fname_p) { num_columns++ }
+            if (mcyclo_p) { num_columns++ }
+            if (cyclo_p) { num_columns++ }
+            if (num_statements_p) { num_columns++ }
+            if (num_lines_p) { num_columns++ }
+            if (first_line_p) { num_columns++ }
+            if (file_p) { num_columns++ }
+
+            print "<td colspan=\"" num_columns "\" height=\"0\">"
+            print "<div id=\"" fname "_src\" class=\"function_src\" style=\"position: relative; display: none;\">"
+            print "<pre class=\"function_src\">"
+
+            while ((getline codeline < (fname nfun "_fn.txt")) > 0)
+            {
+                sub(/\\</, "&lt;", codeline)
+                sub(/\\>/, "&gt;", codeline)
+                sub(/&/, "&amp;", codeline)
+
+                print codeline
+            }
+            close(fname nfun "_fn.txt")
+            system("rm " "'" fname "'" nfun "_fn.txt")
+            print "</pre>"
+            print "</div>"
+            print "</td>"
+            print "</tr>"
+        }
+
+    }
+}
+
+function html_global_stats ()
+{
+    print "<div class=\"section_title\">Summary</div>"
+
+    print "<table class=\"summary_table\">"
+    # Total number of functions
+    print "<tr>"
+    print "<td class=\"summary_header_entry\">"
+    print "Total number of functions"
+    print "</td>"
+    print "<td class=\"summary_number_entry\">"
+    print num_of_functions
+    print "</td>"
+    print "</tr>"
+    # Number of simple functions
+    print "<tr>"
+    print "<td class=\"summary_header_entry\">"
+    print "Number of low risk functions"
+    print "</td>"
+    print "<td class=\"summary_number_entry\">"
+    print num_of_simple_functions
+    print "</td>"
+    print "</tr>"
+    # Number of moderate functions
+    print "<tr>"
+    print "<td class=\"summary_header_entry\">"
+    print "Number of moderate risk functions"
+    print "</td>"
+    print "<td class=\"summary_number_entry\">"
+    print num_of_moderate_functions
+    print "</td>"
+    print "</tr>"
+    # Number of high functions
+    print "<tr>"
+    print "<td class=\"summary_header_entry\">"
+    print "Number of high risk functions"
+    print "</td>"
+    print "<td class=\"summary_number_entry\">"
+    print num_of_high_functions
+    print "</td>"
+    print "</tr>"
+    # Number of untestable functions
+    print "<tr>"
+    print "<td class=\"summary_header_entry\">"
+    print "Number of untestable functions"
+    print "</td>"
+    print "<td class=\"summary_number_entry\">"
+    print num_of_untestable_functions
+    print "</td>"
+    print "</tr>"
+    print "</table>"
+    print "<br/>"
+}
+
+function html_function_cyclo ()
+{
+    print "<div class=\"section_title\">Details for all functions</div>"
+
+    print "<table class=\"ranges_table\">"
+    print "<tr>"
+    print "<td class=\"ranges_header_entry\">"
+    print "&nbsp;"
+    print "</td>"
+    print "<td class=\"ranges_header_entry\">"
+    print "Cyclomatic Complexity"
+    print "</td>"
+    print "<td class=\"ranges_header_entry\">"
+    print "Risk Evaluation"
+    print "</td>"
+    print "</tr>"
+    # Simple
+    print "<tr>"
+    print "<td class=\"ranges_entry_simple\">"
+    print "&nbsp;"
+    print "</td>"
+    print "<td class=\"ranges_entry\">"
+    print "0 - " cyclo_simple_max
+    print "</td>"
+    print "<td class=\"ranges_entry\">"
+    print "Simple module, without much risk"
+    print "</td>"
+    print "</tr>"
+    # Moderate
+    print "<tr>"
+    print "<td class=\"ranges_entry_moderate\">"
+    print "&nbsp;"
+    print "</td>"
+    print "<td class=\"ranges_entry\">"
+    print cyclo_simple_max + 1 " - " cyclo_moderate_max
+    print "</td>"
+    print "<td class=\"ranges_entry\">"
+    print "More complex module, moderate risk"
+    print "</td>"
+    print "</tr>"
+    # High
+    print "<tr>"
+    print "<td class=\"ranges_entry_high\">"
+    print "&nbsp;"
+    print "</td>"
+    print "<td class=\"ranges_entry\">"
+    print cyclo_moderate_max + 1 " - " cyclo_high_max
+    print "</td>"
+    print "<td class=\"ranges_entry\">"
+    print "Complex module, high risk"
+    print "</td>"
+    print "</tr>"
+    # Untestable
+    print "<tr>"
+    print "<td class=\"ranges_entry_untestable\">"
+    print "&nbsp;"
+    print "</td>"
+    print "<td class=\"ranges_entry\">"
+    print "greater than " cyclo_high_max
+    print "</td>"
+    print "<td class=\"ranges_entry\">"
+    print "Untestable module, very high risk"
+    print "</td>"
+    print "</tr>"
+    print "</table>"
+    print "<br/>"
+    html_fnc_table_complete("")
+}
+
+function wiki_global_stats ()
+{
+    print "{| class=\"cyclo_summary_table\""
+    # Total number of functions
+    print "|-"
+    print "| class=\"cyclo_summary_header_entry\" | Total number of functions"
+    print "| class=\"cyclo_summary_number_entry\" |" num_of_functions
+    # Number of simple functions
+    print "|-"
+    print "| class=\"cyclo_summary_header_entry\" | Number of low risk functions"
+    print "| class=\"cyclo_summary_number_entry\" |" num_of_simple_functions
+    # Number of moderate functions
+    print "|-"
+    print "| class=\"cyclo_summary_header_entry\" | Number of moderate risk functions"
+    print "| class=\"cyclo_summary_number_entry\" |" num_of_moderate_functions
+    # Number of high functions
+    print "|-"
+    print "| class=\"cyclo_summary_header_entry\" | Number of high risk functions"
+    print "| class=\"cyclo_summary_number_entry\" |" num_of_high_functions
+    # Number of untestable functions
+    print "|-"
+    print "| class=\"cyclo_summary_header_entry\" | Number of untestable functions"
+    print "| class=\"cyclo_summary_number_entry\" |" num_of_untestable_functions
+    print "|}"
+}
+
+function wiki_function_cyclo ()
+{
+    print "==Details for all functions=="
+
+    print "Used ranges:"
+
+    print "{| class =\"cyclo_ranges_table\""
+    print "|-"
+    print "| class=\"cyclo_ranges_header_entry\" | "
+    print "| class=\"cyclo_ranges_header_entry\" | Cyclomatic Complexity"
+    print "| class=\"cyclo_ranges_header_entry\" | Risk Evaluation"
+    # Simple
+    print "|-"
+    print "| class=\"cyclo_ranges_entry_simple\" | "
+    print "| class=\"cyclo_ranges_entry\" | 0 - " cyclo_simple_max
+    print "| class=\"cyclo_ranges_entry\" | Simple module, without much risk"
+    # Moderate
+    print "|-"
+    print "| class=\"cyclo_ranges_entry_moderate\" | "
+    print "| class=\"cyclo_ranges_entry\" |" cyclo_simple_max + 1 " - " cyclo_moderate_max
+    print "| class=\"cyclo_ranges_entry\" | More complex module, moderate risk"
+    # High
+    print "|-"
+    print "| class=\"cyclo_ranges_entry_high\" | "
+    print "| class=\"cyclo_ranges_entry\" |" cyclo_moderate_max + 1 " - " cyclo_high_max
+    print "| class=\"cyclo_ranges_entry\" | Complex module, high risk"
+    # Untestable
+    print "|-"
+    print "| class=\"cyclo_ranges_entry_untestable\" | "
+    print "| class=\"cyclo_ranges_entry\" | greater than " cyclo_high_max
+    print "| class=\"cyclo_ranges_entry\" | Untestable module, very high risk"
+    print "|}"
+
+    print ""
+    print ""
+    wiki_fnc_table_complete("")
+}
+
+function wiki_fnc_table_complete (caption)
+{
+    wiki_fnc_table(caption, 1, 1, 0, 1, 1, 0, 1)
+}
+
+function wiki_fnc_table_abbrev (caption)
+{
+    wiki_fnc_table(caption, 1, 0, 0, 0, 0, 0, 0)
+}
+
+function wiki_fnc_table (caption,
+                         fname_p,
+                         mcyclo_p,
+                         cyclo_p,
+                         num_statements_p,
+                         num_lines_p,
+                         first_line_p,
+                         file_p)
+{
+    print "{| width=\"90%\" class=\"cyclo_function_table\" cellpadding=\"0\" cellspacing=\"0\">"
+    if (caption != "")
+    {
+        print "|+" caption
+    }
+    wiki_fnc_header(fname_p,
+                    mcyclo_p,
+                    cyclo_p,
+                    num_statements_p,
+                    num_lines_p,
+                    first_line_p,
+                    file_p)
+    for (nfnc = 1; nfnc <= nfuncs; nfnc++)
+    {
+        wiki_fnc(nfnc,
+                 fname_p,
+                 mcyclo_p,
+                 cyclo_p,
+                 num_statements_p,
+                 num_lines_p,
+                 first_line_p,
+                 file_p)
+    }
+    print "|}"
+}
+
+function wiki_fnc_header (fname_p,
+                          mcyclo_p,
+                          cyclo_p,
+                          num_statements_p,
+                          num_lines_p,
+                          first_line_p,
+                          file_p)
+{
+    if (fname_p)
+    {
+        # Function name
+        print "! class=\"cyclo_function_table_header_entry\" | Function Name"
+    }
+    if (mcyclo_p)
+    {
+        # Modified cyclo
+        print "! class=\"cyclo_function_table_header_entry\" | Modified Cyclo"
+    }
+    if (cyclo_p)
+    {
+        # Cyclo
+        print "! class=\"cyclo_function_table_header_entry\" | Cyclomatic Complexity"
+    }
+    if (num_statements_p)
+    {
+        print "! class=\"cyclo_function_table_header_entry\" | Number of Statements"
+    }
+    if (num_lines_p)
+    {
+        print "! class=\"cyclo_function_table_header_entry\" | Number of Lines"
+    }
+    if (first_line_p)
+    {
+        print "! class=\"cyclo_function_table_header_entry\" | First Line"
+    }
+    if (file_p)
+    {
+        print "! class=\"cyclo_function_table_header_entry\" | Source File"
+    }
+}
+
+function wiki_fnc (nfnc,
+                   fname_p,
+                   mcyclo_p,
+                   cyclo_p,
+                   num_statements_p,
+                   num_lines_p,
+                   first_line_p,
+                   file_p)
+{
+   fname = fnames[nfnc]
+
+    # Function name
+    trclass = "cyclo_function_entry_simple"
+    if (mcyclo[nfnc] > cyclo_high_max)
+    {
+        trclass="cyclo_function_entry_untestable"
+    }
+    else if (mcyclo[nfnc] > cyclo_moderate_max)
+    {
+        trclass="cyclo_function_entry_high"
+    }
+    else if (mcyclo[nfnc] > cyclo_simple_max)
+    {
+        trclass="cyclo_function_entry_moderate"
+    }
+
+    print "|- class=\"" trclass "\""
+    if (fname_p)
+    {
+        print "| class=\"cyclo_function_entry_name\" |" fname
+    }
+    if (mcyclo_p)
+    {
+        # Modified cyclo
+        print "| class=\"cyclo_function_entry_cyclo\" |" mcyclo[nfnc]
+    }
+    if (cyclo_p)
+    {
+        # Cyclo
+        print "| class=\"cyclo_function_entry_cyclo\" |" cyclo[nfnc]
+    }
+    if (num_statements_p)
+    {
+        # Number of statements
+        print "| class=\"cyclo_function_entry_number\" |" num_statements[nfnc]
+    }
+    if (num_lines_p)
+    {
+        # Number of lines
+        print "| class=\"cyclo_function_entry_number\" |" num_lines[nfnc]
+    }
+    if (first_line_p)
+    {
+        # First line
+        print "| class=\"cyclo_function_entry_number\" |" first_line[nfnc]
+    }
+    if (file_p)
+    {
+        href = ""
+        if (source_file_link_tmpl != "")
+        {
+            # Get href target
+            href = source_file_link_tmpl
+            sub(/%FILENAME%/, file[nfnc], href)
+        }
+
+        # Source file
+        print "| class=\"cyclo_function_entry_filename\" |" \
+            ((href != "") ? "[" href " " file[nfnc] "]" : "[" file[nfnc] "]")
+    }
+}
+
+# Scan data from a line
+{
+    function_name = $7
+
+    nfuncs++;
+    fnames[nfuncs] = function_name
+    mcyclo[nfuncs] = $1
+    cyclo[nfuncs] = $2
+    num_statements[nfuncs] = $3
+    first_line[nfuncs] = $4
+    num_lines[nfuncs] = $5
+
+    # Build the filename from the file_spec ($6)
+    begin_util_path = index($6, cut_dir)
+    tmpfilename = substr($6, begin_util_path + length(cut_dir))
+    sub(/\([0-9]+\):/, "", tmpfilename)
+    file[nfuncs] = tmpfilename
+
+    if (mcyclo[nfuncs] > cyclo_simple_max)
+    {
+        # Extract function contents to a fn_txt file
+        filepath = $6
+
+        sub(/\([0-9]+\):/, "", filepath)
+        num_line = 0
+
+        while ((getline codeline < filepath) > 0)
+        {
+            num_line++;
+            if ((num_line >= first_line[nfuncs]) &&
+                (num_line < first_line[nfuncs] + num_lines[nfuncs]))
+            {
+                print codeline > (function_name nfuncs "_fn.txt")
+            }
+        }
+        close (function_name nfuncs "_fn.txt")
+        close(filepath)
+    }
+
+    # Initial values for statistics variables
+    num_of_functions = 0
+    max_mcyclo = 0
+    max_function_length = 0
+    num_of_simple_functions = 0
+    num_of_moderate_functions = 0
+    num_of_high_functions = 0
+    num_of_untestable_functions = 0
+}
+
+# Epilogue
+END {
+    # Print header (only for html)
+    if (output_lang == "html")
+    {
+        html_header()
+    }
+
+    # Print prolog
+    if ((output_lang == "html") &&
+        (html_prolog != ""))
+    {
+        print html_prolog
+    }
+    if ((output_lang == "wiki") &&
+        (wiki_prolog != ""))
+    {
+        print wiki_prolog
+    }
+
+    if (output_lang == "html")
+    {
+        print "<div class=\"page_title\">" package_name " Cyclomatic Complexity Report</div>"
+        print "<p>Report generated at: <span class=\"report_timestamp\">" strftime() "</div></p>"
+    }
+    if (output_lang == "wiki")
+    {
+        print "==" package_name " Cyclomatic Complexity Report=="
+        print "Report generated at: '''" strftime() "'''"
+    }
+
+    if (section_global_stats_p)
+    {
+        build_stats()
+
+        if (output_lang == "html")
+        {
+            html_global_stats()
+        }
+        if (output_lang == "wiki")
+        {
+            wiki_global_stats()
+        }
+    }
+    if (section_function_cyclo_p)
+    {
+        if (output_lang == "html")
+        {
+            html_function_cyclo()
+        }
+        if (output_lang == "wiki")
+        {
+            wiki_function_cyclo()
+        }
+    }
+
+    # Print epilog
+    if ((output_lang == "html") &&
+        (html_epilog != ""))
+    {
+        print html_epilog
+    }
+    if ((output_lang == "wiki") &&
+        (wiki_epilog != ""))
+    {
+        print wiki_epilog
+    }
+
+    # Print footer (html only)
+    if (output_lang == "html")
+    {
+        html_footer()
+    }
+}
+
+# End of pmccabe2html

+ 81 - 32
configure.ac

@@ -16,7 +16,8 @@
 #
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 
-AC_INIT([StarPU],1.1.0, [starpu-devel@lists.gforge.inria.fr], starpu)
+AC_INIT([StarPU], [1.1.0], [starpu-devel@lists.gforge.inria.fr],
+  [starpu], [http://runtime.bordeaux.inria.fr/StarPU/])
 AC_CONFIG_SRCDIR(include/starpu.h)
 AC_CONFIG_SRCDIR(include/starpu.h)
 AC_CONFIG_AUX_DIR([build-aux])
 AC_CONFIG_AUX_DIR([build-aux])
 
 
@@ -188,6 +189,7 @@ AC_ARG_ENABLE(quick-check, [AS_HELP_STRING([--enable-quick-check],
 if  test x$enable_quick_check = xyes; then
 if  test x$enable_quick_check = xyes; then
 	AC_DEFINE(STARPU_QUICK_CHECK, [1], [enable quick check])
 	AC_DEFINE(STARPU_QUICK_CHECK, [1], [enable quick check])
 fi
 fi
+AM_CONDITIONAL([STARPU_QUICK_CHECK], [test "x$enable_quick_check" = "xyes"])
 
 
 AC_CHECK_HEADERS([malloc.h], [AC_DEFINE([STARPU_HAVE_MALLOC_H], [1], [Define to 1 if you have the <malloc.h> header file.])])
 AC_CHECK_HEADERS([malloc.h], [AC_DEFINE([STARPU_HAVE_MALLOC_H], [1], [Define to 1 if you have the <malloc.h> header file.])])
 
 
@@ -411,7 +413,12 @@ AC_DEFUN([STARPU_CHECK_CUDA],
 	CPPFLAGS="${SAVED_CPPFLAGS}"
 	CPPFLAGS="${SAVED_CPPFLAGS}"
 	unset STARPU_CUDA_LDFLAGS
 	unset STARPU_CUDA_LDFLAGS
     else
     else
-	if test x$starpu_windows != xyes ; then
+	# nvcc is a wrapper around GCC, and calls it with the -dumpspecs
+	# option, which is GCC specific. If $CC does not support -dumpspecs, we
+	# should let nvcc choose another compiler (by default, gcc, if it is
+	# installed). If gcc is not installed, the build will probably fail.
+	$CC -dumpspecs >/dev/null 2>&1
+	if test $? -eq 0 -a x$starpu_windows != xyes; then
 	    NVCCFLAGS="${NVCCFLAGS} -ccbin \${CC}"
 	    NVCCFLAGS="${NVCCFLAGS} -ccbin \${CC}"
 	fi
 	fi
 	if test "$__cuda_include_dir" != "no"; then
 	if test "$__cuda_include_dir" != "no"; then
@@ -803,8 +810,55 @@ AC_DEFINE_UNQUOTED(STARPU_MAXGORDONDEVS, [1], [maximum number of GORDON devices]
 #                                                                             #
 #                                                                             #
 ###############################################################################
 ###############################################################################
 
 
+AC_ARG_ENABLE(opencl-simulator, [AS_HELP_STRING([--enable-opencl-simulator],
+				[Enable the use of an OpenCL simulator])],
+				enable_opencl_simulator=$enableval, enable_opencl_simulator=no)
+if test x$enable_opencl_simulator = xyes; then
+	enable_simgrid=yes
+	AC_DEFINE(STARPU_OPENCL_SIMULATOR, 1, [Define this to enable using an OpenCL simulator])
+fi
+
+AC_ARG_ENABLE(simgrid, [AS_HELP_STRING([--enable-simgrid],
+			[Enable simulating execution in simgrid])],
+			enable_simgrid=$enableval, enable_simgrid=no)
+if test x$enable_simgrid = xyes ; then
+	OLD_CLAGS=$CFLAGS
+	OLD_LDFLAGS=$LDFLAGS
+	if test -n "$SIMGRID_CFLAGS" ; then
+		CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
+	fi
+	if test -n "$SIMGRID_LIBS" ; then
+		LDFLAGS="$SIMGRID_LIBS $LDFLAGS"
+	fi
+	AC_HAVE_LIBRARY([simgrid], [],
+		[
+			AC_MSG_ERROR(Simgrid support needs simgrid installed)
+		]
+	)
+	AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
+		    		[[#include <msg/msg.h>]],
+				[[msg_host_t foo; ]]
+			    )],
+	                 [],
+	                 [
+			   AC_MSG_ERROR(StarPU needs a version of Simgrid which defines the type msg_host_t (should be any version >= 3.8.1))
+		         ])
+	if test -z "$SIMGRID_LIBS" ; then
+		SIMGRID_LIBS=-lsimgrid
+	fi
+	CLAGS=$OLD_CFLAGS
+	LDFLAGS=$OLD_LDFLAGS
+	AC_DEFINE(STARPU_SIMGRID, 1, [Define this to enable simgrid execution])
+	# Avoid the starpu top thread compilation
+	enable_starpu_top=no
+	# We won't bind or detect anything
+	with_hwloc=no
+	# In simgrid, it's much better to let workers block than spinlock
+	enable_blocking=yes
+fi
+
 AC_MSG_CHECKING(whether blocking drivers should be disabled)
 AC_MSG_CHECKING(whether blocking drivers should be disabled)
-AC_ARG_ENABLE(blocking-drivers, [AS_HELP_STRING([--disable-blocking-drivers], [disable blocking drivers])],
+AC_ARG_ENABLE(blocking-drivers, [AS_HELP_STRING([--enable-blocking-drivers], [enable blocking drivers])],
 				enable_blocking=$enableval, enable_blocking=no)
 				enable_blocking=$enableval, enable_blocking=no)
 AC_MSG_RESULT($enable_blocking)
 AC_MSG_RESULT($enable_blocking)
 
 
@@ -826,7 +880,9 @@ AC_MSG_RESULT($enable_debug)
 
 
 if test x$enable_debug = xyes; then
 if test x$enable_debug = xyes; then
 	CFLAGS="$CFLAGS -O0"
 	CFLAGS="$CFLAGS -O0"
-	AC_DEFINE(STARPU_SPINLOCK_CHECK, [1], [check spinlock use])
+	if test x$enable_simgrid != xyes; then
+		AC_DEFINE(STARPU_SPINLOCK_CHECK, [1], [check spinlock use])
+	fi
 else
 else
 	CFLAGS="-O3 $CFLAGS"
 	CFLAGS="-O3 $CFLAGS"
 fi
 fi
@@ -841,16 +897,6 @@ if test x$enable_fast = xyes; then
 	AC_DEFINE(STARPU_NO_ASSERT, [1], [disable assertions])
 	AC_DEFINE(STARPU_NO_ASSERT, [1], [disable assertions])
 fi
 fi
 
 
-AC_MSG_CHECKING(whether memory status should be displayed)
-AC_ARG_ENABLE(memory-status, [AS_HELP_STRING([--enable-memory-status],
-			     [display memory status at the end of execution])],
-			     enable_memory_status=$enableval, enable_memory_status=no)
-AC_MSG_RESULT($enable_memory_status)
-if test x$enable_memory_status = xyes; then
-        AC_DEFINE(STARPU_MEMORY_STATUS, [1], [display memory status])
-fi
-
-
 AC_MSG_CHECKING(whether debug messages should be displayed)
 AC_MSG_CHECKING(whether debug messages should be displayed)
 AC_ARG_ENABLE(verbose, [AS_HELP_STRING([--enable-verbose],
 AC_ARG_ENABLE(verbose, [AS_HELP_STRING([--enable-verbose],
 			[display verbose debug messages])],
 			[display verbose debug messages])],
@@ -860,7 +906,6 @@ if test x$enable_verbose = xyes; then
 	AC_DEFINE(STARPU_VERBOSE, [1], [display verbose debug messages])
 	AC_DEFINE(STARPU_VERBOSE, [1], [display verbose debug messages])
 fi
 fi
 
 
-
 AC_MSG_CHECKING(whether coverage testing should be enabled)
 AC_MSG_CHECKING(whether coverage testing should be enabled)
 AC_ARG_ENABLE(coverage, [AS_HELP_STRING([--enable-coverage],
 AC_ARG_ENABLE(coverage, [AS_HELP_STRING([--enable-coverage],
 			[enable coverage checking])],
 			[enable coverage checking])],
@@ -873,7 +918,6 @@ if test x$enable_coverage = xyes; then
 	LDFLAGS="${LDFLAGS} --coverage"
 	LDFLAGS="${LDFLAGS} --coverage"
 fi
 fi
 
 
-
 # shall we use FxT to generate trace of the execution ?
 # shall we use FxT to generate trace of the execution ?
 AC_MSG_CHECKING(whether FxT traces should be generated)
 AC_MSG_CHECKING(whether FxT traces should be generated)
 AC_ARG_WITH(fxt, [AS_HELP_STRING([--with-fxt[=<dir>]], [generate fxt traces])],
 AC_ARG_WITH(fxt, [AS_HELP_STRING([--with-fxt[=<dir>]], [generate fxt traces])],
@@ -957,15 +1001,25 @@ AC_ARG_ENABLE(stats, [AS_HELP_STRING([--enable-stats],
 			enable_stats=$enableval, enable_stats=no)
 			enable_stats=$enableval, enable_stats=no)
 AC_MSG_RESULT($enable_stats)
 AC_MSG_RESULT($enable_stats)
 AC_SUBST(STATS, $enable_stats)
 AC_SUBST(STATS, $enable_stats)
-AC_SUBST(STARPU_DATA_STATS, $enable_stats)
-
+AC_SUBST(STARPU_ENABLE_STATS, $enable_stats)
 if test x$enable_stats = xyes; then
 if test x$enable_stats = xyes; then
-        AC_DEFINE(STARPU_DATA_STATS, [1], [enable statistics])
+        AC_DEFINE(STARPU_ENABLE_STATS, [1], [enable statistics])
+fi
+
+AC_MSG_CHECKING(whether memory stats should be displayed)
+AC_ARG_ENABLE(memory-stats, [AS_HELP_STRING([--enable-memory-stats],
+			     [enable memory stats])],
+			     enable_memory_stats=$enableval, enable_memory_stats=no)
+AC_MSG_RESULT($enable_memory_stats)
+if test x$enable_memory_stats = xyes; then
+        AC_DEFINE(STARPU_MEMORY_STATS, [1], [enable memory stats])
 fi
 fi
 
 
 AC_CHECK_HEADERS([glpk.h])
 AC_CHECK_HEADERS([glpk.h])
 STARPU_HAVE_LIBRARY(GLPK, [glpk])
 STARPU_HAVE_LIBRARY(GLPK, [glpk])
 AM_CONDITIONAL([STARPU_HAVE_GLPK], [test "x$build_sched_ctx_hypervisor" = "xyes"])
 AM_CONDITIONAL([STARPU_HAVE_GLPK], [test "x$build_sched_ctx_hypervisor" = "xyes"])
+AC_CHECK_HEADERS([Ayudame.h])
+
 ###############################################################################
 ###############################################################################
 #                                                                             #
 #                                                                             #
 #                  Miscellaneous options for StarPU                           #
 #                  Miscellaneous options for StarPU                           #
@@ -1099,7 +1153,7 @@ AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
 # If the user specifically asks for it, or if we are in a developer checkout, we enable mpi check
 # If the user specifically asks for it, or if we are in a developer checkout, we enable mpi check
 AC_ARG_ENABLE(mpi-check, AC_HELP_STRING([--enable-mpi-check], [Enable execution of MPI testcases]))
 AC_ARG_ENABLE(mpi-check, AC_HELP_STRING([--enable-mpi-check], [Enable execution of MPI testcases]))
 running_mpi_check=no
 running_mpi_check=no
-if test -d "$srcdir/.svn" ; then
+if test -d "$srcdir/.svn" -o -d "$srcdir/.git" ; then
     running_mpi_check=yes
     running_mpi_check=yes
 fi
 fi
 if test x$enable_mpi_check = xyes ; then
 if test x$enable_mpi_check = xyes ; then
@@ -1721,16 +1775,9 @@ m4_ifdef([AM_SILENT_RULES],
 # Documentation                          #
 # Documentation                          #
 ##########################################
 ##########################################
 
 
-enable_build_doc=yes
-AC_CHECK_PROGS([CHECK_TEXI2DVI], [texi2dvi], "no")
-if test "$CHECK_TEXI2DVI" == "no" ; then
-    enable_build_doc=no
-else
-    AC_CHECK_PROGS([CHECK_TEX], [tex], "no")
-    if test "$CHECK_TEX" == "no" ; then
-	enable_build_doc=no
-    fi
-fi
+AC_ARG_ENABLE(build-doc, [AS_HELP_STRING([--disable-build-doc],
+			[disable building of documentation])],
+			enable_build_doc=$enableval, enable_build_doc=yes)
 AM_CONDITIONAL(BUILD_DOC, [test x$enable_build_doc != xno])
 AM_CONDITIONAL(BUILD_DOC, [test x$enable_build_doc != xno])
 
 
 ###############################################################################
 ###############################################################################
@@ -1745,7 +1792,7 @@ AC_SUBST([LIBSTARPU_LDFLAGS])
 
 
 LIBSTARPU_LINK=libstarpu-$STARPU_EFFECTIVE_VERSION.la
 LIBSTARPU_LINK=libstarpu-$STARPU_EFFECTIVE_VERSION.la
 if test x$enable_perf_debug = xyes; then
 if test x$enable_perf_debug = xyes; then
-	LIBSTARPU_LINK=".libs/libstarpu-$STARPU_EFFECTIVE_VERSION.a $LIBSTARPU_LDFLAGS $HWLOC_LIBS $STARPU_CUDA_LDFLAGS $STARPU_OPENCL_LDFLAGS"
+	LIBSTARPU_LINK=".libs/libstarpu-$STARPU_EFFECTIVE_VERSION.a $LIBSTARPU_LDFLAGS $HWLOC_LIBS $SIMGRID_LIBS $STARPU_CUDA_LDFLAGS $STARPU_OPENCL_LDFLAGS"
 fi
 fi
 AC_SUBST([LIBSTARPU_LINK])
 AC_SUBST([LIBSTARPU_LINK])
 
 
@@ -1845,6 +1892,8 @@ AC_MSG_NOTICE([
 	       SOCL enabled:  $build_socl
 	       SOCL enabled:  $build_socl
                Scheduler Hypervisor: $build_sched_ctx_hypervisor
                Scheduler Hypervisor: $build_sched_ctx_hypervisor
                SOCL test suite: $run_socl_check
                SOCL test suite: $run_socl_check
+               simgrid enabled:                             $enable_simgrid
+               ayudame enabled:                             $ac_cv_header_Ayudame_h
 ])
 ])
 
 
 if test "$build_socl" = "yes" -a "$run_socl_check" = "no" ; then
 if test "$build_socl" = "yes" -a "$run_socl_check" = "no" ; then
@@ -1855,7 +1904,7 @@ To run the tests, you need to install the OCL implementation of ICD
 and set the variable SOCL_OCL_LIB_OPENCL to the location of the libOpenCL.so.])
 and set the variable SOCL_OCL_LIB_OPENCL to the location of the libOpenCL.so.])
 fi
 fi
 
 
-if test x"$have_valid_hwloc" = xno
+if test x"$have_valid_hwloc" = xno -a "$enable_simgrid" = "no"
 then
 then
   AC_MSG_NOTICE([
   AC_MSG_NOTICE([
 WARNING: hwloc was not enabled.  If the target machine is hyperthreaded the
 WARNING: hwloc was not enabled.  If the target machine is hyperthreaded the

+ 2 - 1
doc/Makefile.am

@@ -38,11 +38,12 @@ starpu_TEXINFOS = chapters/advanced-api.texi \
 	chapters/version.texi \
 	chapters/version.texi \
 	chapters/sched_ctx_hypervisor.texi
 	chapters/sched_ctx_hypervisor.texi
 
 
-MAINTAINERCLEANFILES = starpu.pdf
+MAINTAINERCLEANFILES = starpu.pdf starpu.html
 
 
 EXTRA_DIST = starpu.css
 EXTRA_DIST = starpu.css
 
 
 dist_pdf_DATA = starpu.pdf
 dist_pdf_DATA = starpu.pdf
+dist_html_DATA = starpu.html
 
 
 AM_MAKEINFOHTMLFLAGS = --css-include=$(top_srcdir)/doc/starpu.css --no-headers --no-split
 AM_MAKEINFOHTMLFLAGS = --css-include=$(top_srcdir)/doc/starpu.css --no-headers --no-split
 
 

+ 7 - 8
doc/chapters/advanced-api.texi

@@ -7,15 +7,15 @@
 @c See the file starpu.texi for copying conditions.
 @c See the file starpu.texi for copying conditions.
 
 
 @menu
 @menu
-* Defining a new data interface::  
-* Multiformat Data Interface::  
-* Task Bundles::                
-* Task Lists::                  
-* Using Parallel Tasks::       
+* Defining a new data interface::
+* Multiformat Data Interface::
+* Task Bundles::
+* Task Lists::
+* Using Parallel Tasks::
 * Scheduling Contexts::
 * Scheduling Contexts::
-* Defining a new scheduling policy::  
+* Defining a new scheduling policy::
 * Running drivers::
 * Running drivers::
-* Expert mode::                 
+* Expert mode::
 @end menu
 @end menu
 
 
 @node Defining a new data interface
 @node Defining a new data interface
@@ -892,4 +892,3 @@ Register a progression hook, to be called when workers are idle.
 @deftypefun void starpu_progression_hook_deregister (int @var{hook_id})
 @deftypefun void starpu_progression_hook_deregister (int @var{hook_id})
 Unregister a given progression hook.
 Unregister a given progression hook.
 @end deftypefun
 @end deftypefun
-

+ 18 - 10
doc/chapters/advanced-examples.texi

@@ -9,13 +9,13 @@
 @menu
 @menu
 * Using multiple implementations of a codelet::
 * Using multiple implementations of a codelet::
 * Enabling implementation according to capabilities::
 * Enabling implementation according to capabilities::
-* Task and Worker Profiling::   
+* Task and Worker Profiling::
 * Partitioning Data::
 * Partitioning Data::
-* Performance model example::   
-* Theoretical lower bound on execution time::  
-* Insert Task Utility::          
-* Data reduction::  
-* Temporary buffers::  
+* Performance model example::
+* Theoretical lower bound on execution time::
+* Insert Task Utility::
+* Data reduction::
+* Temporary buffers::
 * Parallel Tasks::
 * Parallel Tasks::
 * Debugging::
 * Debugging::
 * The multiformat interface::
 * The multiformat interface::
@@ -45,7 +45,7 @@ void scal_sse_func(void *buffers[], void *cl_arg)
     __m128 factor __attribute__((aligned(16)));
     __m128 factor __attribute__((aligned(16)));
     factor = _mm_set1_ps(*(float *) cl_arg);
     factor = _mm_set1_ps(*(float *) cl_arg);
 
 
-    unsigned int i;    
+    unsigned int i;
     for (i = 0; i < n_iterations; i++)
     for (i = 0; i < n_iterations; i++)
         VECTOR[i] = _mm_mul_ps(factor, VECTOR[i]);
         VECTOR[i] = _mm_mul_ps(factor, VECTOR[i]);
 @}
 @}
@@ -232,7 +232,7 @@ starpu_vector_data_register(&handle, 0, (uintptr_t)vector,
                             NX, sizeof(vector[0]));
                             NX, sizeof(vector[0]));
 
 
 /* Partition the vector in PARTS sub-vectors */
 /* Partition the vector in PARTS sub-vectors */
-starpu_filter f =
+starpu_data_filter f =
 @{
 @{
     .filter_func = starpu_block_filter_func_vector,
     .filter_func = starpu_block_filter_func_vector,
     .nchildren = PARTS
     .nchildren = PARTS
@@ -456,8 +456,10 @@ solve it immediately and get the optimized minimum, in ms. Its @code{integer}
 parameter allows to decide whether integer resolution should be computed
 parameter allows to decide whether integer resolution should be computed
 and returned too.
 and returned too.
 
 
-The @code{deps} parameter tells StarPU whether to take tasks and implicit data
-dependencies into account. It must be understood that the linear programming
+The @code{deps} parameter tells StarPU whether to take tasks, implicit data, and tag
+dependencies into account. Tags released in a callback or similar
+are not taken into account, only tags associated with a task are.
+It must be understood that the linear programming
 problem size is quadratic with the number of tasks and thus the time to solve it
 problem size is quadratic with the number of tasks and thus the time to solve it
 will be very long, it could be minutes for just a few dozen tasks. You should
 will be very long, it could be minutes for just a few dozen tasks. You should
 probably use @code{lp_solve -timeout 1 test.pl -wmps test.mps} to convert the
 probably use @code{lp_solve -timeout 1 test.pl -wmps test.mps} to convert the
@@ -469,6 +471,10 @@ of @code{lp_solve}. For instance, we often just use
 @code{lp_solve -cc -B1 -Bb -Bg -Bp -Bf -Br -BG -Bd -Bs -BB -Bo -Bc -Bi} , and
 @code{lp_solve -cc -B1 -Bb -Bg -Bp -Bf -Br -BG -Bd -Bs -BB -Bo -Bc -Bi} , and
 the @code{-gr} option can also be quite useful.
 the @code{-gr} option can also be quite useful.
 
 
+Data transfer time can only be taken into account when @code{deps} is set. Only
+data transfers inferred from implicit data dependencies between tasks are taken
+into account.
+
 Setting @code{deps} to 0 will only take into account the actual computations
 Setting @code{deps} to 0 will only take into account the actual computations
 on processing units. It however still properly takes into account the varying
 on processing units. It however still properly takes into account the varying
 performances of kernels and processing units, which is quite more accurate than
 performances of kernels and processing units, which is quite more accurate than
@@ -965,6 +971,8 @@ gdb helpers are also provided to show the whole StarPU state:
 (gdb) help starpu
 (gdb) help starpu
 @end smallexample
 @end smallexample
 
 
+The Temanejo task debugger can also be used, see @ref{Task debugger}.
+
 @node The multiformat interface
 @node The multiformat interface
 @section The multiformat interface
 @section The multiformat interface
 It may be interesting to represent the same piece of data using two different
 It may be interesting to represent the same piece of data using two different

+ 42 - 14
doc/chapters/basic-api.texi

@@ -161,17 +161,17 @@ it is therefore necessary to disable asynchronous data transfers.
 This can also be specified at compilation time by giving to the
 This can also be specified at compilation time by giving to the
 configure script the option @code{--disable-asynchronous-copy}.
 configure script the option @code{--disable-asynchronous-copy}.
 
 
-@item @code{int disable_cuda_asynchronous_copy} (default = 0)
+@item @code{int disable_asynchronous_cuda_copy} (default = 0)
 This flag should be set to 1 to disable asynchronous copies between
 This flag should be set to 1 to disable asynchronous copies between
 CPUs and CUDA accelerators. This can also be specified with the
 CPUs and CUDA accelerators. This can also be specified with the
-@code{STARPU_DISABLE_CUDA_ASYNCHRONOUS_COPY} environment variable.
+@code{STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY} environment variable.
 This can also be specified at compilation time by giving to the
 This can also be specified at compilation time by giving to the
 configure script the option @code{--disable-asynchronous-cuda-copy}.
 configure script the option @code{--disable-asynchronous-cuda-copy}.
 
 
-@item @code{int disable_opencl_asynchronous_copy} (default = 0)
+@item @code{int disable_asynchronous_opencl_copy} (default = 0)
 This flag should be set to 1 to disable asynchronous copies between
 This flag should be set to 1 to disable asynchronous copies between
 CPUs and OpenCL accelerators. This can also be specified with the
 CPUs and OpenCL accelerators. This can also be specified with the
-@code{STARPU_DISABLE_OPENCL_ASYNCHRONOUS_COPY} environment variable.
+@code{STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY} environment variable.
 The AMD implementation of OpenCL is known to
 The AMD implementation of OpenCL is known to
 fail when copying data asynchronously. When using this implementation,
 fail when copying data asynchronously. When using this implementation,
 it is therefore necessary to disable asynchronous data transfers.
 it is therefore necessary to disable asynchronous data transfers.
@@ -592,6 +592,7 @@ available on the given memory node instead of main memory.
 @menu
 @menu
 * Registering Data::
 * Registering Data::
 * Accessing Data Interfaces::
 * Accessing Data Interfaces::
+* Defining Interface::
 @end menu
 @end menu
 
 
 @node Registering Data
 @node Registering Data
@@ -771,7 +772,8 @@ The function also sets @var{count} to the size of the data handle by calling
 Unpack in @var{handle} the data located at @var{ptr} of size
 Unpack in @var{handle} the data located at @var{ptr} of size
 @var{count} as described by the interface of the data. The interface
 @var{count} as described by the interface of the data. The interface
 registered at @var{handle} must define a unpacking operation
 registered at @var{handle} must define a unpacking operation
-(@pxref{struct starpu_data_interface_ops}).
+(@pxref{struct starpu_data_interface_ops}). The memory at the address @code{ptr}
+is freed after calling the data unpacking operation.
 @end deftypefun
 @end deftypefun
 
 
 @node Accessing Variable Data Interfaces
 @node Accessing Variable Data Interfaces
@@ -1027,7 +1029,7 @@ Return a pointer to the non-zero values of the matrix designated by @var{interfa
 
 
 @defmac STARPU_BCSR_GET_NZVAL_DEV_HANDLE ({void *}@var{interface})
 @defmac STARPU_BCSR_GET_NZVAL_DEV_HANDLE ({void *}@var{interface})
 Return a device handle for the array of non-zero values in the matrix designated
 Return a device handle for the array of non-zero values in the matrix designated
-by @var{interface}. The offset documented below has to be used in addition to 
+by @var{interface}. The offset documented below has to be used in addition to
 this.
 this.
 @end defmac
 @end defmac
 
 
@@ -1102,7 +1104,7 @@ Return a pointer to the non-zero values of the matrix designated by @var{interfa
 
 
 @defmac STARPU_CSR_GET_NZVAL_DEV_HANDLE ({void *}@var{interface})
 @defmac STARPU_CSR_GET_NZVAL_DEV_HANDLE ({void *}@var{interface})
 Return a device handle for the array of non-zero values in the matrix designated
 Return a device handle for the array of non-zero values in the matrix designated
-by @var{interface}. The offset documented below has to be used in addition to 
+by @var{interface}. The offset documented below has to be used in addition to
 this.
 this.
 @end defmac
 @end defmac
 
 
@@ -1189,6 +1191,21 @@ Return the size of the elements registered into the matrix designated by
 @var{interface}.
 @var{interface}.
 @end defmac
 @end defmac
 
 
+@node Defining Interface
+@subsection Defining Interface
+
+Applications can provide their own interface. An example is provided in
+@code{examples/interface}. A few helpers are provided.
+
+@deftypefun uintptr_t starpu_allocate_buffer_on_node (uint32_t @var{dst_node}, size_t @var{size})
+Allocate @var{size} bytes on node @var{dst_node}. This returns 0 if allocation
+failed, the allocation method should then return -ENOMEM as allocated size.
+@end deftypefun
+
+@deftypefun void starpu_free_buffer_on_node (uint32_t @var{dst_node}, uintptr_t @var{data}, size_t @var{size})
+Free @var{data} of @var{size} bytes on node @var{dst_node}.
+@end deftypefun
+
 @node Data Partition
 @node Data Partition
 @section Data Partition
 @section Data Partition
 
 
@@ -1327,7 +1344,7 @@ vector represented by @var{father_interface} once partitioned in
 @deftypefun void starpu_block_shadow_filter_func_vector (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
 @deftypefun void starpu_block_shadow_filter_func_vector (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
 Return in @code{*@var{child_interface}} the @var{id}th element of the
 Return in @code{*@var{child_interface}} the @var{id}th element of the
 vector represented by @var{father_interface} once partitioned in
 vector represented by @var{father_interface} once partitioned in
-@var{nparts} chunks of equal size with a shadow border @code{filter_arg_ptr}, thus getting a vector of size (n-2*shadow)/nparts+2*shadow 
+@var{nparts} chunks of equal size with a shadow border @code{filter_arg_ptr}, thus getting a vector of size (n-2*shadow)/nparts+2*shadow
 
 
 The @code{filter_arg_ptr} field must be the shadow size casted into @code{void*}.
 The @code{filter_arg_ptr} field must be the shadow size casted into @code{void*}.
 
 
@@ -1527,7 +1544,7 @@ e.g. static storage case.
 @item @code{uint32_t where} (optional)
 @item @code{uint32_t where} (optional)
 Indicates which types of processing units are able to execute the
 Indicates which types of processing units are able to execute the
 codelet. The different values
 codelet. The different values
-@code{STARPU_CPU}, @code{STARPU_CUDA}, 
+@code{STARPU_CPU}, @code{STARPU_CUDA},
 @code{STARPU_OPENCL} can be combined to specify
 @code{STARPU_OPENCL} can be combined to specify
 on which types of processing units the codelet can be executed.
 on which types of processing units the codelet can be executed.
 @code{STARPU_CPU|STARPU_CUDA} for instance indicates that the codelet is
 @code{STARPU_CPU|STARPU_CUDA} for instance indicates that the codelet is
@@ -2405,11 +2422,21 @@ Converts the given timespec @var{ts} into microseconds.
 @end deftypefun
 @end deftypefun
 
 
 @deftypefun void starpu_bus_profiling_helper_display_summary (void)
 @deftypefun void starpu_bus_profiling_helper_display_summary (void)
-Displays statistics about the bus on stderr.
+Displays statistics about the bus on stderr if the environment
+variable @code{STARPU_BUS_STATS} is defined. The function is called
+automatically by @code{starpu_shutdown()}.
 @end deftypefun
 @end deftypefun
 
 
 @deftypefun void starpu_worker_profiling_helper_display_summary (void)
 @deftypefun void starpu_worker_profiling_helper_display_summary (void)
-Displays statistics about the workers on stderr.
+Displays statistics about the workers on stderr if the environment
+variable @code{STARPU_WORKER_STATS} is defined. The function is called
+automatically by @code{starpu_shutdown()}.
+@end deftypefun
+
+@deftypefun void starpu_display_memory_stats ()
+Display statistics about the current data handles registered within
+StarPU. StarPU must have been configured with the option
+@code{--enable-memory-stats} (@pxref{Memory feedback}).
 @end deftypefun
 @end deftypefun
 
 
 @node CUDA extensions
 @node CUDA extensions
@@ -2568,7 +2595,8 @@ use (e.g. different programs on the different OpenCL devices, for
 relocation purpose for instance).
 relocation purpose for instance).
 
 
 @deftp {Data Type} {struct starpu_opencl_program}
 @deftp {Data Type} {struct starpu_opencl_program}
-Stores the OpenCL programs as compiled for the different OpenCL devices.
+Stores the OpenCL programs as compiled for the different OpenCL
+devices. The different fields are:
 @table @asis
 @table @asis
 @item @code{cl_program programs[STARPU_MAXOPENCLDEVS]}
 @item @code{cl_program programs[STARPU_MAXOPENCLDEVS]}
 Stores each program for each OpenCL device.
 Stores each program for each OpenCL device.
@@ -2589,10 +2617,11 @@ This function unloads an OpenCL compiled code.
 @end deftypefun
 @end deftypefun
 
 
 @deftypefun void starpu_opencl_load_program_source ({const char *}@var{source_file_name}, char *@var{located_file_name}, char *@var{located_dir_name}, char *@var{opencl_program_source})
 @deftypefun void starpu_opencl_load_program_source ({const char *}@var{source_file_name}, char *@var{located_file_name}, char *@var{located_dir_name}, char *@var{opencl_program_source})
+@anchor{starpu_opencl_load_program_source}
 Store the contents of the file @var{source_file_name} in the buffer
 Store the contents of the file @var{source_file_name} in the buffer
 @var{opencl_program_source}. The file @var{source_file_name} can be
 @var{opencl_program_source}. The file @var{source_file_name} can be
 located in the current directory, or in the directory specified by the
 located in the current directory, or in the directory specified by the
-environment variable @code{STARPU_OPENCL_PROGRAM_DIR}, or in the
+environment variable @code{STARPU_OPENCL_PROGRAM_DIR} (@pxref{STARPU_OPENCL_PROGRAM_DIR}), or in the
 directory @code{share/starpu/opencl} of the installation directory of
 directory @code{share/starpu/opencl} of the installation directory of
 StarPU, or in the source directory of StarPU.
 StarPU, or in the source directory of StarPU.
 When the file is found, @code{located_file_name} is the full name of
 When the file is found, @code{located_file_name} is the full name of
@@ -2743,4 +2772,3 @@ This function blocks until the function has been executed on every appropriate
 processing units, so that it may not be called from a callback function for
 processing units, so that it may not be called from a callback function for
 instance.
 instance.
 @end deftypefun
 @end deftypefun
-

+ 16 - 16
doc/chapters/basic-examples.texi

@@ -7,10 +7,10 @@
 @c See the file starpu.texi for copying conditions.
 @c See the file starpu.texi for copying conditions.
 
 
 @menu
 @menu
-* Compiling and linking options::  
+* Compiling and linking options::
 * Hello World::                 Submitting Tasks
 * Hello World::                 Submitting Tasks
-* Vector Scaling Using the C Extension::  
-* Vector Scaling Using StarPu's API::  
+* Vector Scaling Using the C Extension::
+* Vector Scaling Using StarPu's API::
 * Vector Scaling on an Hybrid CPU/GPU Machine::  Handling Heterogeneous Architectures
 * Vector Scaling on an Hybrid CPU/GPU Machine::  Handling Heterogeneous Architectures
 @end menu
 @end menu
 
 
@@ -52,8 +52,8 @@ to StarPU. You can either use the StarPU C extension (@pxref{C
 Extensions}) or directly use the StarPU's API.
 Extensions}) or directly use the StarPU's API.
 
 
 @menu
 @menu
-* Hello World using the C Extension::  
-* Hello World using StarPU's API::  
+* Hello World using the C Extension::
+* Hello World using StarPU's API::
 @end menu
 @end menu
 
 
 @node Hello World using the C Extension
 @node Hello World using the C Extension
@@ -116,10 +116,10 @@ The remainder of this section shows how to achieve the same result using
 StarPU's standard C API.
 StarPU's standard C API.
 
 
 @menu
 @menu
-* Required Headers::            
-* Defining a Codelet::          
-* Submitting a Task::           
-* Execution of Hello World::    
+* Required Headers::
+* Defining a Codelet::
+* Submitting a Task::
+* Execution of Hello World::
 @end menu
 @end menu
 
 
 @node Required Headers
 @node Required Headers
@@ -306,8 +306,8 @@ example using StarPU's API is given in the next sections.
 
 
 
 
 @menu
 @menu
-* Adding an OpenCL Task Implementation::  
-* Adding a CUDA Task Implementation::  
+* Adding an OpenCL Task Implementation::
+* Adding a CUDA Task Implementation::
 @end menu
 @end menu
 
 
 The simplest way to get started writing StarPU programs is using the C
 The simplest way to get started writing StarPU programs is using the C
@@ -576,7 +576,7 @@ this example is given in @ref{Full source code for the 'Scaling a
 Vector' example}.
 Vector' example}.
 
 
 @menu
 @menu
-* Source Code of Vector Scaling::  
+* Source Code of Vector Scaling::
 * Execution of Vector Scaling::  Running the program
 * Execution of Vector Scaling::  Running the program
 @end menu
 @end menu
 
 
@@ -701,10 +701,10 @@ Contrary to the previous examples, the task submitted in this example may not
 only be executed by the CPUs, but also by a CUDA device.
 only be executed by the CPUs, but also by a CUDA device.
 
 
 @menu
 @menu
-* Definition of the CUDA Kernel::  
-* Definition of the OpenCL Kernel::  
-* Definition of the Main Code::  
-* Execution of Hybrid Vector Scaling::  
+* Definition of the CUDA Kernel::
+* Definition of the OpenCL Kernel::
+* Definition of the Main Code::
+* Execution of Hybrid Vector Scaling::
 @end menu
 @end menu
 
 
 @node Definition of the CUDA Kernel
 @node Definition of the CUDA Kernel

+ 1 - 1
doc/chapters/benchmarks.texi

@@ -6,7 +6,7 @@
 
 
 @menu
 @menu
 * Task size overhead::           Overhead of tasks depending on their size
 * Task size overhead::           Overhead of tasks depending on their size
-* Data transfer latency::        Latency of data transfers 
+* Data transfer latency::        Latency of data transfers
 * Gemm::                         Matrix-matrix multiplication
 * Gemm::                         Matrix-matrix multiplication
 * Cholesky::                     Cholesky factorization
 * Cholesky::                     Cholesky factorization
 * LU::                           LU factorization
 * LU::                           LU factorization

+ 234 - 109
doc/chapters/configuration.texi

@@ -17,24 +17,28 @@
 The following arguments can be given to the @code{configure} script.
 The following arguments can be given to the @code{configure} script.
 
 
 @menu
 @menu
-* Common configuration::        
-* Configuring workers::         
-* Extension configuration::     
-* Advanced configuration::      
+* Common configuration::
+* Configuring workers::
+* Extension configuration::
+* Advanced configuration::
 @end menu
 @end menu
 
 
 @node Common configuration
 @node Common configuration
 @subsection Common configuration
 @subsection Common configuration
 
 
-@table @code
 
 
-@item --enable-debug
+@defvr {Configure option} --enable-debug
 Enable debugging messages.
 Enable debugging messages.
+@end defvr
 
 
-@item --enable-fast
+@defvr {Configure option} --enable-fast
 Disable assertion checks, which saves computation time.
 Disable assertion checks, which saves computation time.
+@end defvr
 
 
-@item --enable-verbose
+@defvr {Configure option} --enable-verbose
 Increase the verbosity of the debugging messages.  This can be disabled
 Increase the verbosity of the debugging messages.  This can be disabled
 at runtime by setting the environment variable @code{STARPU_SILENT} to
 at runtime by setting the environment variable @code{STARPU_SILENT} to
 any value.
 any value.
@@ -42,25 +46,35 @@ any value.
 @smallexample
 @smallexample
 % STARPU_SILENT=1 ./vector_scal
 % STARPU_SILENT=1 ./vector_scal
 @end smallexample
 @end smallexample
+@end defvr
 
 
-@item --enable-coverage
+@defvr {Configure option} --enable-coverage
 Enable flags for the @code{gcov} coverage tool.
 Enable flags for the @code{gcov} coverage tool.
+@end defvr
 
 
-@item --enable-quick-check
+@defvr {Configure option} --enable-quick-check
 Specify tests and examples should be run on a smaller data set, i.e
 Specify tests and examples should be run on a smaller data set, i.e
 allowing a faster execution time
 allowing a faster execution time
+@end defvr
 
 
-@item --with-hwloc
+@defvr {Configure option} --with-hwloc
 Specify hwloc should be used by StarPU. hwloc should be found by the
 Specify hwloc should be used by StarPU. hwloc should be found by the
 means of the tools @code{pkg-config}.
 means of the tools @code{pkg-config}.
+@end defvr
 
 
-@item --with-hwloc=@var{prefix}
+@defvr {Configure option} --with-hwloc=@var{prefix}
 Specify hwloc should be used by StarPU. hwloc should be found in the
 Specify hwloc should be used by StarPU. hwloc should be found in the
 directory specified by @var{prefix}.
 directory specified by @var{prefix}.
+@end defvr
 
 
-@item --without-hwloc
+@defvr {Configure option} --without-hwloc
 Specify hwloc should not be used by StarPU.
 Specify hwloc should not be used by StarPU.
-@end table
+@end defvr
+
+@defvr {Configure option} --disable-build-doc
+Disable the creation of the documentation. This should be done on a
+machine which does not have the tools @code{makeinfo} and @code{tex}.
+@end defvr
 
 
 Additionally, the @command{configure} script recognize many variables, which
 Additionally, the @command{configure} script recognize many variables, which
 can be listed by typing @code{./configure --help}. For example,
 can be listed by typing @code{./configure --help}. For example,
@@ -70,183 +84,227 @@ CUDA kernels.
 @node Configuring workers
 @node Configuring workers
 @subsection Configuring workers
 @subsection Configuring workers
 
 
-@table @code
-
-@item --enable-maxcpus=@var{count}
+@defvr {Configure option} --enable-maxcpus=@var{count}
 Use at most @var{count} CPU cores.  This information is then
 Use at most @var{count} CPU cores.  This information is then
 available as the @code{STARPU_MAXCPUS} macro.
 available as the @code{STARPU_MAXCPUS} macro.
+@end defvr
 
 
-@item --disable-cpu
+@defvr {Configure option} --disable-cpu
 Disable the use of CPUs of the machine. Only GPUs etc. will be used.
 Disable the use of CPUs of the machine. Only GPUs etc. will be used.
+@end defvr
 
 
-@item --enable-maxcudadev=@var{count}
+@defvr {Configure option} --enable-maxcudadev=@var{count}
 Use at most @var{count} CUDA devices.  This information is then
 Use at most @var{count} CUDA devices.  This information is then
 available as the @code{STARPU_MAXCUDADEVS} macro.
 available as the @code{STARPU_MAXCUDADEVS} macro.
+@end defvr
 
 
-@item --disable-cuda
+@defvr {Configure option} --disable-cuda
 Disable the use of CUDA, even if a valid CUDA installation was detected.
 Disable the use of CUDA, even if a valid CUDA installation was detected.
+@end defvr
 
 
-@item --with-cuda-dir=@var{prefix}
+@defvr {Configure option} --with-cuda-dir=@var{prefix}
 Search for CUDA under @var{prefix}, which should notably contain
 Search for CUDA under @var{prefix}, which should notably contain
 @file{include/cuda.h}.
 @file{include/cuda.h}.
+@end defvr
 
 
-@item --with-cuda-include-dir=@var{dir}
+@defvr {Configure option} --with-cuda-include-dir=@var{dir}
 Search for CUDA headers under @var{dir}, which should
 Search for CUDA headers under @var{dir}, which should
 notably contain @code{cuda.h}. This defaults to @code{/include} appended to the
 notably contain @code{cuda.h}. This defaults to @code{/include} appended to the
 value given to @code{--with-cuda-dir}.
 value given to @code{--with-cuda-dir}.
+@end defvr
 
 
-@item --with-cuda-lib-dir=@var{dir}
+@defvr {Configure option} --with-cuda-lib-dir=@var{dir}
 Search for CUDA libraries under @var{dir}, which should notably contain
 Search for CUDA libraries under @var{dir}, which should notably contain
 the CUDA shared libraries---e.g., @file{libcuda.so}.  This defaults to
 the CUDA shared libraries---e.g., @file{libcuda.so}.  This defaults to
 @code{/lib} appended to the value given to @code{--with-cuda-dir}.
 @code{/lib} appended to the value given to @code{--with-cuda-dir}.
+@end defvr
 
 
-@item --disable-cuda-memcpy-peer
+@defvr {Configure option} --disable-cuda-memcpy-peer
 Explicitly disable peer transfers when using CUDA 4.0.
 Explicitly disable peer transfers when using CUDA 4.0.
+@end defvr
 
 
-@item --enable-maxopencldev=@var{count}
+@defvr {Configure option} --enable-maxopencldev=@var{count}
 Use at most @var{count} OpenCL devices.  This information is then
 Use at most @var{count} OpenCL devices.  This information is then
 available as the @code{STARPU_MAXOPENCLDEVS} macro.
 available as the @code{STARPU_MAXOPENCLDEVS} macro.
+@end defvr
 
 
-@item --disable-opencl
+@defvr {Configure option} --disable-opencl
 Disable the use of OpenCL, even if the SDK is detected.
 Disable the use of OpenCL, even if the SDK is detected.
+@end defvr
 
 
-@item --with-opencl-dir=@var{prefix}
+@defvr {Configure option} --with-opencl-dir=@var{prefix}
 Search for an OpenCL implementation under @var{prefix}, which should
 Search for an OpenCL implementation under @var{prefix}, which should
 notably contain @file{include/CL/cl.h} (or @file{include/OpenCL/cl.h} on
 notably contain @file{include/CL/cl.h} (or @file{include/OpenCL/cl.h} on
 Mac OS).
 Mac OS).
+@end defvr
 
 
-@item --with-opencl-include-dir=@var{dir}
+@defvr {Configure option} --with-opencl-include-dir=@var{dir}
 Search for OpenCL headers under @var{dir}, which should notably contain
 Search for OpenCL headers under @var{dir}, which should notably contain
 @file{CL/cl.h} (or @file{OpenCL/cl.h} on Mac OS).  This defaults to
 @file{CL/cl.h} (or @file{OpenCL/cl.h} on Mac OS).  This defaults to
 @code{/include} appended to the value given to @code{--with-opencl-dir}.
 @code{/include} appended to the value given to @code{--with-opencl-dir}.
+@end defvr
 
 
-@item --with-opencl-lib-dir=@var{dir}
+@defvr {Configure option} --with-opencl-lib-dir=@var{dir}
 Search for an OpenCL library under @var{dir}, which should notably
 Search for an OpenCL library under @var{dir}, which should notably
 contain the OpenCL shared libraries---e.g. @file{libOpenCL.so}. This defaults to
 contain the OpenCL shared libraries---e.g. @file{libOpenCL.so}. This defaults to
 @code{/lib} appended to the value given to @code{--with-opencl-dir}.
 @code{/lib} appended to the value given to @code{--with-opencl-dir}.
+@end defvr
 
 
-@item --enable-maximplementations=@var{count}
+@defvr {Configure option} --enable-opencl-simulator
+Enable considering the provided OpenCL implementation as a simulator, i.e. use
+the kernel duration returned by OpenCL profiling information as wallclock time
+instead of the actual measured real time. This requires simgrid support.
+@end defvr
+
+@defvr {Configure option} --enable-maximplementations=@var{count}
 Allow for at most @var{count} codelet implementations for the same
 Allow for at most @var{count} codelet implementations for the same
 target device.  This information is then available as the
 target device.  This information is then available as the
 @code{STARPU_MAXIMPLEMENTATIONS} macro.
 @code{STARPU_MAXIMPLEMENTATIONS} macro.
+@end defvr
 
 
-@item ----enable-max-sched-ctxs=@var{count}
+@defvr {Configure option} --enable-max-sched-ctxs=@var{count}
 Allow for at most @var{count} scheduling contexts
 Allow for at most @var{count} scheduling contexts
 This information is then available as the
 This information is then available as the
 @code{STARPU_NMAX_SCHED_CTXS} macro.
 @code{STARPU_NMAX_SCHED_CTXS} macro.
+@end defvr
 
 
-@item --disable-asynchronous-copy
+@defvr {Configure option} --disable-asynchronous-copy
 Disable asynchronous copies between CPU and GPU devices.
 Disable asynchronous copies between CPU and GPU devices.
 The AMD implementation of OpenCL is known to
 The AMD implementation of OpenCL is known to
 fail when copying data asynchronously. When using this implementation,
 fail when copying data asynchronously. When using this implementation,
 it is therefore necessary to disable asynchronous data transfers.
 it is therefore necessary to disable asynchronous data transfers.
+@end defvr
 
 
-@item --disable-asynchronous-cuda-copy
+@defvr {Configure option} --disable-asynchronous-cuda-copy
 Disable asynchronous copies between CPU and CUDA devices.
 Disable asynchronous copies between CPU and CUDA devices.
+@end defvr
 
 
-@item --disable-asynchronous-opencl-copy
+@defvr {Configure option} --disable-asynchronous-opencl-copy
 Disable asynchronous copies between CPU and OpenCL devices.
 Disable asynchronous copies between CPU and OpenCL devices.
 The AMD implementation of OpenCL is known to
 The AMD implementation of OpenCL is known to
 fail when copying data asynchronously. When using this implementation,
 fail when copying data asynchronously. When using this implementation,
 it is therefore necessary to disable asynchronous data transfers.
 it is therefore necessary to disable asynchronous data transfers.
-@end table
+@end defvr
 
 
 @node Extension configuration
 @node Extension configuration
 @subsection Extension configuration
 @subsection Extension configuration
 
 
-@table @code
-
-@item --disable-socl
+@defvr {Configure option} --disable-socl
 Disable the SOCL extension (@pxref{SOCL OpenCL Extensions}).  By
 Disable the SOCL extension (@pxref{SOCL OpenCL Extensions}).  By
 default, it is enabled when an OpenCL implementation is found.
 default, it is enabled when an OpenCL implementation is found.
+@end defvr
 
 
-@item --disable-starpu-top
+@defvr {Configure option} --disable-starpu-top
 Disable the StarPU-Top interface (@pxref{StarPU-Top}).  By default, it
 Disable the StarPU-Top interface (@pxref{StarPU-Top}).  By default, it
 is enabled when the required dependencies are found.
 is enabled when the required dependencies are found.
+@end defvr
 
 
-@item --disable-gcc-extensions
+@defvr {Configure option} --disable-gcc-extensions
 Disable the GCC plug-in (@pxref{C Extensions}).  By default, it is
 Disable the GCC plug-in (@pxref{C Extensions}).  By default, it is
 enabled when the GCC compiler provides a plug-in support.
 enabled when the GCC compiler provides a plug-in support.
+@end defvr
 
 
-@item --with-mpicc=@var{path}
+@defvr {Configure option} --with-mpicc=@var{path}
 Use the @command{mpicc} compiler at @var{path}, for starpumpi
 Use the @command{mpicc} compiler at @var{path}, for starpumpi
 (@pxref{StarPU MPI support}).
 (@pxref{StarPU MPI support}).
-
-@end table
+@end defvr
 
 
 @node Advanced configuration
 @node Advanced configuration
 @subsection Advanced configuration
 @subsection Advanced configuration
 
 
-@table @code
-
-@item --enable-perf-debug
+@defvr {Configure option} --enable-perf-debug
 Enable performance debugging through gprof.
 Enable performance debugging through gprof.
+@end defvr
 
 
-@item --enable-model-debug
+@defvr {Configure option} --enable-model-debug
 Enable performance model debugging.
 Enable performance model debugging.
+@end defvr
 
 
-@item --enable-stats
+@defvr {Configure option} --enable-stats
 @c see ../../src/datawizard/datastats.c
 @c see ../../src/datawizard/datastats.c
-Enable gathering of memory transfer statistics.
+Enable gathering of various data statistics (@pxref{Data statistics}).
+@end defvr
 
 
-@item --enable-maxbuffers
+@defvr {Configure option} --enable-maxbuffers
 Define the maximum number of buffers that tasks will be able to take
 Define the maximum number of buffers that tasks will be able to take
 as parameters, then available as the @code{STARPU_NMAXBUFS} macro.
 as parameters, then available as the @code{STARPU_NMAXBUFS} macro.
+@end defvr
 
 
-@item --enable-allocation-cache
+@defvr {Configure option} --enable-allocation-cache
 Enable the use of a data allocation cache to avoid the cost of it with
 Enable the use of a data allocation cache to avoid the cost of it with
 CUDA. Still experimental.
 CUDA. Still experimental.
+@end defvr
 
 
-@item --enable-opengl-render
+@defvr {Configure option} --enable-opengl-render
 Enable the use of OpenGL for the rendering of some examples.
 Enable the use of OpenGL for the rendering of some examples.
 @c TODO: rather default to enabled when detected
 @c TODO: rather default to enabled when detected
+@end defvr
 
 
-@item --enable-blas-lib
+@defvr {Configure option} --enable-blas-lib
 Specify the blas library to be used by some of the examples. The
 Specify the blas library to be used by some of the examples. The
 library has to be 'atlas' or 'goto'.
 library has to be 'atlas' or 'goto'.
+@end defvr
 
 
-@item --disable-starpufft
+@defvr {Configure option} --disable-starpufft
 Disable the build of libstarpufft, even if fftw or cuFFT is available.
 Disable the build of libstarpufft, even if fftw or cuFFT is available.
+@end defvr
 
 
-@item --with-magma=@var{prefix}
+@defvr {Configure option} --with-magma=@var{prefix}
 Search for MAGMA under @var{prefix}.  @var{prefix} should notably
 Search for MAGMA under @var{prefix}.  @var{prefix} should notably
 contain @file{include/magmablas.h}.
 contain @file{include/magmablas.h}.
+@end defvr
 
 
-@item --with-fxt=@var{prefix}
+@defvr {Configure option} --with-fxt=@var{prefix}
 Search for FxT under @var{prefix}.
 Search for FxT under @var{prefix}.
 @url{http://savannah.nongnu.org/projects/fkt, FxT} is used to generate
 @url{http://savannah.nongnu.org/projects/fkt, FxT} is used to generate
 traces of scheduling events, which can then be rendered them using ViTE
 traces of scheduling events, which can then be rendered them using ViTE
 (@pxref{Off-line, off-line performance feedback}).  @var{prefix} should
 (@pxref{Off-line, off-line performance feedback}).  @var{prefix} should
 notably contain @code{include/fxt/fxt.h}.
 notably contain @code{include/fxt/fxt.h}.
+@end defvr
 
 
-@item --with-perf-model-dir=@var{dir}
+@defvr {Configure option} --with-perf-model-dir=@var{dir}
 Store performance models under @var{dir}, instead of the current user's
 Store performance models under @var{dir}, instead of the current user's
 home.
 home.
+@end defvr
 
 
-@item --with-goto-dir=@var{prefix}
+@defvr {Configure option} --with-goto-dir=@var{prefix}
 Search for GotoBLAS under @var{prefix}, which should notably contain @file{libgoto.so} or @file{libgoto2.so}.
 Search for GotoBLAS under @var{prefix}, which should notably contain @file{libgoto.so} or @file{libgoto2.so}.
+@end defvr
 
 
-@item --with-atlas-dir=@var{prefix}
+@defvr {Configure option} --with-atlas-dir=@var{prefix}
 Search for ATLAS under @var{prefix}, which should notably contain
 Search for ATLAS under @var{prefix}, which should notably contain
 @file{include/cblas.h}.
 @file{include/cblas.h}.
+@end defvr
 
 
-@item --with-mkl-cflags=@var{cflags}
+@defvr {Configure option} --with-mkl-cflags=@var{cflags}
 Use @var{cflags} to compile code that uses the MKL library.
 Use @var{cflags} to compile code that uses the MKL library.
+@end defvr
 
 
-@item --with-mkl-ldflags=@var{ldflags}
+@defvr {Configure option} --with-mkl-ldflags=@var{ldflags}
 Use @var{ldflags} when linking code that uses the MKL library.  Note
 Use @var{ldflags} when linking code that uses the MKL library.  Note
 that the
 that the
 @url{http://software.intel.com/en-us/articles/intel-mkl-link-line-advisor/,
 @url{http://software.intel.com/en-us/articles/intel-mkl-link-line-advisor/,
 MKL website} provides a script to determine the linking flags.
 MKL website} provides a script to determine the linking flags.
+@end defvr
 
 
-@item --disable-build-examples
+@defvr {Configure option} --disable-build-examples
 Disable the build of examples.
 Disable the build of examples.
+@end defvr
+
 
 
-@item --enable-sched-ctx-hypervisor
-Enables the Scheduling Context Hypervisor plugin(@pxref{Scheduling Context Hypervisor}). 
+@defvr {Configure option} --enable-sched-ctx-hypervisor
+Enables the Scheduling Context Hypervisor plugin (@pxref{Scheduling Context Hypervisor}).
 By default, it is disabled.
 By default, it is disabled.
+@end defvr
 
 
-@end table
+@defvr {Configure option} --enable-memory-stats
+Enable memory statistics (@pxref{Memory feedback}).
+@end defvr
+
+@defvr {Configure option} --enable-simgrid
+Enable simulation of execution in simgrid, to allow easy experimentation with
+various numbers of cores and GPUs, or amount of memory, etc. Experimental.
+@end defvr
 
 
 @node Execution configuration through environment variables
 @node Execution configuration through environment variables
 @section Execution configuration through environment variables
 @section Execution configuration through environment variables
@@ -261,28 +319,41 @@ By default, it is disabled.
 @node Workers
 @node Workers
 @subsection Configuring workers
 @subsection Configuring workers
 
 
-@table @code
-
-@item @code{STARPU_NCPU}
-Specify the number of CPU workers (thus not including workers dedicated to control acceleratores). Note that by default, StarPU will not allocate
+@defvr {Environment variable} STARPU_NCPU
+Specify the number of CPU workers (thus not including workers dedicated to control accelerators). Note that by default, StarPU will not allocate
 more CPU workers than there are physical CPUs, and that some CPUs are used to control
 more CPU workers than there are physical CPUs, and that some CPUs are used to control
 the accelerators.
 the accelerators.
+@end defvr
 
 
-@item @code{STARPU_NCUDA}
+@defvr {Environment variable} STARPU_NCPUS
+This variable is deprecated. You should use @code{STARPU_NCPU}.
+@end defvr
+
+@defvr {Environment variable} STARPU_NCUDA
 Specify the number of CUDA devices that StarPU can use. If
 Specify the number of CUDA devices that StarPU can use. If
 @code{STARPU_NCUDA} is lower than the number of physical devices, it is
 @code{STARPU_NCUDA} is lower than the number of physical devices, it is
 possible to select which CUDA devices should be used by the means of the
 possible to select which CUDA devices should be used by the means of the
 @code{STARPU_WORKERS_CUDAID} environment variable. By default, StarPU will
 @code{STARPU_WORKERS_CUDAID} environment variable. By default, StarPU will
 create as many CUDA workers as there are CUDA devices.
 create as many CUDA workers as there are CUDA devices.
+@end defvr
 
 
-@item @code{STARPU_NOPENCL}
+@defvr {Environment variable} STARPU_NOPENCL
 OpenCL equivalent of the @code{STARPU_NCUDA} environment variable.
 OpenCL equivalent of the @code{STARPU_NCUDA} environment variable.
+@end defvr
+
+@defvr {Environment variable} STARPU_OPENCL_ON_CPUS
+By default, the OpenCL driver only enables GPU and accelerator
+devices. By setting the environment variable
+@code{STARPU_OPENCL_ON_CPUS} to 1, the OpenCL driver will also enable
+CPU devices.
+@end defvr
 
 
-@item @code{STARPU_WORKERS_NOBIND}
+@defvr {Environment variable} STARPU_WORKERS_NOBIND
 Setting it to non-zero will prevent StarPU from binding its threads to
 Setting it to non-zero will prevent StarPU from binding its threads to
 CPUs. This is for instance useful when running the testsuite in parallel.
 CPUs. This is for instance useful when running the testsuite in parallel.
+@end defvr
 
 
-@item @code{STARPU_WORKERS_CPUID}
+@defvr {Environment variable} STARPU_WORKERS_CPUID
 Passing an array of integers (starting from 0) in @code{STARPU_WORKERS_CPUID}
 Passing an array of integers (starting from 0) in @code{STARPU_WORKERS_CPUID}
 specifies on which logical CPU the different workers should be
 specifies on which logical CPU the different workers should be
 bound. For instance, if @code{STARPU_WORKERS_CPUID = "0 1 4 5"}, the first
 bound. For instance, if @code{STARPU_WORKERS_CPUID = "0 1 4 5"}, the first
@@ -305,8 +376,9 @@ third (resp. second and fourth) workers will be put on CPU #0 (resp. CPU #1).
 
 
 This variable is ignored if the @code{use_explicit_workers_bindid} flag of the
 This variable is ignored if the @code{use_explicit_workers_bindid} flag of the
 @code{starpu_conf} structure passed to @code{starpu_init} is set.
 @code{starpu_conf} structure passed to @code{starpu_init} is set.
+@end defvr
 
 
-@item @code{STARPU_WORKERS_CUDAID}
+@defvr {Environment variable} STARPU_WORKERS_CUDAID
 Similarly to the @code{STARPU_WORKERS_CPUID} environment variable, it is
 Similarly to the @code{STARPU_WORKERS_CPUID} environment variable, it is
 possible to select which CUDA devices should be used by StarPU. On a machine
 possible to select which CUDA devices should be used by StarPU. On a machine
 equipped with 4 GPUs, setting @code{STARPU_WORKERS_CUDAID = "1 3"} and
 equipped with 4 GPUs, setting @code{STARPU_WORKERS_CUDAID = "1 3"} and
@@ -316,20 +388,22 @@ the one reported by CUDA).
 
 
 This variable is ignored if the @code{use_explicit_workers_cuda_gpuid} flag of
 This variable is ignored if the @code{use_explicit_workers_cuda_gpuid} flag of
 the @code{starpu_conf} structure passed to @code{starpu_init} is set.
 the @code{starpu_conf} structure passed to @code{starpu_init} is set.
+@end defvr
 
 
-@item @code{STARPU_WORKERS_OPENCLID}
+@defvr {Environment variable} STARPU_WORKERS_OPENCLID
 OpenCL equivalent of the @code{STARPU_WORKERS_CUDAID} environment variable.
 OpenCL equivalent of the @code{STARPU_WORKERS_CUDAID} environment variable.
 
 
 This variable is ignored if the @code{use_explicit_workers_opencl_gpuid} flag of
 This variable is ignored if the @code{use_explicit_workers_opencl_gpuid} flag of
 the @code{starpu_conf} structure passed to @code{starpu_init} is set.
 the @code{starpu_conf} structure passed to @code{starpu_init} is set.
+@end defvr
 
 
-@item @code{STARPU_SINGLE_COMBINED_WORKER}
+@defvr {Environment variable} STARPU_SINGLE_COMBINED_WORKER
 If set, StarPU will create several workers which won't be able to work
 If set, StarPU will create several workers which won't be able to work
 concurrently. It will create combined workers which size goes from 1 to the
 concurrently. It will create combined workers which size goes from 1 to the
 total number of CPU workers in the system.
 total number of CPU workers in the system.
+@end defvr
 
 
-@item @code{SYNTHESIZE_ARITY_COMBINED_WORKER}
-
+@defvr {Environment variable} STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER
 Let the user decide how many elements are allowed between combined workers
 Let the user decide how many elements are allowed between combined workers
 created from hwloc information. For instance, in the case of sockets with 6
 created from hwloc information. For instance, in the case of sockets with 6
 cores without shared L2 caches, if @code{STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER} is
 cores without shared L2 caches, if @code{STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER} is
@@ -344,51 +418,55 @@ is already a normal worker for it).
 
 
 The default, 2, thus makes StarPU tend to build binary trees of combined
 The default, 2, thus makes StarPU tend to build binary trees of combined
 workers.
 workers.
+@end defvr
 
 
-@item @code{STARPU_DISABLE_ASYNCHRONOUS_COPY}
+@defvr {Environment variable} STARPU_DISABLE_ASYNCHRONOUS_COPY
 Disable asynchronous copies between CPU and GPU devices.
 Disable asynchronous copies between CPU and GPU devices.
 The AMD implementation of OpenCL is known to
 The AMD implementation of OpenCL is known to
 fail when copying data asynchronously. When using this implementation,
 fail when copying data asynchronously. When using this implementation,
 it is therefore necessary to disable asynchronous data transfers.
 it is therefore necessary to disable asynchronous data transfers.
+@end defvr
 
 
-@item @code{STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY}
+@defvr {Environment variable} STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY
 Disable asynchronous copies between CPU and CUDA devices.
 Disable asynchronous copies between CPU and CUDA devices.
+@end defvr
 
 
-@item @code{STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY}
+@defvr {Environment variable} STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY
 Disable asynchronous copies between CPU and OpenCL devices.
 Disable asynchronous copies between CPU and OpenCL devices.
 The AMD implementation of OpenCL is known to
 The AMD implementation of OpenCL is known to
 fail when copying data asynchronously. When using this implementation,
 fail when copying data asynchronously. When using this implementation,
 it is therefore necessary to disable asynchronous data transfers.
 it is therefore necessary to disable asynchronous data transfers.
+@end defvr
 
 
-@item @code{STARPU_DISABLE_CUDA_GPU_GPU_DIRECT}
+@defvr {Environment variable} STARPU_DISABLE_CUDA_GPU_GPU_DIRECT
 Disable direct CUDA transfers from GPU to GPU, and let CUDA copy through RAM
 Disable direct CUDA transfers from GPU to GPU, and let CUDA copy through RAM
 instead. This permits to test the performance effect of GPU-Direct.
 instead. This permits to test the performance effect of GPU-Direct.
-
-@end table
+@end defvr
 
 
 @node Scheduling
 @node Scheduling
 @subsection Configuring the Scheduling engine
 @subsection Configuring the Scheduling engine
 
 
-@table @code
-
-@item @code{STARPU_SCHED}
+@defvr {Environment variable} STARPU_SCHED
 Choose between the different scheduling policies proposed by StarPU: work
 Choose between the different scheduling policies proposed by StarPU: work
 random, stealing, greedy, with performance models, etc.
 random, stealing, greedy, with performance models, etc.
 
 
 Use @code{STARPU_SCHED=help} to get the list of available schedulers.
 Use @code{STARPU_SCHED=help} to get the list of available schedulers.
+@end defvr
 
 
-@item @code{STARPU_CALIBRATE}
+@defvr {Environment variable} STARPU_CALIBRATE
 If this variable is set to 1, the performance models are calibrated during
 If this variable is set to 1, the performance models are calibrated during
 the execution. If it is set to 2, the previous values are dropped to restart
 the execution. If it is set to 2, the previous values are dropped to restart
 calibration from scratch. Setting this variable to 0 disables calibration; this
 calibration from scratch. Setting this variable to 0 disables calibration; this
 is the default behaviour.
 is the default behaviour.
 
 
 Note: this currently only applies to @code{dm}, @code{dmda} and @code{heft} scheduling policies.
 Note: this currently only applies to @code{dm}, @code{dmda} and @code{heft} scheduling policies.
+@end defvr
 
 
-@item @code{STARPU_BUS_CALIBRATE}
+@defvr {Environment variable} STARPU_BUS_CALIBRATE
 If this variable is set to 1, the bus is recalibrated during initialization.
 If this variable is set to 1, the bus is recalibrated during initialization.
+@end defvr
 
 
-@item @code{STARPU_PREFETCH}
+@defvr {Environment variable} STARPU_PREFETCH
 @anchor{STARPU_PREFETCH}
 @anchor{STARPU_PREFETCH}
 This variable indicates whether data prefetching should be enabled (0 means
 This variable indicates whether data prefetching should be enabled (0 means
 that it is disabled). If prefetching is enabled, when a task is scheduled to be
 that it is disabled). If prefetching is enabled, when a task is scheduled to be
@@ -396,68 +474,115 @@ executed e.g. on a GPU, StarPU will request an asynchronous transfer in
 advance, so that data is already present on the GPU when the task starts. As a
 advance, so that data is already present on the GPU when the task starts. As a
 result, computation and data transfers are overlapped.
 result, computation and data transfers are overlapped.
 Note that prefetching is enabled by default in StarPU.
 Note that prefetching is enabled by default in StarPU.
+@end defvr
 
 
-@item @code{STARPU_SCHED_ALPHA}
+@defvr {Environment variable} STARPU_SCHED_ALPHA
 To estimate the cost of a task StarPU takes into account the estimated
 To estimate the cost of a task StarPU takes into account the estimated
 computation time (obtained thanks to performance models). The alpha factor is
 computation time (obtained thanks to performance models). The alpha factor is
 the coefficient to be applied to it before adding it to the communication part.
 the coefficient to be applied to it before adding it to the communication part.
+@end defvr
 
 
-@item @code{STARPU_SCHED_BETA}
+@defvr {Environment variable} STARPU_SCHED_BETA
 To estimate the cost of a task StarPU takes into account the estimated
 To estimate the cost of a task StarPU takes into account the estimated
 data transfer time (obtained thanks to performance models). The beta factor is
 data transfer time (obtained thanks to performance models). The beta factor is
 the coefficient to be applied to it before adding it to the computation part.
 the coefficient to be applied to it before adding it to the computation part.
+@end defvr
 
 
-@end table
+@defvr {Environment variable} STARPU_SCHED_GAMMA
+Define the execution time penalty of a joule (@pxref{Power-based scheduling}).
+@end defvr
+
+@defvr {Environment variable} STARPU_IDLE_POWER
+Define the idle power of the machine (@pxref{Power-based scheduling}).
+@end defvr
+
+@defvr {Environment variable} STARPU_PROFILING
+Enable on-line performance monitoring (@pxref{Enabling on-line performance monitoring}).
+@end defvr
 
 
 @node Extensions
 @node Extensions
 @subsection Extensions
 @subsection Extensions
 
 
-@table @code
-
-@item @code{SOCL_OCL_LIB_OPENCL}
+@defvr {Environment variable} SOCL_OCL_LIB_OPENCL
 THE SOCL test suite is only run when the environment variable
 The SOCL test suite is only run when the environment variable
 The SOCL test suite is only run when the environment variable
 @code{SOCL_OCL_LIB_OPENCL} is defined. It should contain the location
 of the libOpenCL.so file of the OCL ICD implementation.
 of the libOpenCL.so file of the OCL ICD implementation.
+@end defvr
 
 
-@item @code{STARPU_COMM_STATS}
+@defvr {Environment variable} STARPU_COMM_STATS
 @anchor{STARPU_COMM_STATS}
 @anchor{STARPU_COMM_STATS}
 Communication statistics for starpumpi (@pxref{StarPU MPI support})
 Communication statistics for starpumpi (@pxref{StarPU MPI support})
 will be enabled when the environment variable @code{STARPU_COMM_STATS}
 will be enabled when the environment variable @code{STARPU_COMM_STATS}
 is defined to a value other than 0.
 is defined to a value other than 0.
+@end defvr
 
 
-@item @code{STARPU_MPI_CACHE}
+@defvr {Environment variable} STARPU_MPI_CACHE
 @anchor{STARPU_MPI_CACHE}
 @anchor{STARPU_MPI_CACHE}
 Communication cache for starpumpi (@pxref{StarPU MPI support}) will be
 Communication cache for starpumpi (@pxref{StarPU MPI support}) will be
 disabled when the environment variable @code{STARPU_MPI_CACHE} is set
 disabled when the environment variable @code{STARPU_MPI_CACHE} is set
 to 0. It is enabled by default or for any other values of the variable
 to 0. It is enabled by default or for any other values of the variable
 @code{STARPU_MPI_CACHE}.
 @code{STARPU_MPI_CACHE}.
-@end table
+@end defvr
 
 
 @node Misc
 @node Misc
 @subsection Miscellaneous and debug
 @subsection Miscellaneous and debug
 
 
-@table @code
-
-@item @code{STARPU_SILENT}
+@defvr {Environment variable} STARPU_OPENCL_PROGRAM_DIR
+@anchor{STARPU_OPENCL_PROGRAM_DIR}
+This specifies the directory where the OpenCL codelet source files are
+located. The function @ref{starpu_opencl_load_program_source} looks
+for the codelet in the current directory, in the directory specified
+by the environment variable @code{STARPU_OPENCL_PROGRAM_DIR}, in the
+directory @code{share/starpu/opencl} of the installation directory of
+StarPU, and finally in the source directory of StarPU.
+@end defvr
+
+@defvr {Environment variable} STARPU_SILENT
 This variable allows to disable verbose mode at runtime when StarPU
 This variable allows to disable verbose mode at runtime when StarPU
 has been configured with the option @code{--enable-verbose}. It also
 has been configured with the option @code{--enable-verbose}. It also
 disables the display of StarPU information and warning messages.
 disables the display of StarPU information and warning messages.
+@end defvr
 
 
-@item @code{STARPU_LOGFILENAME}
+@defvr {Environment variable} STARPU_LOGFILENAME
 This variable specifies in which file the debugging output should be saved to.
 This variable specifies in which file the debugging output should be saved to.
+@end defvr
 
 
-@item @code{STARPU_FXT_PREFIX}
+@defvr {Environment variable} STARPU_FXT_PREFIX
 This variable specifies in which directory to save the trace generated if FxT is enabled. It needs to have a trailing '/' character.
 This variable specifies in which directory to save the trace generated if FxT is enabled. It needs to have a trailing '/' character.
+@end defvr
 
 
-@item @code{STARPU_LIMIT_GPU_MEM}
+@defvr {Environment variable} STARPU_LIMIT_GPU_MEM
 This variable specifies the maximum number of megabytes that should be
 This variable specifies the maximum number of megabytes that should be
 available to the application on each GPUs. In case this value is smaller than
 available to the application on each GPUs. In case this value is smaller than
 the size of the memory of a GPU, StarPU pre-allocates a buffer to waste memory
 the size of the memory of a GPU, StarPU pre-allocates a buffer to waste memory
 on the device. This variable is intended to be used for experimental purposes
 on the device. This variable is intended to be used for experimental purposes
 as it emulates devices that have a limited amount of memory.
 as it emulates devices that have a limited amount of memory.
+@end defvr
 
 
-@item @code{STARPU_GENERATE_TRACE}
+@defvr {Environment variable} STARPU_GENERATE_TRACE
 When set to 1, this variable indicates that StarPU should automatically
 When set to 1, this variable indicates that StarPU should automatically
-generate a Paje trace when starpu_shutdown is called.
-
-@end table
+generate a Paje trace when @code{starpu_shutdown()} is called.
+@end defvr
+
+@defvr {Environment variable} STARPU_MEMORY_STATS
+When set to 0, disable the display of memory statistics on data which
+have not been unregistered at the end of the execution (@pxref{Memory
+feedback}).
+@end defvr
+
+@defvr {Environment variable} STARPU_BUS_STATS
+When defined, statistics about data transfers will be displayed when calling
+@code{starpu_shutdown()} (@pxref{Profiling}).
+@end defvr
+
+@defvr {Environment variable} STARPU_WORKER_STATS
+When defined, statistics about the workers will be displayed when calling
+@code{starpu_shutdown()} (@pxref{Profiling}). When combined with the
+environment variable @code{STARPU_PROFILING}, it displays the power
+consumption (@pxref{Power-based scheduling}).
+@end defvr
+
+@defvr {Environment variable} STARPU_STATS
+When set to 0, data statistics will not be displayed at the
+end of the execution of an application (@pxref{Data statistics}).
+@end defvr

+ 0 - 1
doc/chapters/fdl-1.3.texi

@@ -505,4 +505,3 @@ to permit their use in free software.
 @c Local Variables:
 @c Local Variables:
 @c ispell-local-pdict: "ispell-dict"
 @c ispell-local-pdict: "ispell-dict"
 @c End:
 @c End:
-

+ 10 - 10
doc/chapters/installing.texi

@@ -7,9 +7,9 @@
 @c See the file starpu.texi for copying conditions.
 @c See the file starpu.texi for copying conditions.
 
 
 @menu
 @menu
-* Downloading StarPU::          
-* Configuration of StarPU::     
-* Building and Installing StarPU::  
+* Downloading StarPU::
+* Configuration of StarPU::
+* Building and Installing StarPU::
 @end menu
 @end menu
 
 
 StarPU can be built and installed by the standard means of the GNU
 StarPU can be built and installed by the standard means of the GNU
@@ -20,8 +20,8 @@ can be used to install StarPU.
 @section Downloading StarPU
 @section Downloading StarPU
 
 
 @menu
 @menu
-* Getting Sources::             
-* Optional dependencies::       
+* Getting Sources::
+* Optional dependencies::
 @end menu
 @end menu
 
 
 @node Getting Sources
 @node Getting Sources
@@ -69,8 +69,8 @@ of hwloc.
 @section Configuration of StarPU
 @section Configuration of StarPU
 
 
 @menu
 @menu
-* Generating Makefiles and configuration scripts::  
-* Running the configuration::   
+* Generating Makefiles and configuration scripts::
+* Running the configuration::
 @end menu
 @end menu
 
 
 @node Generating Makefiles and configuration scripts
 @node Generating Makefiles and configuration scripts
@@ -99,9 +99,9 @@ Details about options that are useful to give to @code{./configure} are given in
 @section Building and Installing StarPU
 @section Building and Installing StarPU
 
 
 @menu
 @menu
-* Building::                    
-* Sanity Checks::               
-* Installing::                  
+* Building::
+* Sanity Checks::
+* Installing::
 @end menu
 @end menu
 
 
 @node Building
 @node Building

+ 2 - 2
doc/chapters/introduction.texi

@@ -70,8 +70,8 @@ policies in a portable fashion (@pxref{Scheduling Policy API}).
 The remainder of this section describes the main concepts used in StarPU.
 The remainder of this section describes the main concepts used in StarPU.
 
 
 @menu
 @menu
-* Codelet and Tasks::           
-* StarPU Data Management Library::  
+* Codelet and Tasks::
+* StarPU Data Management Library::
 * Glossary::
 * Glossary::
 * Research Papers::
 * Research Papers::
 @end menu
 @end menu

+ 39 - 10
doc/chapters/mpi-support.texi

@@ -20,16 +20,24 @@ distributed application, by automatically issuing all required data transfers
 according to the task graph and an application-provided distribution.
 according to the task graph and an application-provided distribution.
 
 
 @menu
 @menu
-* The API::                     
-* Simple Example::              
-* Exchanging User Defined Data Interface::  
-* MPI Insert Task Utility::     
-* MPI Collective Operations::   
+* The API::
+* Simple Example::
+* Exchanging User Defined Data Interface::
+* MPI Insert Task Utility::
+* MPI Collective Operations::
 @end menu
 @end menu
 
 
 @node The API
 @node The API
 @section The API
 @section The API
 
 
+@menu
+* Compilation::
+* Initialisation::
+* Communication::
+* Communication cache::
+@end menu
+
+@node Compilation
 @subsection Compilation
 @subsection Compilation
 
 
 The flags required to compile or link against the MPI layer are then
 The flags required to compile or link against the MPI layer are then
@@ -42,21 +50,27 @@ accessible with the following commands:
 
 
 Also pass the @code{--static} option if the application is to be linked statically.
 Also pass the @code{--static} option if the application is to be linked statically.
 
 
+@node Initialisation
 @subsection Initialisation
 @subsection Initialisation
 
 
-@deftypefun int starpu_mpi_init (int *@var{argc}, char ***@var{argv})
-Initializes the starpumpi library. If MPI is not already initialized,
-it will be by calling @code{MPI_Init_Thread(argc, argv, MPI_THREAD_SERIALIZED, ...)}.
+@deftypefun int starpu_mpi_init (int *@var{argc}, char ***@var{argv}, int initialize_mpi)
+Initializes the starpumpi library. @code{initialize_mpi} indicates if
+MPI should be initialized or not by StarPU. If the value is not @code{0},
+MPI will be initialized by calling @code{MPI_Init_Thread(argc, argv,
+MPI_THREAD_SERIALIZED, ...)}.
 @end deftypefun
 @end deftypefun
 
 
 @deftypefun int starpu_mpi_initialize (void)
 @deftypefun int starpu_mpi_initialize (void)
 This function has been made deprecated. One should use instead the
 This function has been made deprecated. One should use instead the
 function @code{starpu_mpi_init()} defined above.
 function @code{starpu_mpi_init()} defined above.
+This function does not call @code{MPI_Init}, it should be called beforehand.
 @end deftypefun
 @end deftypefun
 
 
 @deftypefun int starpu_mpi_initialize_extended (int *@var{rank}, int *@var{world_size})
 @deftypefun int starpu_mpi_initialize_extended (int *@var{rank}, int *@var{world_size})
 This function has been made deprecated. One should use instead the
 This function has been made deprecated. One should use instead the
 function @code{starpu_mpi_init()} defined above.
 function @code{starpu_mpi_init()} defined above.
+MPI will be initialized by starpumpi by calling @code{MPI_Init_Thread(argc, argv,
+MPI_THREAD_SERIALIZED, ...)}.
 @end deftypefun
 @end deftypefun
 
 
 @deftypefun int starpu_mpi_shutdown (void)
 @deftypefun int starpu_mpi_shutdown (void)
@@ -73,6 +87,7 @@ to the world size. Communications statistics must be enabled
 (@pxref{STARPU_COMM_STATS}).
 (@pxref{STARPU_COMM_STATS}).
 @end deftypefun
 @end deftypefun
 
 
+@node Communication
 @subsection Communication
 @subsection Communication
 
 
 The standard point to point communications of MPI have been
 The standard point to point communications of MPI have been
@@ -165,6 +180,22 @@ node of the array @var{source} using the n-th message tag of the array
 On completion of the all the requests, @var{tag} is unlocked.
 On completion of the all the requests, @var{tag} is unlocked.
 @end deftypefun
 @end deftypefun
 
 
+@node Communication cache
+@subsection Communication cache
+
+@deftypefun void starpu_mpi_cache_flush (MPI_Comm @var{comm}, starpu_data_handle_t @var{data_handle})
+Clear the send and receive communication cache for the data
+@var{data_handle}. The function has to be called synchronously by all
+the MPI nodes.
+The function does nothing if the cache mechanism is disabled (@pxref{STARPU_MPI_CACHE}).
+@end deftypefun
+
+@deftypefun void starpu_mpi_cache_flush_all_data (MPI_Comm @var{comm})
+Clear the send and receive communication cache for all data. The
+function has to be called synchronously by all the MPI nodes.
+The function does nothing if the cache mechanism is disabled (@pxref{STARPU_MPI_CACHE}).
+@end deftypefun
+
 @page
 @page
 @node Simple Example
 @node Simple Example
 @section Simple Example
 @section Simple Example
@@ -561,5 +592,3 @@ for(x = 0; x < nblocks ;  x++) @{
 starpu_mpi_gather_detached(data_handles, nblocks, 0, MPI_COMM_WORLD);
 starpu_mpi_gather_detached(data_handles, nblocks, 0, MPI_COMM_WORLD);
 @end smallexample
 @end smallexample
 @end cartouche
 @end cartouche
-
-

+ 131 - 5
doc/chapters/perf-feedback.texi

@@ -7,17 +7,38 @@
 @c See the file starpu.texi for copying conditions.
 @c See the file starpu.texi for copying conditions.
 
 
 @menu
 @menu
+* Task debugger::               Using the Temanejo task debugger
 * On-line::                     On-line performance feedback
 * On-line::                     On-line performance feedback
 * Off-line::                    Off-line performance feedback
 * Off-line::                    Off-line performance feedback
 * Codelet performance::         Performance of codelets
 * Codelet performance::         Performance of codelets
-* Theoretical lower bound on execution time API::  
+* Theoretical lower bound on execution time API::
+* Memory feedback::
+* Data statistics::
 @end menu
 @end menu
 
 
+@node Task debugger
+@section Using the Temanejo task debugger
+
+StarPU can connect to Temanejo (see
+@url{http://www.hlrs.de/temanejo}), to permit
+nice visual task debugging. To do so, build Temanejo's @code{libayudame.so},
+install @code{Ayudame} to e.g. @code{/usr/local/include}, apply the
+@code{tools/patch-ayudame} to it to fix C build, re-@code{./configure}, make
+sure that it found it, rebuild StarPU.  Run the Temanejo GUI, give it the path
+to your application, any options you want to pass it, the path to libayudame.so.
+
+Make sure to specify at least the same number of CPUs in the dialog box as your
+machine has, otherwise an error will happen during execution. Future versions
+of Temanejo should be able to tell StarPU the number of CPUs to use.
+
+Tag numbers have to be below @code{4000000000000000000ULL} to be usable for
+Temanejo (so as to distinguish them from tasks).
+
 @node On-line
 @node On-line
 @section On-line performance feedback
 @section On-line performance feedback
 
 
 @menu
 @menu
-* Enabling monitoring::         Enabling on-line performance monitoring
+* Enabling on-line performance monitoring::
 * Task feedback::               Per-task feedback
 * Task feedback::               Per-task feedback
 * Codelet feedback::            Per-codelet feedback
 * Codelet feedback::            Per-codelet feedback
 * Worker feedback::             Per-worker feedback
 * Worker feedback::             Per-worker feedback
@@ -25,7 +46,7 @@
 * StarPU-Top::                  StarPU-Top interface
 * StarPU-Top::                  StarPU-Top interface
 @end menu
 @end menu
 
 
-@node Enabling monitoring
+@node Enabling on-line performance monitoring
 @subsection Enabling on-line performance monitoring
 @subsection Enabling on-line performance monitoring
 
 
 In order to enable online performance monitoring, the application can call
 In order to enable online performance monitoring, the application can call
@@ -87,7 +108,7 @@ because there is no task to execute at all (@code{sleeping_time}), and the
 number of tasks that were executed while profiling was enabled.
 number of tasks that were executed while profiling was enabled.
 These values give an estimation of the proportion of time spent doing real work,
 These values give an estimation of the proportion of time spent doing real work,
 and the time spent either sleeping because there are not enough executable
 and the time spent either sleeping because there are not enough executable
-tasks or simply wasted in pure StarPU overhead. 
+tasks or simply wasted in pure StarPU overhead.
 
 
 Calling @code{starpu_worker_get_profiling_info} resets the profiling
 Calling @code{starpu_worker_get_profiling_info} resets the profiling
 information associated to a worker.
 information associated to a worker.
@@ -98,7 +119,7 @@ generate a graphic showing the evolution of these values during the time, for
 the different workers.
 the different workers.
 
 
 @node Bus feedback
 @node Bus feedback
-@subsection Bus-related feedback 
+@subsection Bus-related feedback
 
 
 TODO: ajouter STARPU_BUS_STATS
 TODO: ajouter STARPU_BUS_STATS
 
 
@@ -433,3 +454,108 @@ Emit statistics of actual execution vs theoretical upper bound. @var{integer}
 permits to choose between integer solving (which takes a long time but is
 permits to choose between integer solving (which takes a long time but is
 correct), and relaxed solving (which provides an approximate solution).
 correct), and relaxed solving (which provides an approximate solution).
 @end deftypefun
 @end deftypefun
+
+@node Memory feedback
+@section Memory feedback
+
+It is possible to enable memory statistics. To do so, you need to pass the option
+@code{--enable-memory-stats} when running configure. It is then
+possible to call the function @code{starpu_display_memory_stats()} to
+display statistics about the current data handles registered within StarPU.
+
+Moreover, statistics will be displayed at the end of the execution on
+data handles which have not been cleared out. This can be disabled by
+setting the environment variable @code{STARPU_MEMORY_STATS} to 0.
+
+For example, if you do not unregister data at the end of the complex
+example, you will get something similar to:
+
+@example
+$ STARPU_MEMORY_STATS=0 ./examples/interface/complex
+Complex[0] = 45.00 + 12.00 i
+Complex[0] = 78.00 + 78.00 i
+Complex[0] = 45.00 + 12.00 i
+Complex[0] = 45.00 + 12.00 i
+@end example
+
+@example
+$ STARPU_MEMORY_STATS=1 ./examples/interface/complex
+Complex[0] = 45.00 + 12.00 i
+Complex[0] = 78.00 + 78.00 i
+Complex[0] = 45.00 + 12.00 i
+Complex[0] = 45.00 + 12.00 i
+
+#---------------------
+Memory stats:
+#-------
+Data on Node #3
+#-----
+Data : 0x553ff40
+Size : 16
+
+#--
+Data access stats
+/!\ Work Underway
+Node #0
+	Direct access : 4
+	Loaded (Owner) : 0
+	Loaded (Shared) : 0
+	Invalidated (was Owner) : 0
+
+Node #3
+	Direct access : 0
+	Loaded (Owner) : 0
+	Loaded (Shared) : 1
+	Invalidated (was Owner) : 0
+
+#-----
+Data : 0x5544710
+Size : 16
+
+#--
+Data access stats
+/!\ Work Underway
+Node #0
+	Direct access : 2
+	Loaded (Owner) : 0
+	Loaded (Shared) : 1
+	Invalidated (was Owner) : 1
+
+Node #3
+	Direct access : 0
+	Loaded (Owner) : 1
+	Loaded (Shared) : 0
+	Invalidated (was Owner) : 0
+@end example
+
+@node Data statistics
+@section Data statistics
+
+Different data statistics can be displayed at the end of the execution
+of the application. To enable them, you need to pass the option
+@code{--enable-stats} when calling @code{configure}. When calling
+@code{starpu_shutdown()} various statistics will be displayed,
+execution, MSI cache statistics, allocation cache statistics, and data
+transfer statistics. The display can be disabled by setting the
+environment variable @code{STARPU_STATS} to 0.
+
+@example
+$ ./examples/cholesky/cholesky_tag
+Computation took (in ms)
+518.16
+Synthetic GFlops : 44.21
+#---------------------
+MSI cache stats :
+TOTAL MSI stats	hit 1622 (66.23 %)	miss 827 (33.77 %)
+...
+@end example
+
+@example
+$ STARPU_STATS=0 ./examples/cholesky/cholesky_tag
+Computation took (in ms)
+518.16
+Synthetic GFlops : 44.21
+@end example
+
+@c TODO: data transfer stats are similar to the ones displayed when
+@c setting STARPU_BUS_STATS

+ 83 - 5
doc/chapters/perf-optimization.texi

@@ -22,6 +22,7 @@ TODO: improve!
 * Profiling::
 * Profiling::
 * CUDA-specific optimizations::
 * CUDA-specific optimizations::
 * Performance debugging::
 * Performance debugging::
+* Simulated performance::
 @end menu
 @end menu
 
 
 Simply encapsulating application kernels into tasks already permits to
 Simply encapsulating application kernels into tasks already permits to
@@ -122,10 +123,14 @@ only when another task writes some value to the handle.
 Like any other runtime, StarPU has some overhead to manage tasks. Since
 Like any other runtime, StarPU has some overhead to manage tasks. Since
 it does smart scheduling and data management, that overhead is not always
 it does smart scheduling and data management, that overhead is not always
 neglectable. The order of magnitude of the overhead is typically a couple of
 neglectable. The order of magnitude of the overhead is typically a couple of
-microseconds. The amount of work that a task should do should thus be somewhat
+microseconds, which is actually quite smaller than the CUDA overhead itself. The
+amount of work that a task should do should thus be somewhat
 bigger, to make sure that the overhead becomes negligible. The offline
 bigger, to make sure that the overhead becomes negligible. The offline
 performance feedback can provide a measure of task length, which should thus be
 performance feedback can provide a measure of task length, which should thus be
-checked if bad performance are observed.
+checked if bad performance is observed. To get a grasp at the scalability
+possibility according to task size, one can run
+@code{tests/microbenchs/tasks_size_overhead.sh} which draws curves of the
+speedup of independent tasks of very small sizes.
 
 
 @node Task submission
 @node Task submission
 @section Task submission
 @section Task submission
@@ -265,7 +270,7 @@ A graph can be drawn by using the @code{starpu_perfmodel_plot}:
 
 
 @example
 @example
 $ starpu_perfmodel_plot -s starpu_dlu_lu_model_22
 $ starpu_perfmodel_plot -s starpu_dlu_lu_model_22
-98304 393216 1572864 
+98304 393216 1572864
 $ gnuplot starpu_starpu_dlu_lu_model_22.gp
 $ gnuplot starpu_starpu_dlu_lu_model_22.gp
 $ gv starpu_starpu_dlu_lu_model_22.eps
 $ gv starpu_starpu_dlu_lu_model_22.eps
 @end example
 @end example
@@ -394,12 +399,12 @@ with these manual measurements through @code{starpu_perfmodel_update_history}.
 @node Profiling
 @node Profiling
 @section Profiling
 @section Profiling
 
 
-A quick view of how many tasks each worker has executed can be obtained by setting 
+A quick view of how many tasks each worker has executed can be obtained by setting
 @code{export STARPU_WORKER_STATS=1}. This is a convenient way to check that
 @code{export STARPU_WORKER_STATS=1}. This is a convenient way to check that
 execution did happen on accelerators without penalizing performance with
 execution did happen on accelerators without penalizing performance with
 the profiling overhead.
 the profiling overhead.
 
 
-A quick view of how much data transfers have been issued can be obtained by setting 
+A quick view of how much data transfers have been issued can be obtained by setting
 @code{export STARPU_BUS_STATS=1} .
 @code{export STARPU_BUS_STATS=1} .
 
 
 More detailed profiling information can be enabled by using @code{export STARPU_PROFILING=1} or by
 More detailed profiling information can be enabled by using @code{export STARPU_PROFILING=1} or by
@@ -457,3 +462,76 @@ detailed in the next chapter. The various informations should be checked for.
   greedy algorithm which thus performs badly.
   greedy algorithm which thus performs badly.
 @end itemize
 @end itemize
 @end itemize
 @end itemize
+
+You can also use the Temanejo task debugger (see @ref{Task debugger}) to
+visualize the task graph more easily.
+
+@node Simulated performance
+@section Simulated performance
+
+StarPU can use Simgrid in order to simulate execution on an arbitrary
+platform. The idea is to first compile StarPU normally, and run the application,
+so as to automatically benchmark the bus and the codelets.
+
+@cartouche
+@smallexample
+$ ./configure && make
+$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
+[starpu][_starpu_load_history_based_model] Warning: model matvecmult is not calibrated, forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.
+$ ...
+$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
+TEST PASSED
+@end smallexample
+@end cartouche
+
+Note that we force the use of the dmda scheduler to generate performance
+models for the application. The application may need to be run several
+times before the model is calibrated.
+
+Then, recompile StarPU, passing @code{--enable-simgrid} to @code{./configure}, and re-run the
+application, specifying the requested number of devices:
+
+@cartouche
+@smallexample
+$ ./configure --enable-simgrid && make
+$ STARPU_SCHED=dmda STARPU_NCPU=12 STARPU_NCUDA=0 STARPU_NOPENCL=1 ./examples/matvecmult/matvecmult
+TEST FAILED !!!
+@end smallexample
+@end cartouche
+
+It is normal that the test fails: since the computations are not actually
+performed (that is the whole point of simgrid), the result is wrong, of course.
+
+If the performance model is not sufficiently calibrated, the following
+error message will be displayed:
+
+@cartouche
+@smallexample
+$ STARPU_SCHED=dmda STARPU_NCPU=12 STARPU_NCUDA=0 STARPU_NOPENCL=1 ./examples/matvecmult/matvecmult
+[0.000000] [xbt_cfg/INFO] type in variable = 2
+[0.000000] [surf_workstation/INFO] surf_workstation_model_init_ptask_L07
+[starpu][_starpu_load_history_based_model] Warning: model matvecmult is not calibrated, forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.
+[starpu][_starpu_simgrid_execute_job][assert failure] Codelet matvecmult does not have a perfmodel, or is not calibrated enough
+$
+@end smallexample
+@end cartouche
+
+For now, only the number of CPUs can be arbitrarily chosen. The number of CUDA
+and OpenCL devices has to be lower than the real number on the current machine.
+
+The Simgrid default stack size is small; to increase it, use the
+parameter @code{--cfg=contexts/stack_size}, for example:
+
+@cartouche
+@smallexample
+$ STARPU_NCPU=12 STARPU_NCUDA=2 STARPU_NOPENCL=0 ./example --cfg=contexts/stack_size:8192
+[0.000000] [xbt_cfg/INFO] type in variable = 2
+[0.000000] [surf_workstation/INFO] surf_workstation_model_init_ptask_L07
+TEST FAILED !!!
+@end smallexample
+@end cartouche
+
+Note: of course, if the application uses @code{gettimeofday} to make its
+performance measurements, the real time will be used, which will be bogus. To
+get the simulated time, it has to use @code{starpu_timing_now} which returns the
+virtual timestamp in microseconds.

+ 6 - 7
doc/chapters/scaling-vector-example.texi

@@ -7,10 +7,10 @@
 @c See the file starpu.texi for copying conditions.
 @c See the file starpu.texi for copying conditions.
 
 
 @menu
 @menu
-* Main application::            
-* CPU Kernel::                 
-* CUDA Kernel::                
-* OpenCL Kernel::              
+* Main application::
+* CPU Kernel::
+* CUDA Kernel::
+* OpenCL Kernel::
 @end menu
 @end menu
 
 
 @node Main application
 @node Main application
@@ -32,8 +32,8 @@
 @section OpenCL Kernel
 @section OpenCL Kernel
 
 
 @menu
 @menu
-* Invoking the kernel::         
-* Source of the kernel::        
+* Invoking the kernel::
+* Source of the kernel::
 @end menu
 @end menu
 
 
 @node Invoking the kernel
 @node Invoking the kernel
@@ -45,4 +45,3 @@
 @subsection Source of the kernel
 @subsection Source of the kernel
 
 
 @include chapters/vector_scal_opencl_codelet.texi
 @include chapters/vector_scal_opencl_codelet.texi
-

+ 2 - 3
doc/chapters/using.texi

@@ -7,8 +7,8 @@
 @c See the file starpu.texi for copying conditions.
 @c See the file starpu.texi for copying conditions.
 
 
 @menu
 @menu
-* Setting flags for compiling and linking applications::  
-* Running a basic StarPU application::  
+* Setting flags for compiling and linking applications::
+* Running a basic StarPU application::
 * Kernel threads started by StarPU::
 * Kernel threads started by StarPU::
 * Enabling OpenCL::
 * Enabling OpenCL::
 @end menu
 @end menu
@@ -111,4 +111,3 @@ so:
 @example
 @example
 % STARPU_NCUDA=2 ./application
 % STARPU_NCUDA=2 ./application
 @end example
 @end example
-

+ 1 - 1
doc/chapters/vector_scal_cpu.texi

@@ -51,7 +51,7 @@ void scal_sse_func(void *buffers[], void *cl_arg)
     float factor = *(float *) cl_arg;
     float factor = *(float *) cl_arg;
     FACTOR = _mm_set1_ps(factor);
     FACTOR = _mm_set1_ps(factor);
 
 
-    unsigned int i;	
+    unsigned int i;
     for (i = 0; i < n_iterations; i++)
     for (i = 0; i < n_iterations; i++)
         VECTOR[i] = _mm_mul_ps(FACTOR, VECTOR[i]);
         VECTOR[i] = _mm_mul_ps(FACTOR, VECTOR[i]);
 
 

+ 7 - 2
doc/starpu.texi

@@ -82,12 +82,13 @@ was last updated on @value{UPDATED}.
 * StarPU Basic API::            	The Basic API to use StarPU
 * StarPU Basic API::            	The Basic API to use StarPU
 * StarPU Advanced API::         	Advanced use of StarPU
 * StarPU Advanced API::         	Advanced use of StarPU
 * Configuring StarPU::          	How to configure StarPU
 * Configuring StarPU::          	How to configure StarPU
-* Full source code for the 'Scaling a Vector' example::  
+* Full source code for the 'Scaling a Vector' example::
 * GNU Free Documentation License::  How you can copy and share this manual.
 * GNU Free Documentation License::  How you can copy and share this manual.
 
 
 * Concept Index::               Index of programming concepts.
 * Concept Index::               Index of programming concepts.
 * Function Index::              Index of C functions.
 * Function Index::              Index of C functions.
-* Datatype Index::              Index of C datatypes
+* Datatype Index::              Index of C datatypes.
+* Configuration Index::         Index of configuration options.
 @end menu
 @end menu
 
 
 @c ---------------------------------------------------------------------
 @c ---------------------------------------------------------------------
@@ -264,4 +265,8 @@ was last updated on @value{UPDATED}.
 @unnumbered Datatype Index
 @unnumbered Datatype Index
 @printindex tp
 @printindex tp
 
 
+@node Configuration Index
+@unnumbered Configuration Index
+@printindex vr
+
 @bye
 @bye

+ 2 - 3
examples/basic_examples/vector_scal.c

@@ -23,7 +23,6 @@
  *  3- how a kernel can manipulate the data (buffers[0].vector.ptr)
  *  3- how a kernel can manipulate the data (buffers[0].vector.ptr)
  */
  */
 
 
-#include <config.h>
 #include <starpu.h>
 #include <starpu.h>
 #include <stdlib.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <stdio.h>
@@ -42,13 +41,13 @@ extern void scal_opencl_func(void *buffers[], void *_args);
 static struct starpu_perfmodel vector_scal_model =
 static struct starpu_perfmodel vector_scal_model =
 {
 {
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
-	.symbol = "vector_scale"
+	.symbol = "vector_scal"
 };
 };
 
 
 static struct starpu_perfmodel vector_scal_power_model =
 static struct starpu_perfmodel vector_scal_power_model =
 {
 {
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
-	.symbol = "vector_scale_power"
+	.symbol = "vector_scal_power"
 };
 };
 
 
 static struct starpu_codelet cl =
 static struct starpu_codelet cl =

+ 1 - 1
examples/basic_examples/vector_scal_c.c

@@ -35,7 +35,7 @@ extern void scal_cuda_func(void *buffers[], void *_args);
 static struct starpu_perfmodel vector_scal_model =
 static struct starpu_perfmodel vector_scal_model =
 {
 {
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
-	.symbol = "vector_scale_model"
+	.symbol = "vector_scal_model"
 };
 };
 
 
 static struct starpu_codelet cl =
 static struct starpu_codelet cl =

+ 1 - 1
examples/basic_examples/vector_scal_cpu.c

@@ -15,7 +15,7 @@
  */
  */
 
 
 /*
 /*
- * This example complements vector_scale.c: here we implement a CPU version.
+ * This example complements vector_scal.c: here we implement a CPU version.
  */
  */
 
 
 #include "vector_scal_cpu_template.h"
 #include "vector_scal_cpu_template.h"

+ 1 - 1
examples/basic_examples/vector_scal_cpu_icc.icc

@@ -15,7 +15,7 @@
  */
  */
 
 
 /*
 /*
- * This example complements vector_scale.c: here we implement a CPU version,
+ * This example complements vector_scal.c: here we implement a CPU version,
  * meant to be compiled by icc.
  * meant to be compiled by icc.
  */
  */
 
 

+ 1 - 1
examples/basic_examples/vector_scal_cpu_template.h

@@ -15,7 +15,7 @@
  */
  */
 
 
 /*
 /*
- * This example complements vector_scale.c: here we implement a CPU version.
+ * This example complements vector_scal.c: here we implement a CPU version.
  */
  */
 
 
 #ifndef __VECTOR_SCAL_CPU_TEMPLATE_H__
 #ifndef __VECTOR_SCAL_CPU_TEMPLATE_H__

+ 1 - 1
examples/basic_examples/vector_scal_cuda.cu

@@ -16,7 +16,7 @@
  */
  */
 
 
 /*
 /*
- * This example complements vector_scale.c: here we implement a CUDA version.
+ * This example complements vector_scal.c: here we implement a CUDA version.
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>

+ 1 - 1
examples/basic_examples/vector_scal_opencl.c

@@ -17,7 +17,7 @@
  */
  */
 
 
 /*
 /*
- * This example complements vector_scale.c: here we implement a OpenCL version.
+ * This example complements vector_scal.c: here we implement a OpenCL version.
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>

+ 16 - 4
examples/cholesky/cholesky.h

@@ -58,10 +58,12 @@
 static unsigned size = 4*1024;
 static unsigned size = 4*1024;
 static unsigned nblocks = 16;
 static unsigned nblocks = 16;
 static unsigned nbigblocks = 8;
 static unsigned nbigblocks = 8;
-static unsigned pinned = 0;
+static unsigned pinned = 1;
 static unsigned noprio = 0;
 static unsigned noprio = 0;
 static unsigned check = 0;
 static unsigned check = 0;
 static unsigned bound = 0;
 static unsigned bound = 0;
+static unsigned bound_deps = 0;
+static unsigned bound_lp = 0;
 static unsigned with_ctxs = 0;
 static unsigned with_ctxs = 0;
 static unsigned with_noctxs = 0;
 static unsigned with_noctxs = 0;
 static unsigned chole1 = 0;
 static unsigned chole1 = 0;
@@ -127,9 +129,9 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 			nbigblocks = strtol(argv[++i], &argptr, 10);
 			nbigblocks = strtol(argv[++i], &argptr, 10);
 		}
 		}
 
 
-		if (strcmp(argv[i], "-pin") == 0)
+		if (strcmp(argv[i], "-no-pin") == 0)
 		{
 		{
-			pinned = 1;
+			pinned = 0;
 		}
 		}
 
 
 		if (strcmp(argv[i], "-no-prio") == 0)
 		if (strcmp(argv[i], "-no-prio") == 0)
@@ -142,6 +144,16 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 			bound = 1;
 			bound = 1;
 		}
 		}
 
 
+		if (strcmp(argv[i], "-bound-lp") == 0)
+		{
+			bound_lp = 1;
+		}
+
+		if (strcmp(argv[i], "-bound-deps") == 0)
+		{
+			bound_deps = 1;
+		}
+
 		if (strcmp(argv[i], "-check") == 0)
 		if (strcmp(argv[i], "-check") == 0)
 		{
 		{
 			check = 1;
 			check = 1;
@@ -149,7 +161,7 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 
 
 		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i],"--help") == 0)
 		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i],"--help") == 0)
 		{
 		{
-			fprintf(stderr,"usage : %s [-pin] [-size size] [-nblocks nblocks] [-check]\n", argv[0]);
+			fprintf(stderr,"usage : %s [-size size] [-nblocks nblocks] [-no-pin] [-no-prio] [-bound] [-bound-deps] [-bound-lp] [-check]\n", argv[0]);
 			fprintf(stderr,"Currently selected: %ux%u and %ux%u blocks\n", size, size, nblocks, nblocks);
 			fprintf(stderr,"Currently selected: %ux%u and %ux%u blocks\n", size, size, nblocks, nblocks);
 		}
 		}
 	}
 	}

+ 10 - 6
examples/cholesky/cholesky_grain_tag.c

@@ -288,6 +288,7 @@ static void initialize_system(float **A, unsigned dim, unsigned pinned)
 
 
 	starpu_helper_cublas_init();
 	starpu_helper_cublas_init();
 
 
+#ifndef STARPU_SIMGRID
 	if (pinned)
 	if (pinned)
 	{
 	{
 		starpu_malloc((void **)A, dim*dim*sizeof(float));
 		starpu_malloc((void **)A, dim*dim*sizeof(float));
@@ -296,21 +297,22 @@ static void initialize_system(float **A, unsigned dim, unsigned pinned)
 	{
 	{
 		*A = malloc(dim*dim*sizeof(float));
 		*A = malloc(dim*dim*sizeof(float));
 	}
 	}
+#endif
 }
 }
 
 
 int cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks, unsigned pinned)
 int cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks, unsigned pinned)
 {
 {
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 	int ret;
 	int ret;
 
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 
 	ret = cholesky_grain_rec(matA, size, ld, nblocks, nbigblocks, 0);
 	ret = cholesky_grain_rec(matA, size, ld, nblocks, nbigblocks, 0);
 
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 
 
@@ -345,9 +347,10 @@ int main(int argc, char **argv)
 
 
 	parse_args(argc, argv);
 	parse_args(argc, argv);
 
 
-	float *mat;
+	float *mat = NULL;
 	initialize_system(&mat, size, pinned);
 	initialize_system(&mat, size, pinned);
 
 
+#ifndef STARPU_SIMGRID
 	unsigned i,j;
 	unsigned i,j;
 	for (i = 0; i < size; i++)
 	for (i = 0; i < size; i++)
 	{
 	{
@@ -357,6 +360,7 @@ int main(int argc, char **argv)
 			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
 			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
 		}
 		}
 	}
 	}
+#endif
 
 
 
 
 #ifdef CHECK_OUTPUT
 #ifdef CHECK_OUTPUT

+ 18 - 10
examples/cholesky/cholesky_implicit.c

@@ -75,17 +75,17 @@ static void callback_turn_spmd_on(void *arg __attribute__ ((unused)))
 static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 {
 {
 	int ret;
 	int ret;
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 
 	unsigned i,j,k;
 	unsigned i,j,k;
 
 
 	int prio_level = noprio?STARPU_DEFAULT_PRIO:STARPU_MAX_PRIO;
 	int prio_level = noprio?STARPU_DEFAULT_PRIO:STARPU_MAX_PRIO;
 
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 
 	if (bound)
 	if (bound)
-		starpu_bound_start(0, 0);
+		starpu_bound_start(bound_deps, 0);
 	/* create all the DAG nodes */
 	/* create all the DAG nodes */
 	for (k = 0; k < nblocks; k++)
 	for (k = 0; k < nblocks; k++)
 	{
 	{
@@ -135,10 +135,10 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 	if (bound)
 	if (bound)
 		starpu_bound_stop();
 		starpu_bound_stop();
 
 
-	gettimeofday(&end, NULL);
-
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	end = starpu_timing_now();
 
 
+	//double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	unsigned long n = starpu_matrix_get_nx(dataA);
 	unsigned long n = starpu_matrix_get_nx(dataA);
 
 
 	double flop = (1.0f*n*n*n)/3.0f;
 	double flop = (1.0f*n*n*n)/3.0f;
@@ -151,6 +151,11 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 		FPRINTF(stdout, "%2.2f\n", timing/1000);
 		FPRINTF(stdout, "%2.2f\n", timing/1000);
 	
 	
 		FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
 		FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+		if (bound_lp)
+		{
+			FILE *f = fopen("cholesky.lp", "w");
+			starpu_bound_print_lp(f);
+		}
 		if (bound)
 		if (bound)
 		{
 		{
 			double res;
 			double res;
@@ -194,10 +199,11 @@ static int cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 static void execute_cholesky(unsigned size, unsigned nblocks)
 static void execute_cholesky(unsigned size, unsigned nblocks)
 {
 {
 	int ret;
 	int ret;
-	float *mat;
-	starpu_malloc((void **)&mat, (size_t)size*size*sizeof(float));
-
+	float *mat = NULL;
 	unsigned i,j;
 	unsigned i,j;
+
+#ifndef STARPU_SIMGRID
+	starpu_malloc((void **)&mat, (size_t)size*size*sizeof(float));
 	for (i = 0; i < size; i++)
 	for (i = 0; i < size; i++)
 	{
 	{
 		for (j = 0; j < size; j++)
 		for (j = 0; j < size; j++)
@@ -206,6 +212,7 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
 			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
 		}
 		}
 	}
 	}
+#endif
 
 
 /* #define PRINT_OUTPUT */
 /* #define PRINT_OUTPUT */
 #ifdef PRINT_OUTPUT
 #ifdef PRINT_OUTPUT
@@ -345,6 +352,7 @@ int main(int argc, char **argv)
 		execute_cholesky(size, nblocks);
 		execute_cholesky(size, nblocks);
 
 
 	starpu_helper_cublas_shutdown();
 	starpu_helper_cublas_shutdown();
+	starpu_free(mat);
 	starpu_shutdown();
 	starpu_shutdown();
 
 
 	return ret;
 	return ret;

+ 10 - 6
examples/cholesky/cholesky_tag.c

@@ -175,15 +175,15 @@ static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, u
 
 
 static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 {
 {
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 
 	struct starpu_task *entry_task = NULL;
 	struct starpu_task *entry_task = NULL;
 
 
 	/* create all the DAG nodes */
 	/* create all the DAG nodes */
 	unsigned i,j,k;
 	unsigned i,j,k;
 
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 
 	for (k = 0; k < nblocks; k++)
 	for (k = 0; k < nblocks; k++)
 	{
 	{
@@ -230,10 +230,10 @@ static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
 
 	starpu_data_unpartition(dataA, 0);
 	starpu_data_unpartition(dataA, 0);
 
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
 
 
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 
 
@@ -254,6 +254,7 @@ static int initialize_system(float **A, unsigned dim, unsigned pinned)
 
 
 	starpu_helper_cublas_init();
 	starpu_helper_cublas_init();
 
 
+#ifndef STARPU_SIMGRID
 	if (pinned)
 	if (pinned)
 	{
 	{
 		starpu_malloc((void **)A, (size_t)dim*dim*sizeof(float));
 		starpu_malloc((void **)A, (size_t)dim*dim*sizeof(float));
@@ -262,6 +263,7 @@ static int initialize_system(float **A, unsigned dim, unsigned pinned)
 	{
 	{
 		*A = malloc(dim*dim*sizeof(float));
 		*A = malloc(dim*dim*sizeof(float));
 	}
 	}
+#endif
 	return 0;
 	return 0;
 }
 }
 
 
@@ -318,10 +320,11 @@ int main(int argc, char **argv)
 
 
 	parse_args(argc, argv);
 	parse_args(argc, argv);
 
 
-	float *mat;
+	float *mat = NULL;
 	int ret = initialize_system(&mat, size, pinned);
 	int ret = initialize_system(&mat, size, pinned);
 	if (ret) return ret;
 	if (ret) return ret;
 
 
+#ifndef STARPU_SIMGRID
 	unsigned i,j;
 	unsigned i,j;
 	for (i = 0; i < size; i++)
 	for (i = 0; i < size; i++)
 	{
 	{
@@ -331,6 +334,7 @@ int main(int argc, char **argv)
 			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
 			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
 		}
 		}
 	}
 	}
+#endif
 
 
 
 
 #ifdef CHECK_OUTPUT
 #ifdef CHECK_OUTPUT

+ 8 - 6
examples/cholesky/cholesky_tile_tag.c

@@ -195,8 +195,8 @@ static int cholesky_no_stride(void)
 {
 {
 	int ret;
 	int ret;
 
 
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 
 	struct starpu_task *entry_task = NULL;
 	struct starpu_task *entry_task = NULL;
 
 
@@ -234,7 +234,7 @@ static int cholesky_no_stride(void)
 	}
 	}
 
 
 	/* schedule the codelet */
 	/* schedule the codelet */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 	ret = starpu_task_submit(entry_task);
 	ret = starpu_task_submit(entry_task);
 	if (ret == -ENODEV) return 77;
 	if (ret == -ENODEV) return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
@@ -242,9 +242,9 @@ static int cholesky_no_stride(void)
 	/* stall the application until the end of computations */
 	/* stall the application until the end of computations */
 	starpu_tag_wait(TAG11(nblocks-1));
 	starpu_tag_wait(TAG11(nblocks-1));
 
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 
 
@@ -257,7 +257,6 @@ static int cholesky_no_stride(void)
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
 	unsigned x, y;
 	unsigned x, y;
-	unsigned i, j;
 	int ret;
 	int ret;
 
 
 	parse_args(argc, argv);
 	parse_args(argc, argv);
@@ -275,6 +274,7 @@ int main(int argc, char **argv)
 
 
 	starpu_helper_cublas_init();
 	starpu_helper_cublas_init();
 
 
+#ifndef STARPU_SIMGRID
 	for (y = 0; y < nblocks; y++)
 	for (y = 0; y < nblocks; y++)
 	for (x = 0; x < nblocks; x++)
 	for (x = 0; x < nblocks; x++)
 	{
 	{
@@ -297,6 +297,7 @@ int main(int argc, char **argv)
 	for (x = 0; x < nblocks; x++)
 	for (x = 0; x < nblocks; x++)
 	if (x <= y)
 	if (x <= y)
 	{
 	{
+		unsigned i, j;
 		for (i = 0; i < BLOCKSIZE; i++)
 		for (i = 0; i < BLOCKSIZE; i++)
 		for (j = 0; j < BLOCKSIZE; j++)
 		for (j = 0; j < BLOCKSIZE; j++)
 		{
 		{
@@ -308,6 +309,7 @@ int main(int argc, char **argv)
 				A[y][x][i*BLOCKSIZE + j] += (float)(2*size);
 				A[y][x][i*BLOCKSIZE + j] += (float)(2*size);
 		}
 		}
 	}
 	}
+#endif
 
 
 	for (y = 0; y < nblocks; y++)
 	for (y = 0; y < nblocks; y++)
 	for (x = 0; x < nblocks; x++)
 	for (x = 0; x < nblocks; x++)

+ 37 - 107
examples/filters/custom_mf/custom_interface.c

@@ -157,125 +157,55 @@ static ssize_t allocate_custom_buffer_on_node(void *data_interface, uint32_t nod
 	struct custom_data_interface *custom_interface;
 	struct custom_data_interface *custom_interface;
 	custom_interface = (struct custom_data_interface *) data_interface;
 	custom_interface = (struct custom_data_interface *) data_interface;
 
 
-	switch(starpu_node_get_kind(node))
-	{
-	case STARPU_CPU_RAM:
-		size = custom_interface->nx * custom_interface->ops->cpu_elemsize;
-		custom_interface->cpu_ptr = (void*) malloc(size);
-		if (!custom_interface->cpu_ptr)
-			return -ENOMEM;
+	size = custom_interface->nx * custom_interface->ops->cpu_elemsize;
+	custom_interface->cpu_ptr = (void*) starpu_allocate_buffer_on_node(node, size);
+	if (!custom_interface->cpu_ptr)
+		goto fail_cpu;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-		custom_interface->cuda_ptr = (void *) malloc(size);
-		if (!custom_interface->cuda_ptr)
-		{
-			free(custom_interface->cpu_ptr);
-			custom_interface->cpu_ptr = NULL;
-			return -ENOMEM;
-		}
-#endif /* !STARPU_USE_CUDA */
+	custom_interface->cuda_ptr = (void*) starpu_allocate_buffer_on_node(node, size);
+	if (!custom_interface->cuda_ptr)
+		goto fail_cuda;
+#endif
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-		custom_interface->opencl_ptr = malloc(size);
-		if (custom_interface->cuda_ptr == NULL)
-		{
-			free(custom_interface->cpu_ptr);
-#ifdef STARPU_USE_CUDA
-			free(custom_interface->cuda_ptr);
-#endif /* !STARPU_USE_CUDA */
-			return -ENOMEM;
-		}
-#endif /* !STARPU_USE_OPENCL */
-			
-		break;
+	custom_interface->opencl_ptr = (void*) starpu_allocate_buffer_on_node(node, size);
+	if (!custom_interface->opencl_ptr)
+		goto fail_opencl;
+#endif
+
+	return size
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-	case STARPU_CUDA_RAM:
-	{
-		cudaError_t err;
-		size = custom_interface->nx * custom_interface->ops->cpu_elemsize;
-		err = cudaMalloc(&custom_interface->cuda_ptr, size);
-		if (err != cudaSuccess)
-			return -ENOMEM;
-
-		err = cudaMalloc(&custom_interface->cpu_ptr, size);
-		if (err != cudaSuccess)
-		{
-			cudaFree(custom_interface->cuda_ptr);
-			return -ENOMEM;
-		}
-		break;
-	}
+		+size
 #endif
 #endif
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-	case STARPU_OPENCL_RAM:
-	{
-		cl_int err;
-		cl_mem memory;
-		ssize_t size = custom_interface->nx * custom_interface->ops->cpu_elemsize;
-		err = starpu_opencl_allocate_memory(&memory, size, CL_MEM_READ_WRITE);
-		if (err != CL_SUCCESS)
-			STARPU_OPENCL_REPORT_ERROR(err);
-
-		custom_interface->opencl_ptr = memory;
-
-		break;
-	}
-#endif /* !STARPU_USE_OPENCL */
-	default:
-		assert(0);
-	}
-
-	/* XXX We may want to return cpu_size + cuda_size + ... */
-	return size;
+		+size
+#endif
+		;
+#ifdef STARPU_USE_OPENCL
+fail_opencl:
+#ifdef STARPU_USE_CUDA
+	starpu_free_buffer_on_node(node, (uintptr_t) custom_interface->cuda_ptr, size);
+#endif
+#endif
+#ifdef STARPU_USE_CUDA
+fail_cuda:
+#endif
+	starpu_free_buffer_on_node(node, (uintptr_t) custom_interface->cpu_ptr, size);
+fail_cpu:
+	return -ENOMEM;
 }
 }
 
 
 static void free_custom_buffer_on_node(void *data_interface, uint32_t node)
 static void free_custom_buffer_on_node(void *data_interface, uint32_t node)
 {
 {
-	struct custom_data_interface *custom_interface;
-	custom_interface = (struct custom_data_interface *) data_interface;
+	struct custom_data_interface *custom_interface = (struct custom_data_interface *) data_interface;
+	size_t size = custom_interface->nx * custom_interface->ops->cpu_elemsize;
 
 
-	switch(starpu_node_get_kind(node))
-	{
-	case STARPU_CPU_RAM:
-		if (custom_interface->cpu_ptr != NULL)
-		{
-			free(custom_interface->cpu_ptr);
-			custom_interface->cpu_ptr = NULL;
-		}
+	starpu_free_buffer_on_node(node, (uintptr_t) custom_interface->cpu_ptr, size);
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-		if (custom_interface->cuda_ptr != NULL)
-		{
-			free(custom_interface->cuda_ptr);
-			custom_interface->cuda_ptr = NULL;
-		}
-#endif /* !STARPU_USE_CUDA */
+	starpu_free_buffer_on_node(node, (uintptr_t) custom_interface->cuda_ptr, size);
+#endif
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-		if (custom_interface->opencl_ptr != NULL)
-		{
-			free(custom_interface->opencl_ptr);
-			custom_interface->opencl_ptr = NULL;
-		}
-#endif /* !STARPU_USE_OPENCL */
-		break;
-#ifdef STARPU_USE_CUDA
-	case STARPU_CUDA_RAM:
-		if (custom_interface->cpu_ptr != NULL)
-		{
-			cudaError_t err;
-			err = cudaFree(custom_interface->cpu_ptr);
-			if (err != cudaSuccess)
-				fprintf(stderr, "cudaFree failed...\n");
-		}
-		if (custom_interface->cuda_ptr != NULL)
-		{
-			cudaError_t err;
-			err = cudaFree(custom_interface->cuda_ptr);
-			if (err != cudaSuccess)
-				fprintf(stderr, "cudaFree failed...\n");
-		}
-		break;
-#endif /* !STARPU_USE_CUDA */
-	default:
-		assert(0);
-	}
+	starpu_free_buffer_on_node(node, (uintptr_t) custom_interface->opencl_ptr, size);
+#endif
 }
 }
 
 
 static void*
 static void*

+ 34 - 20
examples/interface/complex.c

@@ -18,6 +18,8 @@
 #include "complex_interface.h"
 #include "complex_interface.h"
 #include "complex_codelet.h"
 #include "complex_codelet.h"
 
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
 static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
 {
 {
        if (starpu_worker_get_type(workerid) == STARPU_OPENCL_WORKER)
        if (starpu_worker_get_type(workerid) == STARPU_OPENCL_WORKER)
@@ -37,6 +39,8 @@ static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nim
                /* Old card does not support doubles */
                /* Old card does not support doubles */
                return 0;
                return 0;
        }
        }
+#else
+       return 1;
 #endif
 #endif
 }
 }
 
 
@@ -57,7 +61,8 @@ struct starpu_codelet cl_copy =
 #endif
 #endif
 	.nbuffers = 2,
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_W},
 	.modes = {STARPU_R, STARPU_W},
-	.can_execute = can_execute
+	.can_execute = can_execute,
+	.name = "cl_copy"
 };
 };
 
 
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
@@ -75,6 +80,9 @@ int main(int argc, char **argv)
 	double copy_real = 78.0;
 	double copy_real = 78.0;
 	double copy_imaginary = 78.0;
 	double copy_imaginary = 78.0;
 
 
+	int compare;
+	int *compare_ptr = &compare;
+
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV) return 77;
 	if (ret == -ENODEV) return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
@@ -88,60 +96,66 @@ int main(int argc, char **argv)
 	starpu_complex_data_register(&handle2, 0, &copy_real, &copy_imaginary, 1);
 	starpu_complex_data_register(&handle2, 0, &copy_real, &copy_imaginary, 1);
 
 
 	ret = starpu_insert_task(&cl_display, STARPU_R, handle1, 0);
 	ret = starpu_insert_task(&cl_display, STARPU_R, handle1, 0);
-	if (ret == -ENODEV) goto enodev;
+	if (ret == -ENODEV) goto end;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 
 
 	ret = starpu_insert_task(&cl_display, STARPU_R, handle2, 0);
 	ret = starpu_insert_task(&cl_display, STARPU_R, handle2, 0);
-	if (ret == -ENODEV) goto enodev;
+	if (ret == -ENODEV) goto end;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 
 
 	ret = starpu_insert_task(&cl_compare,
 	ret = starpu_insert_task(&cl_compare,
 				 STARPU_R, handle1,
 				 STARPU_R, handle1,
 				 STARPU_R, handle2,
 				 STARPU_R, handle2,
+				 STARPU_VALUE, &compare_ptr, sizeof(compare_ptr),
 				 0);
 				 0);
-	if (ret == -ENODEV) goto enodev;
+	if (ret == -ENODEV) goto end;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+	starpu_task_wait_for_all();
+	if (compare != 0)
+	{
+	     FPRINTF(stderr, "Complex numbers should NOT be similar\n");
+	     goto end;
+	}
 
 
 	ret = starpu_insert_task(&cl_copy,
 	ret = starpu_insert_task(&cl_copy,
 				 STARPU_R, handle1,
 				 STARPU_R, handle1,
 				 STARPU_W, handle2,
 				 STARPU_W, handle2,
 				 0);
 				 0);
-	if (ret == -ENODEV) goto enodev;
+	if (ret == -ENODEV) goto end;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 
 
 	ret = starpu_insert_task(&cl_display, STARPU_R, handle1, 0);
 	ret = starpu_insert_task(&cl_display, STARPU_R, handle1, 0);
-	if (ret == -ENODEV) goto enodev;
+	if (ret == -ENODEV) goto end;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 
 
 	ret = starpu_insert_task(&cl_display, STARPU_R, handle2, 0);
 	ret = starpu_insert_task(&cl_display, STARPU_R, handle2, 0);
-	if (ret == -ENODEV) goto enodev;
+	if (ret == -ENODEV) goto end;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 
 
 	ret = starpu_insert_task(&cl_compare,
 	ret = starpu_insert_task(&cl_compare,
 				 STARPU_R, handle1,
 				 STARPU_R, handle1,
 				 STARPU_R, handle2,
 				 STARPU_R, handle2,
+				 STARPU_VALUE, &compare_ptr, sizeof(compare_ptr),
 				 0);
 				 0);
-	if (ret == -ENODEV) goto enodev;
+	if (ret == -ENODEV) goto end;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 
 
-#warning get the comparison result and return it as the application return code
-
 	starpu_task_wait_for_all();
 	starpu_task_wait_for_all();
 
 
-#ifdef STARPU_USE_OPENCL
-        ret = starpu_opencl_unload_opencl(&opencl_program);
-        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
-#endif
-	starpu_shutdown();
-	return 0;
+	if (compare != 1)
+	{
+	     FPRINTF(stderr, "Complex numbers should be similar\n");
+	}
 
 
-enodev:
+end:
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-        ret = starpu_opencl_unload_opencl(&opencl_program);
-        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
+	{
+	     int ret2 = starpu_opencl_unload_opencl(&opencl_program);
+	     STARPU_CHECK_RETURN_VALUE(ret2, "starpu_opencl_unload_opencl");
+	}
 #endif
 #endif
 	starpu_data_unregister(handle1);
 	starpu_data_unregister(handle1);
 	starpu_data_unregister(handle2);
 	starpu_data_unregister(handle2);
 	starpu_shutdown();
 	starpu_shutdown();
-	return 77;
+	if (ret == -ENODEV) return 77; else return !compare;
 }
 }

+ 10 - 6
examples/interface/complex_codelet.h

@@ -20,7 +20,7 @@
 #ifndef __COMPLEX_CODELET_H
 #ifndef __COMPLEX_CODELET_H
 #define __COMPLEX_CODELET_H
 #define __COMPLEX_CODELET_H
 
 
-void compare_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+void compare_complex_codelet(void *descr[], void *_args)
 {
 {
 	int nx1 = STARPU_COMPLEX_GET_NX(descr[0]);
 	int nx1 = STARPU_COMPLEX_GET_NX(descr[0]);
 	double *real1 = STARPU_COMPLEX_GET_REAL(descr[0]);
 	double *real1 = STARPU_COMPLEX_GET_REAL(descr[0]);
@@ -30,7 +30,10 @@ void compare_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args
 	double *real2 = STARPU_COMPLEX_GET_REAL(descr[1]);
 	double *real2 = STARPU_COMPLEX_GET_REAL(descr[1]);
 	double *imaginary2 = STARPU_COMPLEX_GET_IMAGINARY(descr[1]);
 	double *imaginary2 = STARPU_COMPLEX_GET_IMAGINARY(descr[1]);
 
 
-	int compare = (nx1 == nx2);
+	int *compare;
+
+	starpu_codelet_unpack_args(_args, &compare);
+	*compare = (nx1 == nx2);
 	if (nx1 == nx2)
 	if (nx1 == nx2)
 	{
 	{
 		int i;
 		int i;
@@ -38,19 +41,19 @@ void compare_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args
 		{
 		{
 			if (real1[i] != real2[i] || imaginary1[i] != imaginary2[i])
 			if (real1[i] != real2[i] || imaginary1[i] != imaginary2[i])
 			{
 			{
-				compare = 0;
+				*compare = 0;
 				break;
 				break;
 			}
 			}
 		}
 		}
 	}
 	}
-	fprintf(stderr, "Complex numbers are%s similar\n", compare==0 ? " NOT" : "");
 }
 }
 
 
 struct starpu_codelet cl_compare =
 struct starpu_codelet cl_compare =
 {
 {
 	.cpu_funcs = {compare_complex_codelet, NULL},
 	.cpu_funcs = {compare_complex_codelet, NULL},
 	.nbuffers = 2,
 	.nbuffers = 2,
-	.modes = {STARPU_R, STARPU_R}
+	.modes = {STARPU_R, STARPU_R},
+	.name = "cl_compare"
 };
 };
 
 
 void display_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 void display_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args)
@@ -70,7 +73,8 @@ struct starpu_codelet cl_display =
 {
 {
 	.cpu_funcs = {display_complex_codelet, NULL},
 	.cpu_funcs = {display_complex_codelet, NULL},
 	.nbuffers = 1,
 	.nbuffers = 1,
-	.modes = {STARPU_R}
+	.modes = {STARPU_R},
+	.name = "cl_display"
 };
 };
 
 
 #endif /* __COMPLEX_CODELET_H */
 #endif /* __COMPLEX_CODELET_H */

+ 21 - 73
examples/interface/complex_interface.c

@@ -62,89 +62,36 @@ static starpu_ssize_t complex_allocate_data_on_node(void *data_interface, uint32
 {
 {
 	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) data_interface;
 	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) data_interface;
 
 
-	unsigned fail = 0;
 	double *addr_real = 0;
 	double *addr_real = 0;
 	double *addr_imaginary = 0;
 	double *addr_imaginary = 0;
 	ssize_t requested_memory = complex_interface->nx * sizeof(complex_interface->real[0]);
 	ssize_t requested_memory = complex_interface->nx * sizeof(complex_interface->real[0]);
 
 
-	enum starpu_node_kind kind = starpu_node_get_kind(node);
-
-	switch(kind)
-	{
-		case STARPU_CPU_RAM:
-			addr_real = malloc(requested_memory);
-			addr_imaginary = malloc(requested_memory);
-			if (!addr_real || !addr_imaginary)
-				fail = 1;
-			break;
-#ifdef STARPU_USE_CUDA
-		case STARPU_CUDA_RAM:
-		{
-			cudaError_t status;
-			status = cudaMalloc((void **)&addr_real, requested_memory);
-			if (!addr_real || (status != cudaSuccess))
-			{
-				if (STARPU_UNLIKELY(status != cudaErrorMemoryAllocation))
-					STARPU_CUDA_REPORT_ERROR(status);
-
-				fail = 1;
-			}
-			else
-			{
-				status = cudaMalloc((void **)&addr_imaginary, requested_memory);
-				if (!addr_imaginary || (status != cudaSuccess))
-				{
-					if (STARPU_UNLIKELY(status != cudaErrorMemoryAllocation))
-						STARPU_CUDA_REPORT_ERROR(status);
-
-					fail = 1;
-				}
-			}
-
-			break;
-		}
-#endif
-#ifdef STARPU_USE_OPENCL
-	        case STARPU_OPENCL_RAM:
-		{
-			int ret;
-			cl_mem real, imaginary;
-			ret = starpu_opencl_allocate_memory(&real, requested_memory, CL_MEM_READ_WRITE);
-			if (ret != CL_SUCCESS)
-			{
-				fail = 1;
-				break;
-			}
-			else
-			{
-				addr_real = (double *) real;
-			}
-
-			ret = starpu_opencl_allocate_memory(&imaginary, requested_memory, CL_MEM_READ_WRITE);
-			if (ret != CL_SUCCESS)
-			{
-				fail = 1;
-				break;
-			}
-			else
-			{
-				addr_imaginary = (double *) imaginary;
-			}
-			break;
-		}
-#endif
-		default:
-			STARPU_ABORT();
-	}
-
-	if (fail)
-		return -ENOMEM;
+	addr_real = (double*) starpu_allocate_buffer_on_node(node, requested_memory);
+	if (!addr_real)
+		goto fail_real;
+	addr_imaginary = (double*) starpu_allocate_buffer_on_node(node, requested_memory);
+	if (!addr_imaginary)
+		goto fail_imaginary;
 
 
 	/* update the data properly in consequence */
 	/* update the data properly in consequence */
 	complex_interface->real = addr_real;
 	complex_interface->real = addr_real;
 	complex_interface->imaginary = addr_imaginary;
 	complex_interface->imaginary = addr_imaginary;
 
 
 	return 2*requested_memory;
 	return 2*requested_memory;
+
+fail_imaginary:
+	starpu_free_buffer_on_node(node, (uintptr_t) addr_real, requested_memory);
+fail_real:
+	return -ENOMEM;
+}
+
+static void complex_free_data_on_node(void *data_interface, uint32_t node)
+{
+	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) data_interface;
+	ssize_t requested_memory = complex_interface->nx * sizeof(complex_interface->real[0]);
+
+	starpu_free_buffer_on_node(node, (uintptr_t) complex_interface->real, requested_memory);
+	starpu_free_buffer_on_node(node, (uintptr_t) complex_interface->imaginary, requested_memory);
 }
 }
 
 
 static size_t complex_get_size(starpu_data_handle_t handle)
 static size_t complex_get_size(starpu_data_handle_t handle)
@@ -338,6 +285,7 @@ static struct starpu_data_interface_ops interface_complex_ops =
 {
 {
 	.register_data_handle = complex_register_data_handle,
 	.register_data_handle = complex_register_data_handle,
 	.allocate_data_on_node = complex_allocate_data_on_node,
 	.allocate_data_on_node = complex_allocate_data_on_node,
+	.free_data_on_node = complex_free_data_on_node,
 	.copy_methods = &complex_copy_methods,
 	.copy_methods = &complex_copy_methods,
 	.get_size = complex_get_size,
 	.get_size = complex_get_size,
 	.footprint = complex_footprint,
 	.footprint = complex_footprint,

+ 19 - 11
examples/matvecmult/matvecmult.c

@@ -121,9 +121,27 @@ int compareL2fe(const float* reference, const float* data, const unsigned int le
     return error < epsilon ? 0 : 1;
     return error < epsilon ? 0 : 1;
 }
 }
 
 
+static struct starpu_perfmodel starpu_matvecmult_model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "matvecmult"
+};
+
+static struct starpu_codelet cl =
+{
+	.where = STARPU_OPENCL,
+#ifdef STARPU_USE_OPENCL
+        .opencl_funcs[0] = opencl_codelet,
+#endif
+        .nbuffers = 3,
+	.modes[0] = STARPU_R,
+	.modes[1] = STARPU_R,
+	.modes[2] = STARPU_RW,
+	.model = &starpu_matvecmult_model
+};
+
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-	struct starpu_codelet cl = {};
 
 
 	struct starpu_conf conf;
 	struct starpu_conf conf;
 	
 	
@@ -179,16 +197,6 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
 #endif
 #endif
 
 
-	cl.where = STARPU_OPENCL;
-#ifdef STARPU_USE_OPENCL
-        cl.opencl_funcs[0] = opencl_codelet;
-#endif
-        cl.nbuffers = 3;
-	cl.modes[0] = STARPU_R;
-	cl.modes[1] = STARPU_R;
-	cl.modes[2] = STARPU_RW;
-        cl.model = NULL;
-
         struct starpu_task *task = starpu_task_create();
         struct starpu_task *task = starpu_task_create();
         task->cl = &cl;
         task->cl = &cl;
         task->callback_func = NULL;
         task->callback_func = NULL;

+ 7 - 5
examples/mult/xgemm.c

@@ -76,6 +76,7 @@ static void init_problem_data(void)
 {
 {
 	unsigned i,j;
 	unsigned i,j;
 
 
+#ifndef STARPU_SIMGRID
 	starpu_malloc((void **)&A, zdim*ydim*sizeof(TYPE));
 	starpu_malloc((void **)&A, zdim*ydim*sizeof(TYPE));
 	starpu_malloc((void **)&B, xdim*zdim*sizeof(TYPE));
 	starpu_malloc((void **)&B, xdim*zdim*sizeof(TYPE));
 	starpu_malloc((void **)&C, xdim*ydim*sizeof(TYPE));
 	starpu_malloc((void **)&C, xdim*ydim*sizeof(TYPE));
@@ -104,6 +105,7 @@ static void init_problem_data(void)
 			C[j+i*ydim] = (TYPE)(0);
 			C[j+i*ydim] = (TYPE)(0);
 		}
 		}
 	}
 	}
+#endif
 }
 }
 
 
 static void partition_mult_data(void)
 static void partition_mult_data(void)
@@ -281,8 +283,7 @@ static void parse_args(int argc, char **argv)
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-	struct timeval start;
-	struct timeval end;
+	double start, end;
 	int ret;
 	int ret;
 
 
 	parse_args(argc, argv);
 	parse_args(argc, argv);
@@ -301,7 +302,7 @@ int main(int argc, char **argv)
 	init_problem_data();
 	init_problem_data();
 	partition_mult_data();
 	partition_mult_data();
 
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 
 	unsigned x, y, iter;
 	unsigned x, y, iter;
 	for (iter = 0; iter < niter; iter++)
 	for (iter = 0; iter < niter; iter++)
@@ -330,8 +331,9 @@ int main(int argc, char **argv)
 	}
 	}
 
 
 
 
-	gettimeofday(&end, NULL);
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	end = starpu_timing_now();
+
+	double timing = end - start;
 
 
 	FPRINTF(stderr, "Time: %2.2f ms\n", timing/1000.0);
 	FPRINTF(stderr, "Time: %2.2f ms\n", timing/1000.0);
 
 

+ 1 - 1
examples/openmp/vector_scal.c

@@ -51,7 +51,7 @@ void scal_cpu_func(void *buffers[], void *_args)
 static struct starpu_perfmodel vector_scal_model =
 static struct starpu_perfmodel vector_scal_model =
 {
 {
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
-	.symbol = "vector_scale_parallel"
+	.symbol = "vector_scal_parallel"
 };
 };
 
 
 static struct starpu_codelet cl =
 static struct starpu_codelet cl =

+ 1 - 1
examples/spmd/vector_scal_spmd.c

@@ -75,7 +75,7 @@ void scal_cpu_func(void *buffers[], void *_args)
 static struct starpu_perfmodel vector_scal_model =
 static struct starpu_perfmodel vector_scal_model =
 {
 {
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
-	.symbol = "vector_scale_parallel"
+	.symbol = "vector_scal_parallel"
 };
 };
 
 
 static struct starpu_codelet cl =
 static struct starpu_codelet cl =

+ 5 - 3
examples/stencil/stencil.c

@@ -152,7 +152,7 @@ static void init_problem(int argc, char **argv, int rank, int world_size)
  */
  */
 
 
 struct timeval start;
 struct timeval start;
-struct timeval end;
+double begin, end;
 double timing; 
 double timing; 
 
 
 void f(unsigned task_per_worker[STARPU_NMAXWORKERS])
 void f(unsigned task_per_worker[STARPU_NMAXWORKERS])
@@ -242,11 +242,13 @@ int main(int argc, char **argv)
 
 
 	gettimeofday(&start, NULL);
 	gettimeofday(&start, NULL);
 
 
+	begin = starpu_timing_now();
+
 	starpu_tag_notify_from_apps(TAG_INIT_TASK);
 	starpu_tag_notify_from_apps(TAG_INIT_TASK);
 
 
 	wait_end_tasks(rank);
 	wait_end_tasks(rank);
 
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
 
 #ifdef STARPU_USE_MPI
 #ifdef STARPU_USE_MPI
 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
@@ -264,7 +266,7 @@ int main(int argc, char **argv)
 #endif
 #endif
 
 
 	/* timing in us */
 	/* timing in us */
-	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	timing = end - begin;
 
 
 	double min_timing = timing;
 	double min_timing = timing;
 	double max_timing = timing;
 	double max_timing = timing;

+ 6 - 2
include/starpu.h

@@ -66,6 +66,10 @@ extern "C"
 {
 {
 #endif
 #endif
 
 
+#ifdef STARPU_SIMGRID
+#define main starpu_main
+#endif
+
 enum starpu_archtype
 enum starpu_archtype
 {
 {
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
@@ -140,10 +144,10 @@ struct starpu_conf
 	int disable_asynchronous_copy;
 	int disable_asynchronous_copy;
 
 
         /* indicate if asynchronous copies to CUDA devices should be disabled */
         /* indicate if asynchronous copies to CUDA devices should be disabled */
-	int disable_cuda_asynchronous_copy;
+	int disable_asynchronous_cuda_copy;
 
 
         /* indicate if asynchronous copies to OpenCL devices should be disabled */
         /* indicate if asynchronous copies to OpenCL devices should be disabled */
-	int disable_opencl_asynchronous_copy;
+	int disable_asynchronous_opencl_copy;
 
 
 	/* Enable CUDA/OpenGL interoperation on these CUDA devices */
 	/* Enable CUDA/OpenGL interoperation on these CUDA devices */
 	unsigned *cuda_opengl_interoperability;
 	unsigned *cuda_opengl_interoperability;

+ 4 - 0
include/starpu_config.h.in

@@ -26,6 +26,8 @@
 #undef STARPU_USE_OPENCL
 #undef STARPU_USE_OPENCL
 #undef STARPU_USE_GORDON
 #undef STARPU_USE_GORDON
 
 
+#undef STARPU_SIMGRID
+
 #undef STARPU_HAVE_ICC
 #undef STARPU_HAVE_ICC
 
 
 #undef STARPU_USE_MPI
 #undef STARPU_USE_MPI
@@ -93,5 +95,7 @@ typedef ssize_t starpu_ssize_t;
 #undef STARPU_QUICK_CHECK
 #undef STARPU_QUICK_CHECK
 #undef STARPU_USE_DRAND48
 #undef STARPU_USE_DRAND48
 #undef STARPU_USE_ERAND48_R
 #undef STARPU_USE_ERAND48_R
+#undef STARPU_HAVE_NEARBYINTF
+#undef STARPU_HAVE_RINTF
 
 
 #endif
 #endif

+ 1 - 0
include/starpu_data.h

@@ -87,6 +87,7 @@ void starpu_data_release_on_node(starpu_data_handle_t handle, unsigned node);
 
 
 int starpu_malloc(void **A, size_t dim);
 int starpu_malloc(void **A, size_t dim);
 int starpu_free(void *A);
 int starpu_free(void *A);
+void starpu_memory_display_stats();
 
 
 /* XXX These macros are provided to avoid breaking old codes. But consider
 /* XXX These macros are provided to avoid breaking old codes. But consider
  * these function names as deprecated. */
  * these function names as deprecated. */

+ 5 - 0
include/starpu_data_interfaces.h

@@ -144,6 +144,11 @@ int starpu_data_interface_get_next_id(void);
 void starpu_data_register(starpu_data_handle_t *handleptr, uint32_t home_node, void *data_interface, struct starpu_data_interface_ops *ops);
 void starpu_data_register(starpu_data_handle_t *handleptr, uint32_t home_node, void *data_interface, struct starpu_data_interface_ops *ops);
 void starpu_data_register_same(starpu_data_handle_t *handledst, starpu_data_handle_t handlesrc);
 void starpu_data_register_same(starpu_data_handle_t *handledst, starpu_data_handle_t handlesrc);
 
 
+/* Allocate SIZE bytes on node NODE */
+uintptr_t starpu_allocate_buffer_on_node(uint32_t dst_node, size_t size);
+/* Free ADDR on node NODE */
+void starpu_free_buffer_on_node(uint32_t dst_node, uintptr_t addr, size_t size);
+
 /* Return the pointer associated with HANDLE on node NODE or NULL if HANDLE's
 /* Return the pointer associated with HANDLE on node NODE or NULL if HANDLE's
  * interface does not support this operation or data for this handle is not
  * interface does not support this operation or data for this handle is not
  * allocated on that node. */
  * allocated on that node. */

+ 1 - 1
include/starpu_perfmodel.h

@@ -188,7 +188,7 @@ struct starpu_perfmodel
 	unsigned is_loaded;
 	unsigned is_loaded;
 	unsigned benchmarking;
 	unsigned benchmarking;
 
 
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) || defined(STARPU_SIMGRID)
 	void *model_rwlock;
 	void *model_rwlock;
 #else
 #else
 	pthread_rwlock_t model_rwlock;
 	pthread_rwlock_t model_rwlock;

+ 5 - 5
include/starpu_scheduler.h

@@ -272,15 +272,15 @@ int starpu_prefetch_task_input_on_node(struct starpu_task *task, uint32_t node);
  *	Performance predictions
  *	Performance predictions
  */
  */
 
 
-/* Return the current date */
+/* Return the current date in µs */
 double starpu_timing_now(void);
 double starpu_timing_now(void);
-/* Returns expected task duration in µs */
+/* Returns expected task duration in µs */
 double starpu_task_expected_length(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
 double starpu_task_expected_length(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
 /* Returns an estimated speedup factor relative to CPU speed */
 /* Returns an estimated speedup factor relative to CPU speed */
 double starpu_worker_get_relative_speedup(enum starpu_perf_archtype perf_archtype);
 double starpu_worker_get_relative_speedup(enum starpu_perf_archtype perf_archtype);
-/* Returns expected data transfer time in µs */
+/* Returns expected data transfer time in µs */
 double starpu_task_expected_data_transfer_time(uint32_t memory_node, struct starpu_task *task);
 double starpu_task_expected_data_transfer_time(uint32_t memory_node, struct starpu_task *task);
-/* Predict the transfer time (in µs) to move a handle to a memory node */
+/* Predict the transfer time (in µs) to move a handle to a memory node */
 double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned memory_node, enum starpu_access_mode mode);
 double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned memory_node, enum starpu_access_mode mode);
 /* Returns expected power consumption in J */
 /* Returns expected power consumption in J */
 double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
 double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
@@ -288,7 +288,7 @@ double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_arc
 double starpu_task_expected_conversion_time(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
 double starpu_task_expected_conversion_time(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
 /* Return the expected duration of the entire task bundle in µs. */
 /* Return the expected duration of the entire task bundle in µs. */
 double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum starpu_perf_archtype arch, unsigned nimpl);
 double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum starpu_perf_archtype arch, unsigned nimpl);
-/* Return the time (in µs) expected to transfer all data used within the bundle */
+/* Return the time (in µs) expected to transfer all data used within the bundle */
 double starpu_task_bundle_expected_data_transfer_time(starpu_task_bundle_t bundle, unsigned memory_node);
 double starpu_task_bundle_expected_data_transfer_time(starpu_task_bundle_t bundle, unsigned memory_node);
 /* Return the expected power consumption of the entire task bundle in J. */
 /* Return the expected power consumption of the entire task bundle in J. */
 double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, enum starpu_perf_archtype arch, unsigned nimpl);
 double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, enum starpu_perf_archtype arch, unsigned nimpl);

+ 3 - 3
include/starpu_util.h

@@ -70,14 +70,14 @@ extern "C"
 
 
 #ifdef STARPU_NO_ASSERT
 #ifdef STARPU_NO_ASSERT
 #define STARPU_ASSERT(x)		do { (void) (x);} while(0)
 #define STARPU_ASSERT(x)		do { (void) (x);} while(0)
-#define STARPU_ASSERT_MSG(x, msg)	do { (void) (x);} while(0)
+#define STARPU_ASSERT_MSG(x, msg, ...)	do { (void) (x);} while(0)
 #else
 #else
 #  if defined(__CUDACC__) && defined(STARPU_HAVE_WINDOWS)
 #  if defined(__CUDACC__) && defined(STARPU_HAVE_WINDOWS)
 #    define STARPU_ASSERT(x)		do { if (STARPU_UNLIKELY(!(x))) *(int*)NULL = 0; } while(0)
 #    define STARPU_ASSERT(x)		do { if (STARPU_UNLIKELY(!(x))) *(int*)NULL = 0; } while(0)
-#    define STARPU_ASSERT_MSG(x, msg)	do { if (STARPU_UNLIKELY(!(x))) { fprintf(stderr, "[starpu][%s][assert failure] %s\n", __func__, msg); *(int*)NULL = 0; }} while(0)
+#    define STARPU_ASSERT_MSG(x, msg, ...)	do { if (STARPU_UNLIKELY(!(x))) { fprintf(stderr, "[starpu][%s][assert failure] " msg "\n", __func__, ## __VA_ARGS__); *(int*)NULL = 0; }} while(0)
 #  else
 #  else
 #    define STARPU_ASSERT(x)		assert(x)
 #    define STARPU_ASSERT(x)		assert(x)
-#    define STARPU_ASSERT_MSG(x, msg)	do { if (STARPU_UNLIKELY(!(x))) { fprintf(stderr, "[starpu][%s][assert failure] %s\n", __func__, msg); } ; assert(x); } while(0)
+#    define STARPU_ASSERT_MSG(x, msg, ...)	do { if (STARPU_UNLIKELY(!(x))) { fprintf(stderr, "[starpu][%s][assert failure] " msg "\n", __func__, ## __VA_ARGS__); } ; assert(x); } while(0)
 
 
 #  endif
 #  endif
 #endif
 #endif

+ 21 - 9
mpi/examples/Makefile.am

@@ -17,16 +17,28 @@
 CC=$(MPICC)
 CC=$(MPICC)
 CCLD=$(MPICC)
 CCLD=$(MPICC)
 
 
-if STARPU_MPI_CHECK
+if STARPU_HAVE_WINDOWS
+LOADER_BIN		=
+else
+loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+LOADER			=	loader
+LOADER_BIN		=	$(abs_top_builddir)/mpi/tests/$(LOADER)
+loader_SOURCES		=	../../tests/loader.c
+endif
+
 if STARPU_HAVE_AM111
 if STARPU_HAVE_AM111
-LOG_COMPILER	 	=	$(MPIEXEC) -np 2
+TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
+LOG_COMPILER	 	=	$(MPIEXEC) -np 2 $(LOADER_BIN)
 else
 else
-TESTS_ENVIRONMENT 	=	$(MPIEXEC) -np 2
+TESTS_ENVIRONMENT 	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPIEXEC) -np 4
 endif
 endif
-TESTS			=	$(check_PROGRAMS)
+
+if STARPU_MPI_CHECK
+TESTS			=	$(starpu_mpi_EXAMPLES)
 endif
 endif
 
 
-check_PROGRAMS =
+check_PROGRAMS = $(LOADER) $(starpu_mpi_EXAMPLES)
+starpu_mpi_EXAMPLES =
 
 
 BUILT_SOURCES =
 BUILT_SOURCES =
 
 
@@ -76,7 +88,7 @@ examplebin_PROGRAMS +=				\
 stencil_stencil5_LDADD =		\
 stencil_stencil5_LDADD =		\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 
 
-check_PROGRAMS	+=	\
+starpu_mpi_EXAMPLES	+=	\
 	stencil/stencil5
 	stencil/stencil5
 
 
 ##################
 ##################
@@ -145,7 +157,7 @@ cholesky_mpi_cholesky_distributed_LDADD =	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	$(STARPU_BLAS_LDFLAGS)
 	$(STARPU_BLAS_LDFLAGS)
 
 
-check_PROGRAMS +=					\
+starpu_mpi_EXAMPLES +=				\
 	cholesky/mpi_cholesky			\
 	cholesky/mpi_cholesky			\
 	cholesky/mpi_cholesky_distributed
 	cholesky/mpi_cholesky_distributed
 endif
 endif
@@ -154,7 +166,7 @@ endif
 # complex example #
 # complex example #
 ###################
 ###################
 
 
-examplebin_PROGRAMS +=				\
+examplebin_PROGRAMS +=			\
 	complex/mpi_complex
 	complex/mpi_complex
 
 
 complex_mpi_complex_SOURCES =		\
 complex_mpi_complex_SOURCES =		\
@@ -164,7 +176,7 @@ complex_mpi_complex_SOURCES =		\
 complex_mpi_complex_LDADD =		\
 complex_mpi_complex_LDADD =		\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 
 
-check_PROGRAMS	+=	\
+starpu_mpi_EXAMPLES	+=			\
 	complex/mpi_complex
 	complex/mpi_complex
 endif
 endif
 
 

+ 14 - 14
mpi/examples/cholesky/mpi_cholesky.c

@@ -43,7 +43,7 @@ int main(int argc, char **argv)
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
-	starpu_mpi_init(&argc, &argv);
+	starpu_mpi_init(&argc, &argv, 1);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
 	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
 
 
@@ -51,18 +51,18 @@ int main(int argc, char **argv)
 
 
 	if (dblockx == -1 || dblocky == -1)
 	if (dblockx == -1 || dblocky == -1)
 	{
 	{
-	     int factor;
-	     dblockx = nodes;
-	     dblocky = 1;
-	     for(factor=sqrt(nodes) ; factor>1 ; factor--)
-	     {
-		  if (nodes % factor == 0)
-		  {
-		       dblockx = nodes/factor;
-		       dblocky = factor;
-		       break;
-		  }
-	     }
+		int factor;
+		dblockx = nodes;
+		dblocky = 1;
+		for(factor=sqrt(nodes) ; factor>1 ; factor--)
+		{
+			if (nodes % factor == 0)
+			{
+				dblockx = nodes/factor;
+				dblocky = factor;
+				break;
+			}
+		}
 	}
 	}
 
 
 	unsigned i,j,x,y;
 	unsigned i,j,x,y;
@@ -198,7 +198,7 @@ int main(int argc, char **argv)
 	}
 	}
 
 
 	int correctness = 1;
 	int correctness = 1;
-	for(x = 0; x < nblocks ;  x++)
+	for(x = 0; x < nblocks ; x++)
 	{
 	{
 		for (y = 0; y < nblocks; y++)
 		for (y = 0; y < nblocks; y++)
 		{
 		{

+ 7 - 7
mpi/examples/cholesky/mpi_cholesky.h

@@ -57,31 +57,31 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 	{
 	{
 		if (strcmp(argv[i], "-size") == 0)
 		if (strcmp(argv[i], "-size") == 0)
 		{
 		{
-		        char *argptr;
+			char *argptr;
 			size = strtol(argv[++i], &argptr, 10);
 			size = strtol(argv[++i], &argptr, 10);
 		}
 		}
 
 
 		if (strcmp(argv[i], "-dblockx") == 0)
 		if (strcmp(argv[i], "-dblockx") == 0)
 		{
 		{
-		        char *argptr;
+			char *argptr;
 			dblockx = strtol(argv[++i], &argptr, 10);
 			dblockx = strtol(argv[++i], &argptr, 10);
 		}
 		}
-		
+
 		if (strcmp(argv[i], "-dblocky") == 0)
 		if (strcmp(argv[i], "-dblocky") == 0)
 		{
 		{
-		        char *argptr;
+			char *argptr;
 			dblocky = strtol(argv[++i], &argptr, 10);
 			dblocky = strtol(argv[++i], &argptr, 10);
 		}
 		}
-	
+
 		if (strcmp(argv[i], "-nblocks") == 0)
 		if (strcmp(argv[i], "-nblocks") == 0)
 		{
 		{
-		        char *argptr;
+			char *argptr;
 			nblocks = strtol(argv[++i], &argptr, 10);
 			nblocks = strtol(argv[++i], &argptr, 10);
 		}
 		}
 
 
 		if (strcmp(argv[i], "-nbigblocks") == 0)
 		if (strcmp(argv[i], "-nbigblocks") == 0)
 		{
 		{
-		        char *argptr;
+			char *argptr;
 			nbigblocks = strtol(argv[++i], &argptr, 10);
 			nbigblocks = strtol(argv[++i], &argptr, 10);
 		}
 		}
 
 

+ 2 - 2
mpi/examples/cholesky/mpi_cholesky_codelets.c

@@ -79,7 +79,7 @@ void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblocks, in
 	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
 	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
 	for(x=0 ; x<nblocks ; x++) data_handles[x] = malloc(nblocks*sizeof(starpu_data_handle_t));
 	for(x=0 ; x<nblocks ; x++) data_handles[x] = malloc(nblocks*sizeof(starpu_data_handle_t));
 
 
-	for(x = 0; x < nblocks ;  x++)
+	for(x = 0; x < nblocks ; x++)
 	{
 	{
 		for (y = 0; y < nblocks; y++)
 		for (y = 0; y < nblocks; y++)
 		{
 		{
@@ -148,7 +148,7 @@ void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblocks, in
 
 
 	starpu_task_wait_for_all();
 	starpu_task_wait_for_all();
 
 
-	for(x = 0; x < nblocks ;  x++)
+	for(x = 0; x < nblocks ; x++)
 	{
 	{
 		for (y = 0; y < nblocks; y++)
 		for (y = 0; y < nblocks; y++)
 		{
 		{

+ 13 - 13
mpi/examples/cholesky/mpi_cholesky_distributed.c

@@ -42,7 +42,7 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(&argc, &argv);
+	ret = starpu_mpi_init(&argc, &argv, 1);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
 	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
@@ -50,18 +50,18 @@ int main(int argc, char **argv)
 
 
 	if (dblockx == -1 || dblocky == -1)
 	if (dblockx == -1 || dblocky == -1)
 	{
 	{
-	     int factor;
-	     dblockx = nodes;
-	     dblocky = 1;
-	     for(factor=sqrt(nodes) ; factor>1 ; factor--)
-	     {
-		  if (nodes % factor == 0)
-		  {
-		       dblockx = nodes/factor;
-		       dblocky = factor;
-		       break;
-		  }
-	     }
+		int factor;
+		dblockx = nodes;
+		dblocky = 1;
+		for(factor=sqrt(nodes) ; factor>1 ; factor--)
+		{
+			if (nodes % factor == 0)
+			{
+				dblockx = nodes/factor;
+				dblocky = factor;
+				break;
+			}
+		}
 	}
 	}
 
 
 	unsigned i,j,x,y;
 	unsigned i,j,x,y;

+ 1 - 1
mpi/examples/cholesky/mpi_cholesky_kernels.c

@@ -29,7 +29,7 @@
 #endif
 #endif
 
 
 /*
 /*
- *   U22
+ * U22
  */
  */
 
 
 static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __attribute__((unused)) void *_args)
 static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __attribute__((unused)) void *_args)

+ 44 - 10
mpi/examples/complex/mpi_complex.c

@@ -18,14 +18,28 @@
 #include <interface/complex_interface.h>
 #include <interface/complex_interface.h>
 #include <interface/complex_codelet.h>
 #include <interface/complex_codelet.h>
 
 
+void display_foo_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	int *foo = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	fprintf(stderr, "foo = %d\n", *foo);
+}
+
+struct starpu_codelet foo_display =
+{
+	.cpu_funcs = {display_foo_codelet, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_R}
+};
+
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
 	int rank, nodes;
 	int rank, nodes;
 	int ret;
 	int ret;
+	int compare;
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(&argc, &argv);
+	ret = starpu_mpi_init(&argc, &argv, 1);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
 	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
@@ -46,33 +60,53 @@ int main(int argc, char **argv)
 			double real2[2] = {14.0, 12.0};
 			double real2[2] = {14.0, 12.0};
 			double imaginary2[2] = {17.0, 19.0};
 			double imaginary2[2] = {17.0, 19.0};
 			starpu_data_handle_t handle2;
 			starpu_data_handle_t handle2;
-			MPI_Status status;
+
+			int *compare_ptr = &compare;
 
 
 			starpu_complex_data_register(&handle, 0, real, imaginary, 2);
 			starpu_complex_data_register(&handle, 0, real, imaginary, 2);
+			starpu_complex_data_register(&handle2, -1, real2, imaginary2, 2);
+
 			starpu_insert_task(&cl_display, STARPU_R, handle, 0);
 			starpu_insert_task(&cl_display, STARPU_R, handle, 0);
-			starpu_mpi_send(handle, 1, 10, MPI_COMM_WORLD);
+			starpu_mpi_isend_detached(handle, 1, 10, MPI_COMM_WORLD, NULL, NULL);
+			starpu_mpi_irecv_detached(handle2, 1, 20, MPI_COMM_WORLD, NULL, NULL);
 
 
-			starpu_complex_data_register(&handle2, -1, real2, imaginary2, 2);
-			starpu_mpi_recv(handle2, 1, 11, MPI_COMM_WORLD, &status);
 			starpu_insert_task(&cl_display, STARPU_R, handle2, 0);
 			starpu_insert_task(&cl_display, STARPU_R, handle2, 0);
-			starpu_insert_task(&cl_compare, STARPU_R, handle, STARPU_R, handle2, 0);
+			starpu_insert_task(&cl_compare, STARPU_R, handle, STARPU_R, handle2, STARPU_VALUE, &compare_ptr, sizeof(compare_ptr), 0);
+
+			{
+				// We send a dummy variable only to check communication with predefined datatypes
+				int foo=12;
+				starpu_data_handle_t foo_handle;
+				starpu_variable_data_register(&foo_handle, 0, (uintptr_t)&foo, sizeof(foo));
+				starpu_mpi_isend_detached(foo_handle, 1, 40, MPI_COMM_WORLD, NULL, NULL);
+				starpu_insert_task(&foo_display, STARPU_R, foo_handle, 0);
+			}
 		}
 		}
 		else if (rank == 1)
 		else if (rank == 1)
 		{
 		{
 			double real[2] = {0.0, 0.0};
 			double real[2] = {0.0, 0.0};
 			double imaginary[2] = {0.0, 0.0};
 			double imaginary[2] = {0.0, 0.0};
 			starpu_data_handle_t handle;
 			starpu_data_handle_t handle;
-			MPI_Status status;
 
 
 			starpu_complex_data_register(&handle, 0, real, imaginary, 2);
 			starpu_complex_data_register(&handle, 0, real, imaginary, 2);
-			starpu_mpi_recv(handle, 0, 10, MPI_COMM_WORLD, &status);
+			starpu_mpi_irecv_detached(handle, 0, 10, MPI_COMM_WORLD, NULL, NULL);
 			starpu_insert_task(&cl_display, STARPU_R, handle, 0);
 			starpu_insert_task(&cl_display, STARPU_R, handle, 0);
-			starpu_mpi_send(handle, 0, 11, MPI_COMM_WORLD);
+			starpu_mpi_isend_detached(handle, 0, 20, MPI_COMM_WORLD, NULL, NULL);
+
+			{
+				// We send a dummy variable only to check communication with predefined datatypes
+				int foo=12;
+				starpu_data_handle_t foo_handle;
+				starpu_variable_data_register(&foo_handle, -1, (uintptr_t)NULL, sizeof(foo));
+				starpu_mpi_irecv_detached(foo_handle, 0, 40, MPI_COMM_WORLD, NULL, NULL);
+				starpu_insert_task(&foo_display, STARPU_R, foo_handle, 0);
+			}
+
 		}
 		}
 	}
 	}
 	starpu_task_wait_for_all();
 	starpu_task_wait_for_all();
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
 
 
-	return ret;
+	if (rank == 0) return !compare; else return ret;
 }
 }

+ 13 - 13
mpi/examples/mpi_lu/plu_example.c

@@ -301,7 +301,7 @@ static void init_matrix(int rank)
 		allocated_memory_extra += 2*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
 		allocated_memory_extra += 2*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
 	}
 	}
 #endif
 #endif
-	
+
 	for (k = 0; k < nblocks; k++)
 	for (k = 0; k < nblocks; k++)
 	{
 	{
 #ifdef SINGLE_TMP1221
 #ifdef SINGLE_TMP1221
@@ -333,7 +333,7 @@ static void init_matrix(int rank)
 			starpu_malloc((void **)&tmp_12_block[i][k], blocksize);
 			starpu_malloc((void **)&tmp_12_block[i][k], blocksize);
 			allocated_memory_extra += blocksize;
 			allocated_memory_extra += blocksize;
 			STARPU_ASSERT(tmp_12_block[i][k]);
 			STARPU_ASSERT(tmp_12_block[i][k]);
-	
+
 			starpu_matrix_data_register(&tmp_12_block_handles[i][k], 0,
 			starpu_matrix_data_register(&tmp_12_block_handles[i][k], 0,
 				(uintptr_t)tmp_12_block[i][k],
 				(uintptr_t)tmp_12_block[i][k],
 				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
 				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
@@ -344,7 +344,7 @@ static void init_matrix(int rank)
 			starpu_malloc((void **)&tmp_21_block[i][k], blocksize);
 			starpu_malloc((void **)&tmp_21_block[i][k], blocksize);
 			allocated_memory_extra += blocksize;
 			allocated_memory_extra += blocksize;
 			STARPU_ASSERT(tmp_21_block[i][k]);
 			STARPU_ASSERT(tmp_21_block[i][k]);
-	
+
 			starpu_matrix_data_register(&tmp_21_block_handles[i][k], 0,
 			starpu_matrix_data_register(&tmp_21_block_handles[i][k], 0,
 				(uintptr_t)tmp_21_block[i][k],
 				(uintptr_t)tmp_21_block[i][k],
 				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
 				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
@@ -381,7 +381,7 @@ static void display_grid(int rank, unsigned nblocks)
 	//if (rank == 0)
 	//if (rank == 0)
 	{
 	{
 		fprintf(stderr, "2D grid layout (Rank %d): \n", rank);
 		fprintf(stderr, "2D grid layout (Rank %d): \n", rank);
-		
+
 		unsigned i, j;
 		unsigned i, j;
 		for (j = 0; j < nblocks; j++)
 		for (j = 0; j < nblocks; j++)
 		{
 		{
@@ -428,7 +428,7 @@ int main(int argc, char **argv)
 	/* We disable sequential consistency in this example */
 	/* We disable sequential consistency in this example */
 	starpu_data_set_default_sequential_consistency_flag(0);
 	starpu_data_set_default_sequential_consistency_flag(0);
 
 
-	starpu_mpi_init(NULL, NULL);
+	starpu_mpi_init(NULL, NULL, 0);
 
 
 	STARPU_ASSERT(p*q == world_size);
 	STARPU_ASSERT(p*q == world_size);
 
 
@@ -534,7 +534,7 @@ int main(int argc, char **argv)
 
 
 		y2 = calloc(size, sizeof(TYPE));
 		y2 = calloc(size, sizeof(TYPE));
 		STARPU_ASSERT(y);
 		STARPU_ASSERT(y);
-		
+
 		if (rank == 0)
 		if (rank == 0)
 		{
 		{
 			for (ind = 0; ind < size; ind++)
 			for (ind = 0; ind < size; ind++)
@@ -546,13 +546,13 @@ int main(int argc, char **argv)
 		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);
 		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);
 
 
 		/* Compute y2 = y2 - y */
 		/* Compute y2 = y2 - y */
-	        CPU_AXPY(size, -1.0, y, 1, y2, 1);
-	
-	        TYPE err = CPU_ASUM(size, y2, 1);
-	        int max = CPU_IAMAX(size, y2, 1);
-	
-	        fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size));
-	        fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]);
+		CPU_AXPY(size, -1.0, y, 1, y2, 1);
+
+		TYPE err = CPU_ASUM(size, y2, 1);
+		int max = CPU_IAMAX(size, y2, 1);
+
+		fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size));
+		fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]);
 #endif
 #endif
 	}
 	}
 
 

+ 54 - 55
mpi/examples/mpi_lu/plu_solve.c

@@ -25,19 +25,19 @@
 
 
 static double frobenius_norm(TYPE *v, unsigned n)
 static double frobenius_norm(TYPE *v, unsigned n)
 {
 {
-        double sum2 = 0.0;
+	double sum2 = 0.0;
 
 
-        /* compute sqrt(Sum(|x|^2)) */
+	/* compute sqrt(Sum(|x|^2)) */
 
 
-        unsigned i,j;
-        for (j = 0; j < n; j++)
-        for (i = 0; i < n; i++)
-        {
-                double a = fabsl((double)v[i+n*j]);
-                sum2 += a*a;
-        }
+	unsigned i,j;
+	for (j = 0; j < n; j++)
+		for (i = 0; i < n; i++)
+		{
+			double a = fabsl((double)v[i+n*j]);
+			sum2 += a*a;
+		}
 
 
-        return sqrt(sum2);
+	return sqrt(sum2);
 }
 }
 
 
 void STARPU_PLU(display_data_content)(TYPE *data, unsigned blocksize)
 void STARPU_PLU(display_data_content)(TYPE *data, unsigned blocksize)
@@ -105,9 +105,9 @@ static void STARPU_PLU(compute_ax_block_upper)(unsigned size, unsigned nblocks,
 	/* Take a copy of the upper part of the diagonal block */
 	/* Take a copy of the upper part of the diagonal block */
 	TYPE *upper_block_copy = calloc((block_size)*(block_size), sizeof(TYPE));
 	TYPE *upper_block_copy = calloc((block_size)*(block_size), sizeof(TYPE));
 	STARPU_PLU(extract_upper)(block_size, block_data, upper_block_copy);
 	STARPU_PLU(extract_upper)(block_size, block_data, upper_block_copy);
-		
+
 	STARPU_PLU(compute_ax_block)(block_size, upper_block_copy, sub_x, sub_y);
 	STARPU_PLU(compute_ax_block)(block_size, upper_block_copy, sub_x, sub_y);
-	
+
 	free(upper_block_copy);
 	free(upper_block_copy);
 }
 }
 
 
@@ -121,7 +121,7 @@ static void STARPU_PLU(compute_ax_block_lower)(unsigned size, unsigned nblocks,
 	STARPU_PLU(extract_lower)(block_size, block_data, lower_block_copy);
 	STARPU_PLU(extract_lower)(block_size, block_data, lower_block_copy);
 
 
 	STARPU_PLU(compute_ax_block)(size/nblocks, lower_block_copy, sub_x, sub_y);
 	STARPU_PLU(compute_ax_block)(size/nblocks, lower_block_copy, sub_x, sub_y);
-	
+
 	free(lower_block_copy);
 	free(lower_block_copy);
 }
 }
 
 
@@ -242,7 +242,7 @@ TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks)
 		TYPE *block;
 		TYPE *block;
 
 
 		int block_rank = get_block_rank(bi, bj);
 		int block_rank = get_block_rank(bi, bj);
-		
+
 		if (block_rank == 0)
 		if (block_rank == 0)
 		{
 		{
 			block = STARPU_PLU(get_block)(bi, bj);
 			block = STARPU_PLU(get_block)(bi, bj);
@@ -335,60 +335,59 @@ void STARPU_PLU(compute_lu_matrix)(unsigned size, unsigned nblocks, TYPE *Asaved
 
 
 	if (rank == 0)
 	if (rank == 0)
 	{
 	{
-	        TYPE *L = malloc((size_t)size*size*sizeof(TYPE));
-	        TYPE *U = malloc((size_t)size*size*sizeof(TYPE));
-	
-	        memset(L, 0, size*size*sizeof(TYPE));
-	        memset(U, 0, size*size*sizeof(TYPE));
-	
-	        /* only keep the lower part */
+		TYPE *L = malloc((size_t)size*size*sizeof(TYPE));
+		TYPE *U = malloc((size_t)size*size*sizeof(TYPE));
+
+		memset(L, 0, size*size*sizeof(TYPE));
+		memset(U, 0, size*size*sizeof(TYPE));
+
+		/* only keep the lower part */
 		unsigned i, j;
 		unsigned i, j;
-	        for (j = 0; j < size; j++)
-	        {
-	                for (i = 0; i < j; i++)
-	                {
-	                        L[j+i*size] = all_r[j+i*size];
-	                }
-	
-	                /* diag i = j */
-	                L[j+j*size] = all_r[j+j*size];
-	                U[j+j*size] = 1.0;
-	
-	                for (i = j+1; i < size; i++)
-	                {
-	                        U[j+i*size] = all_r[j+i*size];
-	                }
-	        }
-	
+		for (j = 0; j < size; j++)
+		{
+			for (i = 0; i < j; i++)
+			{
+				L[j+i*size] = all_r[j+i*size];
+			}
+
+			/* diag i = j */
+			L[j+j*size] = all_r[j+j*size];
+			U[j+j*size] = 1.0;
+
+			for (i = j+1; i < size; i++)
+			{
+				U[j+i*size] = all_r[j+i*size];
+			}
+		}
+
 		STARPU_PLU(display_data_content)(L, size);
 		STARPU_PLU(display_data_content)(L, size);
 		STARPU_PLU(display_data_content)(U, size);
 		STARPU_PLU(display_data_content)(U, size);
-	
-	        /* now A_err = L, compute L*U */
-	        CPU_TRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
-	
+
+		/* now A_err = L, compute L*U */
+		CPU_TRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
+
 		if (display)
 		if (display)
 			fprintf(stderr, "\nLU\n");
 			fprintf(stderr, "\nLU\n");
 
 
 		STARPU_PLU(display_data_content)(L, size);
 		STARPU_PLU(display_data_content)(L, size);
-	
-	        /* compute "LU - A" in L*/
-	        CPU_AXPY(size*size, -1.0, Asaved, 1, L, 1);
-	
-	        TYPE err = CPU_ASUM(size*size, L, 1);
-	        int max = CPU_IAMAX(size*size, L, 1);
-	
+
+		/* compute "LU - A" in L*/
+		CPU_AXPY(size*size, -1.0, Asaved, 1, L, 1);
+
+		TYPE err = CPU_ASUM(size*size, L, 1);
+		int max = CPU_IAMAX(size*size, L, 1);
+
 		if (display)
 		if (display)
 			fprintf(stderr, "DISPLAY ERROR\n");
 			fprintf(stderr, "DISPLAY ERROR\n");
 
 
 		STARPU_PLU(display_data_content)(L, size);
 		STARPU_PLU(display_data_content)(L, size);
-	
-	        fprintf(stderr, "(A - LU) Avg error : %e\n", err/(size*size));
-	        fprintf(stderr, "(A - LU) Max error : %e\n", L[max]);
-	
+
+		fprintf(stderr, "(A - LU) Avg error : %e\n", err/(size*size));
+		fprintf(stderr, "(A - LU) Max error : %e\n", L[max]);
+
 		double residual = frobenius_norm(L, size);
 		double residual = frobenius_norm(L, size);
 		double matnorm = frobenius_norm(Asaved, size);
 		double matnorm = frobenius_norm(Asaved, size);
-	
+
 		fprintf(stderr, "||A-LU|| / (||A||*N) : %e\n", residual/(matnorm*size));
 		fprintf(stderr, "||A-LU|| / (||A||*N) : %e\n", residual/(matnorm*size));
 	}
 	}
 }
 }
-

+ 1 - 1
mpi/examples/mpi_lu/pxlu.c

@@ -736,7 +736,7 @@ static void create_task_22_real(unsigned k, unsigned i, unsigned j)
 	STARPU_ASSERT(task->handles[1] != STARPU_POISON_PTR);
 	STARPU_ASSERT(task->handles[1] != STARPU_POISON_PTR);
 	STARPU_ASSERT(task->handles[2] != STARPU_POISON_PTR);
 	STARPU_ASSERT(task->handles[2] != STARPU_POISON_PTR);
 
 
-	if (!no_prio &&  (i == k + 1) && (j == k +1) ) {
+	if (!no_prio && (i == k + 1) && (j == k +1) ) {
 		task->priority = STARPU_MAX_PRIO;
 		task->priority = STARPU_MAX_PRIO;
 	}
 	}
 
 

+ 12 - 14
mpi/examples/mpi_lu/pxlu_kernels.c

@@ -22,7 +22,7 @@
 ///#define VERBOSE_KERNELS	1
 ///#define VERBOSE_KERNELS	1
 
 
 /*
 /*
- *   U22 
+ * U22
  */
  */
 
 
 static inline void STARPU_PLU(common_u22)(void *descr[],
 static inline void STARPU_PLU(common_u22)(void *descr[],
@@ -55,7 +55,7 @@ static inline void STARPU_PLU(common_u22)(void *descr[],
 
 
 	switch (s) {
 	switch (s) {
 		case 0:
 		case 0:
-			CPU_GEMM("N", "N", dy, dx, dz, 
+			CPU_GEMM("N", "N", dy, dx, dz,
 				(TYPE)-1.0, right, ld21, left, ld12,
 				(TYPE)-1.0, right, ld21, left, ld12,
 				(TYPE)1.0, center, ld22);
 				(TYPE)1.0, center, ld22);
 			break;
 			break;
@@ -129,7 +129,7 @@ static inline void STARPU_PLU(common_u12)(void *descr[],
 	TYPE *sub11;
 	TYPE *sub11;
 	TYPE *sub12;
 	TYPE *sub12;
 
 
-	sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);	
+	sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
 	sub12 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
 	sub12 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
 
 
 	unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
 	unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
@@ -227,7 +227,7 @@ struct starpu_codelet STARPU_PLU(cl12) = {
 };
 };
 
 
 
 
-/* 
+/*
  * U21
  * U21
  */
  */
 
 
@@ -245,7 +245,7 @@ static inline void STARPU_PLU(common_u21)(void *descr[],
 
 
 	unsigned nx21 = STARPU_MATRIX_GET_NX(descr[1]);
 	unsigned nx21 = STARPU_MATRIX_GET_NX(descr[1]);
 	unsigned ny21 = STARPU_MATRIX_GET_NY(descr[1]);
 	unsigned ny21 = STARPU_MATRIX_GET_NY(descr[1]);
-	
+
 #ifdef VERBOSE_KERNELS
 #ifdef VERBOSE_KERNELS
 	struct debug_info *info = _args;
 	struct debug_info *info = _args;
 
 
@@ -311,7 +311,7 @@ static void STARPU_PLU(cublas_u21)(void *descr[], void *_args)
 {
 {
 	STARPU_PLU(common_u21)(descr, 1, _args);
 	STARPU_PLU(common_u21)(descr, 1, _args);
 }
 }
-#endif 
+#endif
 
 
 static struct starpu_perfmodel STARPU_PLU(model_21) = {
 static struct starpu_perfmodel STARPU_PLU(model_21) = {
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
@@ -345,7 +345,7 @@ static inline void STARPU_PLU(common_u11)(void *descr[],
 {
 {
 	TYPE *sub11;
 	TYPE *sub11;
 
 
-	sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]); 
+	sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
 
 
 	unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]);
 	unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]);
 	unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]);
 	unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]);
@@ -367,9 +367,9 @@ static inline void STARPU_PLU(common_u11)(void *descr[],
 				TYPE pivot;
 				TYPE pivot;
 				pivot = sub11[z+z*ld];
 				pivot = sub11[z+z*ld];
 				STARPU_ASSERT(pivot != 0.0);
 				STARPU_ASSERT(pivot != 0.0);
-		
+
 				CPU_SCAL(nx - z - 1, (1.0/pivot), &sub11[z+(z+1)*ld], ld);
 				CPU_SCAL(nx - z - 1, (1.0/pivot), &sub11[z+(z+1)*ld], ld);
-		
+
 				CPU_GER(nx - z - 1, nx - z - 1, -1.0,
 				CPU_GER(nx - z - 1, nx - z - 1, -1.0,
 						&sub11[(z+1)+z*ld], 1,
 						&sub11[(z+1)+z*ld], 1,
 						&sub11[z+(z+1)*ld], ld,
 						&sub11[z+(z+1)*ld], ld,
@@ -385,15 +385,15 @@ static inline void STARPU_PLU(common_u11)(void *descr[],
 				cudaStreamSynchronize(starpu_cuda_get_local_stream());
 				cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 
 				STARPU_ASSERT(pivot != 0.0);
 				STARPU_ASSERT(pivot != 0.0);
-				
+
 				CUBLAS_SCAL(nx - z - 1, 1.0/pivot, &sub11[z+(z+1)*ld], ld);
 				CUBLAS_SCAL(nx - z - 1, 1.0/pivot, &sub11[z+(z+1)*ld], ld);
-				
+
 				CUBLAS_GER(nx - z - 1, nx - z - 1, -1.0,
 				CUBLAS_GER(nx - z - 1, nx - z - 1, -1.0,
 						&sub11[(z+1)+z*ld], 1,
 						&sub11[(z+1)+z*ld], 1,
 						&sub11[z+(z+1)*ld], ld,
 						&sub11[z+(z+1)*ld], ld,
 						&sub11[(z+1) + (z+1)*ld],ld);
 						&sub11[(z+1) + (z+1)*ld],ld);
 			}
 			}
-			
+
 			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 
 			break;
 			break;
@@ -440,5 +440,3 @@ struct starpu_codelet STARPU_PLU(cl11) = {
 	.modes = {STARPU_RW},
 	.modes = {STARPU_RW},
 	.model = &STARPU_PLU(model_11)
 	.model = &STARPU_PLU(model_11)
 };
 };
-
-

+ 61 - 61
mpi/examples/stencil/stencil5.c

@@ -25,15 +25,15 @@ void stencil5_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 	unsigned *xym1 = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[3]);
 	unsigned *xym1 = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[3]);
 	unsigned *xyp1 = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[4]);
 	unsigned *xyp1 = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[4]);
 
 
-        //        fprintf(stdout, "VALUES: %d %d %d %d %d\n", *xy, *xm1y, *xp1y, *xym1, *xyp1);
-        *xy = (*xy + *xm1y + *xp1y + *xym1 + *xyp1) / 5;
+	//fprintf(stdout, "VALUES: %d %d %d %d %d\n", *xy, *xm1y, *xp1y, *xym1, *xyp1);
+	*xy = (*xy + *xm1y + *xp1y + *xym1 + *xyp1) / 5;
 }
 }
 
 
 struct starpu_codelet stencil5_cl =
 struct starpu_codelet stencil5_cl =
 {
 {
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
 	.cpu_funcs = {stencil5_cpu, NULL},
 	.cpu_funcs = {stencil5_cpu, NULL},
-        .nbuffers = 5,
+	.nbuffers = 5,
 	.modes = {STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
 	.modes = {STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
 };
 };
 
 
@@ -75,92 +75,92 @@ static void parse_args(int argc, char **argv)
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-        int my_rank, size, x, y, loop;
-        int value=0, mean=0;
-        unsigned matrix[X][Y];
-        starpu_data_handle_t data_handles[X][Y];
+	int my_rank, size, x, y, loop;
+	int value=0, mean=0;
+	unsigned matrix[X][Y];
+	starpu_data_handle_t data_handles[X][Y];
 
 
 	int ret = starpu_init(NULL);
 	int ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	starpu_mpi_init(&argc, &argv);
+	starpu_mpi_init(&argc, &argv, 1);
 	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
-        parse_args(argc, argv);
+	parse_args(argc, argv);
 
 
-        for(x = 0; x < X; x++)
+	for(x = 0; x < X; x++)
 	{
 	{
-                for (y = 0; y < Y; y++)
+		for (y = 0; y < Y; y++)
 		{
 		{
-                        matrix[x][y] = (my_rank+1)*10 + value;
-                        value++;
-                        mean += matrix[x][y];
-                }
-        }
-        mean /= value;
-
-        for(x = 0; x < X; x++)
+			matrix[x][y] = (my_rank+1)*10 + value;
+			value++;
+			mean += matrix[x][y];
+		}
+	}
+	mean /= value;
+
+	for(x = 0; x < X; x++)
 	{
 	{
-                for (y = 0; y < Y; y++)
+		for (y = 0; y < Y; y++)
 		{
 		{
-                        int mpi_rank = my_distrib(x, y, size);
-                        if (mpi_rank == my_rank)
+			int mpi_rank = my_distrib(x, y, size);
+			if (mpi_rank == my_rank)
 			{
 			{
-                                //fprintf(stderr, "[%d] Owning data[%d][%d]\n", my_rank, x, y);
-                                starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
-                        }
+				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", my_rank, x, y);
+				starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
+			}
 			else if (my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
 			else if (my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
-			      || my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size))
+				 || my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size))
 			{
 			{
-                                /* I don't own that index, but will need it for my computations */
-                                //fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", my_rank, x, y);
-                                starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
-                        }
-                        else
+				/* I don't own that index, but will need it for my computations */
+				//fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", my_rank, x, y);
+				starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
+			}
+			else
 			{
 			{
-                                /* I know it's useless to allocate anything for this */
-                                data_handles[x][y] = NULL;
-                        }
-                        if (data_handles[x][y])
+				/* I know it's useless to allocate anything for this */
+				data_handles[x][y] = NULL;
+			}
+			if (data_handles[x][y])
 			{
 			{
-                                starpu_data_set_rank(data_handles[x][y], mpi_rank);
-                                starpu_data_set_tag(data_handles[x][y], (y*X)+x);
+				starpu_data_set_rank(data_handles[x][y], mpi_rank);
+				starpu_data_set_tag(data_handles[x][y], (y*X)+x);
 			}
 			}
-                }
-        }
+		}
+	}
 
 
-        for(loop=0 ; loop<niter; loop++)
+	for(loop=0 ; loop<niter; loop++)
 	{
 	{
-                for (x = 1; x < X-1; x++)
+		for (x = 1; x < X-1; x++)
 		{
 		{
-                        for (y = 1; y < Y-1; y++)
+			for (y = 1; y < Y-1; y++)
 			{
 			{
-                                starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
-                                                       STARPU_R, data_handles[x-1][y], STARPU_R, data_handles[x+1][y],
-                                                       STARPU_R, data_handles[x][y-1], STARPU_R, data_handles[x][y+1],
-                                                       0);
-                        }
-                }
-        }
-        fprintf(stderr, "Waiting ...\n");
-        starpu_task_wait_for_all();
+				starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
+						       STARPU_R, data_handles[x-1][y], STARPU_R, data_handles[x+1][y],
+						       STARPU_R, data_handles[x][y-1], STARPU_R, data_handles[x][y+1],
+						       0);
+			}
+		}
+	}
+	fprintf(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
 
 
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
 
 
-        if (display)
+	if (display)
 	{
 	{
-                fprintf(stdout, "[%d] mean=%d\n", my_rank, mean);
-                for(x = 0; x < X; x++)
+		fprintf(stdout, "[%d] mean=%d\n", my_rank, mean);
+		for(x = 0; x < X; x++)
 		{
 		{
-                        fprintf(stdout, "[%d] ", my_rank);
-                        for (y = 0; y < Y; y++)
+			fprintf(stdout, "[%d] ", my_rank);
+			for (y = 0; y < Y; y++)
 			{
 			{
-                                fprintf(stdout, "%3u ", matrix[x][y]);
-                        }
-                        fprintf(stdout, "\n");
-                }
-        }
+				fprintf(stdout, "%3u ", matrix[x][y]);
+			}
+			fprintf(stdout, "\n");
+		}
+	}
 
 
 	return 0;
 	return 0;
 }
 }

+ 5 - 1
mpi/include/starpu_mpi.h

@@ -39,8 +39,8 @@ int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int
 int starpu_mpi_wait(starpu_mpi_req *req, MPI_Status *status);
 int starpu_mpi_wait(starpu_mpi_req *req, MPI_Status *status);
 int starpu_mpi_test(starpu_mpi_req *req, int *flag, MPI_Status *status);
 int starpu_mpi_test(starpu_mpi_req *req, int *flag, MPI_Status *status);
 int starpu_mpi_barrier(MPI_Comm comm);
 int starpu_mpi_barrier(MPI_Comm comm);
-int starpu_mpi_init(int *argc, char ***argv);
 
 
+int starpu_mpi_init(int *argc, char ***argv, int initialize_mpi);
 int starpu_mpi_initialize(void) STARPU_DEPRECATED;
 int starpu_mpi_initialize(void) STARPU_DEPRECATED;
 int starpu_mpi_initialize_extended(int *rank, int *world_size) STARPU_DEPRECATED;
 int starpu_mpi_initialize_extended(int *rank, int *world_size) STARPU_DEPRECATED;
 int starpu_mpi_shutdown(void);
 int starpu_mpi_shutdown(void);
@@ -66,6 +66,10 @@ int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_
 
 
 /* retrieve the current amount of communications from the current node */
 /* retrieve the current amount of communications from the current node */
 void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts);
 void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts);
+
+void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle);
+void starpu_mpi_cache_flush_all_data(MPI_Comm comm);
+
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }
 #endif
 #endif

+ 181 - 152
mpi/src/starpu_mpi.c

@@ -38,24 +38,25 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t dat
 							int dest, int mpi_tag, MPI_Comm comm,
 							int dest, int mpi_tag, MPI_Comm comm,
 							unsigned detached, void (*callback)(void *), void *arg);
 							unsigned detached, void (*callback)(void *), void *arg);
 static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, unsigned detached, void (*callback)(void *), void *arg);
 static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, unsigned detached, void (*callback)(void *), void *arg);
+static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req);
 
 
 /* The list of requests that have been newly submitted by the application */
 /* The list of requests that have been newly submitted by the application */
 static struct _starpu_mpi_req_list *new_requests;
 static struct _starpu_mpi_req_list *new_requests;
 
 
 /* The list of detached requests that have already been submitted to MPI */
 /* The list of detached requests that have already been submitted to MPI */
 static struct _starpu_mpi_req_list *detached_requests;
 static struct _starpu_mpi_req_list *detached_requests;
-static pthread_mutex_t detached_requests_mutex;
+static _starpu_pthread_mutex_t detached_requests_mutex;
 
 
 /* Condition to wake up progression thread */
 /* Condition to wake up progression thread */
-static pthread_cond_t cond_progression;
+static _starpu_pthread_cond_t cond_progression;
 /* Condition to wake up waiting for all current MPI requests to finish */
 /* Condition to wake up waiting for all current MPI requests to finish */
-static pthread_cond_t cond_finished;
-static pthread_mutex_t mutex;
+static _starpu_pthread_cond_t cond_finished;
+static _starpu_pthread_mutex_t mutex;
 static pthread_t progress_thread;
 static pthread_t progress_thread;
 static int running = 0;
 static int running = 0;
 
 
 /* Count requests posted by the application and not yet submitted to MPI, i.e pushed into the new_requests list */
 /* Count requests posted by the application and not yet submitted to MPI, i.e pushed into the new_requests list */
-static pthread_mutex_t mutex_posted_requests;
+static _starpu_pthread_mutex_t mutex_posted_requests;
 static int posted_requests = 0, newer_requests, barrier_running = 0;
 static int posted_requests = 0, newer_requests, barrier_running = 0;
 
 
 #define _STARPU_MPI_INC_POSTED_REQUESTS(value) { _STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; _STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); }
 #define _STARPU_MPI_INC_POSTED_REQUESTS(value) { _STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; _STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); }
@@ -74,11 +75,11 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_irecv_common(starpu_data_handle
 							      enum starpu_access_mode mode)
 							      enum starpu_access_mode mode)
 {
 {
 
 
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	struct _starpu_mpi_req *req = calloc(1, sizeof(struct _starpu_mpi_req));
 	struct _starpu_mpi_req *req = calloc(1, sizeof(struct _starpu_mpi_req));
 	STARPU_ASSERT(req);
 	STARPU_ASSERT(req);
 
 
-        _STARPU_MPI_INC_POSTED_REQUESTS(1);
+	_STARPU_MPI_INC_POSTED_REQUESTS(1);
 
 
 	/* Initialize the request structure */
 	/* Initialize the request structure */
 	req->submitted = 0;
 	req->submitted = 0;
@@ -101,10 +102,10 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_irecv_common(starpu_data_handle
 
 
 	/* Asynchronously request StarPU to fetch the data in main memory: when
 	/* Asynchronously request StarPU to fetch the data in main memory: when
 	 * it is available in main memory, _starpu_mpi_submit_new_mpi_request(req) is called and
 	 * it is available in main memory, _starpu_mpi_submit_new_mpi_request(req) is called and
-	 * the request is actually submitted  */
+	 * the request is actually submitted */
 	starpu_data_acquire_cb(data_handle, mode, _starpu_mpi_submit_new_mpi_request, (void *)req);
 	starpu_data_acquire_cb(data_handle, mode, _starpu_mpi_submit_new_mpi_request, (void *)req);
 
 
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 	return req;
 	return req;
 }
 }
 
 
@@ -116,16 +117,16 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_irecv_common(starpu_data_handle
 
 
 static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
 static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 
 
 	STARPU_ASSERT(req->ptr);
 	STARPU_ASSERT(req->ptr);
 
 
-        _STARPU_MPI_DEBUG("post MPI isend tag %d dst %d ptr %p datatype %p count %d req %p\n", req->mpi_tag, req->srcdst, req->ptr, req->datatype, (int)req->count, &req->request);
+	_STARPU_MPI_DEBUG("post MPI isend tag %d dst %d ptr %p datatype %p count %d req %p\n", req->mpi_tag, req->srcdst, req->ptr, req->datatype, (int)req->count, &req->request);
 
 
 	_starpu_mpi_comm_amounts_inc(req->comm, req->srcdst, req->datatype, req->count);
 	_starpu_mpi_comm_amounts_inc(req->comm, req->srcdst, req->datatype, req->count);
 
 
-        req->ret = MPI_Isend(req->ptr, req->count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
-        STARPU_ASSERT(req->ret == MPI_SUCCESS);
+	req->ret = MPI_Isend(req->ptr, req->count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
+	STARPU_ASSERT(req->ret == MPI_SUCCESS);
 
 
 	TRACE_MPI_ISEND(req->srcdst, req->mpi_tag, 0);
 	TRACE_MPI_ISEND(req->srcdst, req->mpi_tag, 0);
 
 
@@ -134,7 +135,10 @@ static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
 	req->submitted = 1;
 	req->submitted = 1;
 	_STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
 	_STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
-        _STARPU_MPI_LOG_OUT();
+
+	_starpu_mpi_handle_detached_request(req);
+
+	_STARPU_MPI_LOG_OUT();
 }
 }
 
 
 static void _starpu_mpi_isend_size_callback(void *arg)
 static void _starpu_mpi_isend_size_callback(void *arg)
@@ -145,8 +149,8 @@ static void _starpu_mpi_isend_size_callback(void *arg)
 
 
 static void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
 static void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
 {
 {
-	req->needs_unpacking = starpu_mpi_handle_to_datatype(req->data_handle, &req->datatype);
-	if (!req->needs_unpacking)
+	_starpu_mpi_handle_allocate_datatype(req->data_handle, &req->datatype, &req->user_datatype);
+	if (req->user_datatype == 0)
 	{
 	{
 		req->count = 1;
 		req->count = 1;
 		req->ptr = starpu_handle_get_local_ptr(req->data_handle);
 		req->ptr = starpu_handle_get_local_ptr(req->data_handle);
@@ -172,7 +176,7 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t dat
 
 
 int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, int mpi_tag, MPI_Comm comm)
 int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, int mpi_tag, MPI_Comm comm)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	STARPU_ASSERT(public_req);
 	STARPU_ASSERT(public_req);
 
 
 	struct _starpu_mpi_req *req;
 	struct _starpu_mpi_req *req;
@@ -181,17 +185,17 @@ int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *public_re
 	STARPU_ASSERT(req);
 	STARPU_ASSERT(req);
 	*public_req = req;
 	*public_req = req;
 
 
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 	return 0;
 	return 0;
 }
 }
 
 
 int starpu_mpi_isend_detached(starpu_data_handle_t data_handle,
 int starpu_mpi_isend_detached(starpu_data_handle_t data_handle,
 			      int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 			      int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	_starpu_mpi_isend_common(data_handle, dest, mpi_tag, comm, 1, callback, arg);
 	_starpu_mpi_isend_common(data_handle, dest, mpi_tag, comm, 1, callback, arg);
 
 
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 	return 0;
 	return 0;
 }
 }
 
 
@@ -200,13 +204,13 @@ int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI
 	starpu_mpi_req req;
 	starpu_mpi_req req;
 	MPI_Status status;
 	MPI_Status status;
 
 
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	memset(&status, 0, sizeof(MPI_Status));
 	memset(&status, 0, sizeof(MPI_Status));
 
 
 	starpu_mpi_isend(data_handle, &req, dest, mpi_tag, comm);
 	starpu_mpi_isend(data_handle, &req, dest, mpi_tag, comm);
 	starpu_mpi_wait(&req, &status);
 	starpu_mpi_wait(&req, &status);
 
 
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 	return 0;
 	return 0;
 }
 }
 
 
@@ -218,39 +222,52 @@ int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI
 
 
 static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 
 
 	STARPU_ASSERT(req->ptr);
 	STARPU_ASSERT(req->ptr);
 
 
 	_STARPU_MPI_DEBUG("post MPI irecv tag %d src %d data %p ptr %p datatype %p count %d req %p \n", req->mpi_tag, req->srcdst, req->data_handle, req->ptr, req->datatype, (int)req->count, &req->request);
 	_STARPU_MPI_DEBUG("post MPI irecv tag %d src %d data %p ptr %p datatype %p count %d req %p \n", req->mpi_tag, req->srcdst, req->data_handle, req->ptr, req->datatype, (int)req->count, &req->request);
 
 
-        req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
-        STARPU_ASSERT(req->ret == MPI_SUCCESS);
+	req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
+	STARPU_ASSERT(req->ret == MPI_SUCCESS);
 
 
 	/* somebody is perhaps waiting for the MPI request to be posted */
 	/* somebody is perhaps waiting for the MPI request to be posted */
 	_STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
 	_STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
 	req->submitted = 1;
 	req->submitted = 1;
 	_STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
 	_STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
-        _STARPU_MPI_LOG_OUT();
+
+	_starpu_mpi_handle_detached_request(req);
+
+	_STARPU_MPI_LOG_OUT();
 }
 }
 
 
+struct _starpu_mpi_irecv_size_callback
+{
+	starpu_data_handle_t handle;
+	struct _starpu_mpi_req *req;
+};
+
 static void _starpu_mpi_irecv_size_callback(void *arg)
 static void _starpu_mpi_irecv_size_callback(void *arg)
 {
 {
-	struct _starpu_mpi_req *req = (struct _starpu_mpi_req *) arg;
+	struct _starpu_mpi_irecv_size_callback *callback = (struct _starpu_mpi_irecv_size_callback *)arg;
+
+	starpu_data_unregister(callback->handle);
+	callback->req->ptr = malloc(callback->req->count);
 #ifdef STARPU_DEVEL
 #ifdef STARPU_DEVEL
-#  warning TODO: are we sure that req->count can be used as we have not released count_handle?
+#warning TODO: in some cases, callback->req->count is incorrect, we need to fix that
 #endif
 #endif
-	req->ptr = malloc(req->count);
-	_starpu_mpi_irecv_data_func(req);
+	STARPU_ASSERT_MSG(callback->req->ptr, "cannot allocate message of size %ld\n", callback->req->count);
+	_starpu_mpi_irecv_data_func(callback->req);
+	free(callback);
 }
 }
 
 
 static void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req)
 static void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 
 
-	req->needs_unpacking = starpu_mpi_handle_to_datatype(req->data_handle, &req->datatype);
-	if (!req->needs_unpacking)
+	_starpu_mpi_handle_allocate_datatype(req->data_handle, &req->datatype, &req->user_datatype);
+	if (req->user_datatype == 0)
 	{
 	{
 		req->count = 1;
 		req->count = 1;
 		req->ptr = starpu_handle_get_local_ptr(req->data_handle);
 		req->ptr = starpu_handle_get_local_ptr(req->data_handle);
@@ -260,9 +277,11 @@ static void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req)
 	{
 	{
 		starpu_data_handle_t count_handle;
 		starpu_data_handle_t count_handle;
 
 
+		struct _starpu_mpi_irecv_size_callback *callback = malloc(sizeof(struct _starpu_mpi_irecv_size_callback));
 		starpu_variable_data_register(&count_handle, 0, (uintptr_t)&req->count, sizeof(req->count));
 		starpu_variable_data_register(&count_handle, 0, (uintptr_t)&req->count, sizeof(req->count));
-		_starpu_mpi_irecv_common(count_handle, req->srcdst, req->mpi_tag, req->comm, 1, _starpu_mpi_irecv_size_callback, req);
-		starpu_data_unregister_submit(count_handle);
+		callback->handle = count_handle;
+		callback->req = req;
+		_starpu_mpi_irecv_common(count_handle, req->srcdst, req->mpi_tag, req->comm, 1, _starpu_mpi_irecv_size_callback, callback);
 	}
 	}
 }
 }
 
 
@@ -273,7 +292,7 @@ static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t dat
 
 
 int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int source, int mpi_tag, MPI_Comm comm)
 int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int source, int mpi_tag, MPI_Comm comm)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	STARPU_ASSERT(public_req);
 	STARPU_ASSERT(public_req);
 
 
 	struct _starpu_mpi_req *req;
 	struct _starpu_mpi_req *req;
@@ -282,15 +301,15 @@ int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_re
 	STARPU_ASSERT(req);
 	STARPU_ASSERT(req);
 	*public_req = req;
 	*public_req = req;
 
 
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 	return 0;
 	return 0;
 }
 }
 
 
 int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	_starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 1, callback, arg);
 	_starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 1, callback, arg);
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 	return 0;
 	return 0;
 }
 }
 
 
@@ -298,11 +317,11 @@ int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, M
 {
 {
 	starpu_mpi_req req;
 	starpu_mpi_req req;
 
 
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	starpu_mpi_irecv(data_handle, &req, source, mpi_tag, comm);
 	starpu_mpi_irecv(data_handle, &req, source, mpi_tag, comm);
 	starpu_mpi_wait(&req, status);
 	starpu_mpi_wait(&req, status);
 
 
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 	return 0;
 	return 0;
 }
 }
 
 
@@ -314,26 +333,26 @@ int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, M
 
 
 static void _starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
 static void _starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	/* Which is the mpi request we are waiting for ? */
 	/* Which is the mpi request we are waiting for ? */
 	struct _starpu_mpi_req *req = waiting_req->other_request;
 	struct _starpu_mpi_req *req = waiting_req->other_request;
 
 
 	req->ret = MPI_Wait(&req->request, waiting_req->status);
 	req->ret = MPI_Wait(&req->request, waiting_req->status);
-        STARPU_ASSERT(req->ret == MPI_SUCCESS);
+	STARPU_ASSERT(req->ret == MPI_SUCCESS);
 
 
 	_starpu_mpi_handle_request_termination(req);
 	_starpu_mpi_handle_request_termination(req);
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 }
 }
 
 
 int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	int ret;
 	int ret;
 	struct _starpu_mpi_req *waiting_req = calloc(1, sizeof(struct _starpu_mpi_req));
 	struct _starpu_mpi_req *waiting_req = calloc(1, sizeof(struct _starpu_mpi_req));
 	STARPU_ASSERT(waiting_req);
 	STARPU_ASSERT(waiting_req);
 	struct _starpu_mpi_req *req = *public_req;
 	struct _starpu_mpi_req *req = *public_req;
 
 
-        _STARPU_MPI_INC_POSTED_REQUESTS(1);
+	_STARPU_MPI_INC_POSTED_REQUESTS(1);
 
 
 	/* We cannot try to complete a MPI request that was not actually posted
 	/* We cannot try to complete a MPI request that was not actually posted
 	 * to MPI yet. */
 	 * to MPI yet. */
@@ -364,8 +383,8 @@ int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 	*public_req = NULL;
 	*public_req = NULL;
 	free(req);
 	free(req);
 
 
-        //free(waiting_req);
-        _STARPU_MPI_LOG_OUT();
+	free(waiting_req);
+	_STARPU_MPI_LOG_OUT();
 	return ret;
 	return ret;
 }
 }
 
 
@@ -377,13 +396,13 @@ int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 
 
 static void _starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 static void _starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	/* Which is the mpi request we are testing for ? */
 	/* Which is the mpi request we are testing for ? */
 	struct _starpu_mpi_req *req = testing_req->other_request;
 	struct _starpu_mpi_req *req = testing_req->other_request;
 
 
-        _STARPU_MPI_DEBUG("Test request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, _starpu_mpi_request_type(req->request_type), req->srcdst);
+	_STARPU_MPI_DEBUG("Test request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, _starpu_mpi_request_type(req->request_type), req->srcdst);
 	req->ret = MPI_Test(&req->request, testing_req->flag, testing_req->status);
 	req->ret = MPI_Test(&req->request, testing_req->flag, testing_req->status);
-        STARPU_ASSERT(req->ret == MPI_SUCCESS);
+	STARPU_ASSERT(req->ret == MPI_SUCCESS);
 
 
 	if (*testing_req->flag)
 	if (*testing_req->flag)
 	{
 	{
@@ -395,12 +414,12 @@ static void _starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 	testing_req->completed = 1;
 	testing_req->completed = 1;
 	_STARPU_PTHREAD_COND_SIGNAL(&testing_req->req_cond);
 	_STARPU_PTHREAD_COND_SIGNAL(&testing_req->req_cond);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&testing_req->req_mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&testing_req->req_mutex);
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 }
 }
 
 
 int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	int ret = 0;
 	int ret = 0;
 
 
 	STARPU_ASSERT(public_req);
 	STARPU_ASSERT(public_req);
@@ -416,8 +435,8 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 	if (submitted)
 	if (submitted)
 	{
 	{
 		struct _starpu_mpi_req *testing_req = calloc(1, sizeof(struct _starpu_mpi_req));
 		struct _starpu_mpi_req *testing_req = calloc(1, sizeof(struct _starpu_mpi_req));
-                STARPU_ASSERT(testing_req);
-                //		memset(testing_req, 0, sizeof(struct _starpu_mpi_req));
+		STARPU_ASSERT(testing_req);
+		//		memset(testing_req, 0, sizeof(struct _starpu_mpi_req));
 
 
 		/* Initialize the request structure */
 		/* Initialize the request structure */
 		_STARPU_PTHREAD_MUTEX_INIT(&(testing_req->req_mutex), NULL);
 		_STARPU_PTHREAD_MUTEX_INIT(&(testing_req->req_mutex), NULL);
@@ -427,15 +446,15 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 		testing_req->other_request = req;
 		testing_req->other_request = req;
 		testing_req->func = _starpu_mpi_test_func;
 		testing_req->func = _starpu_mpi_test_func;
 		testing_req->completed = 0;
 		testing_req->completed = 0;
-                testing_req->request_type = TEST_REQ;
+		testing_req->request_type = TEST_REQ;
 
 
-                _STARPU_MPI_INC_POSTED_REQUESTS(1);
-                _starpu_mpi_submit_new_mpi_request(testing_req);
+		_STARPU_MPI_INC_POSTED_REQUESTS(1);
+		_starpu_mpi_submit_new_mpi_request(testing_req);
 
 
 		/* We wait for the test request to finish */
 		/* We wait for the test request to finish */
 		_STARPU_PTHREAD_MUTEX_LOCK(&(testing_req->req_mutex));
 		_STARPU_PTHREAD_MUTEX_LOCK(&(testing_req->req_mutex));
 		while (!(testing_req->completed))
 		while (!(testing_req->completed))
-                        _STARPU_PTHREAD_COND_WAIT(&(testing_req->req_cond), &(testing_req->req_mutex));
+			_STARPU_PTHREAD_COND_WAIT(&(testing_req->req_cond), &(testing_req->req_mutex));
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&(testing_req->req_mutex));
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&(testing_req->req_mutex));
 
 
 		ret = testing_req->ret;
 		ret = testing_req->ret;
@@ -448,13 +467,15 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 			*public_req = NULL;
 			*public_req = NULL;
 			free(req);
 			free(req);
 		}
 		}
+
+		free(testing_req);
 	}
 	}
 	else
 	else
 	{
 	{
 		*flag = 0;
 		*flag = 0;
 	}
 	}
 
 
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 	return ret;
 	return ret;
 }
 }
 
 
@@ -466,18 +487,18 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 
 
 static void _starpu_mpi_barrier_func(struct _starpu_mpi_req *barrier_req)
 static void _starpu_mpi_barrier_func(struct _starpu_mpi_req *barrier_req)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 
 
 	barrier_req->ret = MPI_Barrier(barrier_req->comm);
 	barrier_req->ret = MPI_Barrier(barrier_req->comm);
-        STARPU_ASSERT(barrier_req->ret == MPI_SUCCESS);
+	STARPU_ASSERT(barrier_req->ret == MPI_SUCCESS);
 
 
 	_starpu_mpi_handle_request_termination(barrier_req);
 	_starpu_mpi_handle_request_termination(barrier_req);
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 }
 }
 
 
 int starpu_mpi_barrier(MPI_Comm comm)
 int starpu_mpi_barrier(MPI_Comm comm)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	int ret;
 	int ret;
 	struct _starpu_mpi_req *barrier_req = calloc(1, sizeof(struct _starpu_mpi_req));
 	struct _starpu_mpi_req *barrier_req = calloc(1, sizeof(struct _starpu_mpi_req));
 	STARPU_ASSERT(barrier_req);
 	STARPU_ASSERT(barrier_req);
@@ -513,7 +534,7 @@ int starpu_mpi_barrier(MPI_Comm comm)
 	barrier_req->request_type = BARRIER_REQ;
 	barrier_req->request_type = BARRIER_REQ;
 	barrier_req->comm = comm;
 	barrier_req->comm = comm;
 
 
-        _STARPU_MPI_INC_POSTED_REQUESTS(1);
+	_STARPU_MPI_INC_POSTED_REQUESTS(1);
 	_starpu_mpi_submit_new_mpi_request(barrier_req);
 	_starpu_mpi_submit_new_mpi_request(barrier_req);
 
 
 	/* We wait for the MPI request to finish */
 	/* We wait for the MPI request to finish */
@@ -524,8 +545,8 @@ int starpu_mpi_barrier(MPI_Comm comm)
 
 
 	ret = barrier_req->ret;
 	ret = barrier_req->ret;
 
 
-        //free(waiting_req);
-        _STARPU_MPI_LOG_OUT();
+	//free(waiting_req);
+	_STARPU_MPI_LOG_OUT();
 	return ret;
 	return ret;
 }
 }
 
 
@@ -538,31 +559,39 @@ int starpu_mpi_barrier(MPI_Comm comm)
 #ifdef STARPU_MPI_VERBOSE
 #ifdef STARPU_MPI_VERBOSE
 static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type)
 static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type)
 {
 {
-        switch (request_type)
-                {
-                case SEND_REQ: return "SEND_REQ";
-                case RECV_REQ: return "RECV_REQ";
-                case WAIT_REQ: return "WAIT_REQ";
-                case TEST_REQ: return "TEST_REQ";
-                case BARRIER_REQ: return "BARRIER_REQ";
-                default: return "unknown request type";
-                }
+	switch (request_type)
+		{
+		case SEND_REQ: return "SEND_REQ";
+		case RECV_REQ: return "RECV_REQ";
+		case WAIT_REQ: return "WAIT_REQ";
+		case TEST_REQ: return "TEST_REQ";
+		case BARRIER_REQ: return "BARRIER_REQ";
+		default: return "unknown request type";
+		}
 }
 }
 #endif
 #endif
 
 
 static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 
 
 	_STARPU_MPI_DEBUG("complete MPI (%s %d) data %p req %p - tag %d\n", _starpu_mpi_request_type(req->request_type), req->srcdst, req->data_handle, &req->request, req->mpi_tag);
 	_STARPU_MPI_DEBUG("complete MPI (%s %d) data %p req %p - tag %d\n", _starpu_mpi_request_type(req->request_type), req->srcdst, req->data_handle, &req->request, req->mpi_tag);
-        if (req->request_type != BARRIER_REQ)
+	if (req->request_type == RECV_REQ || req->request_type == SEND_REQ)
 	{
 	{
-		if (req->needs_unpacking)
-			starpu_handle_unpack_data(req->data_handle, req->ptr, req->count);
+		if (req->user_datatype == 1)
+		{
+			if (req->request_type == RECV_REQ)
+				// req->ptr is freed by starpu_handle_unpack_data
+				starpu_handle_unpack_data(req->data_handle, req->ptr, req->count);
+			else
+				free(req->ptr);
+		}
 		else
 		else
-			MPI_Type_free(&req->datatype);
-                starpu_data_release(req->data_handle);
-        }
+		{
+			_starpu_mpi_handle_free_datatype(req->data_handle, &req->datatype);
+		}
+		starpu_data_release(req->data_handle);
+	}
 
 
 	if (req->request_type == RECV_REQ)
 	if (req->request_type == RECV_REQ)
 	{
 	{
@@ -579,23 +608,23 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 	req->completed = 1;
 	req->completed = 1;
 	_STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
 	_STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 }
 }
 
 
 static void _starpu_mpi_submit_new_mpi_request(void *arg)
 static void _starpu_mpi_submit_new_mpi_request(void *arg)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	struct _starpu_mpi_req *req = arg;
 	struct _starpu_mpi_req *req = arg;
 
 
-        _STARPU_MPI_INC_POSTED_REQUESTS(-1);
+	_STARPU_MPI_INC_POSTED_REQUESTS(-1);
 
 
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	_starpu_mpi_req_list_push_front(new_requests, req);
 	_starpu_mpi_req_list_push_front(new_requests, req);
 	newer_requests = 1;
 	newer_requests = 1;
-        _STARPU_MPI_DEBUG("Pushing new request type %s\n", _starpu_mpi_request_type(req->request_type));
+	_STARPU_MPI_DEBUG("Pushing new request type %s\n", _starpu_mpi_request_type(req->request_type));
 	_STARPU_PTHREAD_COND_BROADCAST(&cond_progression);
 	_STARPU_PTHREAD_COND_BROADCAST(&cond_progression);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 }
 }
 
 
 #ifdef USE_STARPU_ACTIVITY
 #ifdef USE_STARPU_ACTIVITY
@@ -617,7 +646,7 @@ static unsigned _starpu_mpi_progression_hook_func(void *arg __attribute__((unuse
 
 
 static void _starpu_mpi_test_detached_requests(void)
 static void _starpu_mpi_test_detached_requests(void)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	int flag;
 	int flag;
 	MPI_Status status;
 	MPI_Status status;
 	struct _starpu_mpi_req *req, *next_req;
 	struct _starpu_mpi_req *req, *next_req;
@@ -632,7 +661,7 @@ static void _starpu_mpi_test_detached_requests(void)
 
 
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 
 
-                //_STARPU_MPI_DEBUG("Test detached request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, _starpu_mpi_request_type(req->request_type), req->srcdst);
+		//_STARPU_MPI_DEBUG("Test detached request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, _starpu_mpi_request_type(req->request_type), req->srcdst);
 		req->ret = MPI_Test(&req->request, &flag, &status);
 		req->ret = MPI_Test(&req->request, &flag, &status);
 		STARPU_ASSERT(req->ret == MPI_SUCCESS);
 		STARPU_ASSERT(req->ret == MPI_SUCCESS);
 
 
@@ -644,29 +673,19 @@ static void _starpu_mpi_test_detached_requests(void)
 		_STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
 		_STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
 
 
 		if (flag)
 		if (flag)
+		{
 			_starpu_mpi_req_list_erase(detached_requests, req);
 			_starpu_mpi_req_list_erase(detached_requests, req);
+			free(req);
+		}
 
 
-#ifdef STARPU_DEVEL
-#warning TODO fix memleak
-#endif
-		/* Detached requests are automatically allocated by the lib */
-		//if (req->detached)
-		//	free(req);
 	}
 	}
 
 
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 }
 }
 
 
-static void _starpu_mpi_handle_new_request(struct _starpu_mpi_req *req)
+static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req)
 {
 {
-        _STARPU_MPI_LOG_IN();
-	STARPU_ASSERT(req);
-
-	/* submit the request to MPI */
-        _STARPU_MPI_DEBUG("Handling new request type %s\n", _starpu_mpi_request_type(req->request_type));
-	req->func(req);
-
 	if (req->detached)
 	if (req->detached)
 	{
 	{
 		_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 		_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
@@ -681,59 +700,68 @@ static void _starpu_mpi_handle_new_request(struct _starpu_mpi_req *req)
 		_STARPU_PTHREAD_COND_SIGNAL(&cond_progression);
 		_STARPU_PTHREAD_COND_SIGNAL(&cond_progression);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	}
 	}
-        _STARPU_MPI_LOG_OUT();
+}
+
+static void _starpu_mpi_handle_new_request(struct _starpu_mpi_req *req)
+{
+	_STARPU_MPI_LOG_IN();
+	STARPU_ASSERT(req);
+
+	/* submit the request to MPI */
+	_STARPU_MPI_DEBUG("Handling new request type %s\n", _starpu_mpi_request_type(req->request_type));
+	req->func(req);
+
+	_STARPU_MPI_LOG_OUT();
 }
 }
 
 
 struct _starpu_mpi_argc_argv
 struct _starpu_mpi_argc_argv
 {
 {
+	int initialize_mpi;
 	int *argc;
 	int *argc;
 	char ***argv;
 	char ***argv;
 };
 };
 
 
 static void _starpu_mpi_print_thread_level_support(int thread_level, char *msg)
 static void _starpu_mpi_print_thread_level_support(int thread_level, char *msg)
 {
 {
-     switch (thread_level)
-     {
-     case MPI_THREAD_SERIALIZED:
-     {
-	  _STARPU_DISP("MPI%s MPI_THREAD_SERIALIZED; Multiple threads may make MPI calls, but only one at a time.\n", msg);
-	  break;
-     }
-     case MPI_THREAD_FUNNELED:
-     {
-	  _STARPU_DISP("MPI%s MPI_THREAD_FUNNELED; The application can safely make calls to StarPU-MPI functions, but should not call directly MPI communication functions.\n", msg);
-	  break;
-     }
-     case MPI_THREAD_SINGLE:
-     {
-	  _STARPU_DISP("MPI%s MPI_THREAD_SINGLE; MPI does not have multi-thread support, this might cause problems. The application can make calls to StarPU-MPI functions, but not call directly MPI Communication functions.\n", msg);
-	  break;
-     }
-     }
+	switch (thread_level)
+	{
+	case MPI_THREAD_SERIALIZED:
+	{
+		_STARPU_DISP("MPI%s MPI_THREAD_SERIALIZED; Multiple threads may make MPI calls, but only one at a time.\n", msg);
+		break;
+	}
+	case MPI_THREAD_FUNNELED:
+	{
+		_STARPU_DISP("MPI%s MPI_THREAD_FUNNELED; The application can safely make calls to StarPU-MPI functions, but should not call directly MPI communication functions.\n", msg);
+		break;
+	}
+	case MPI_THREAD_SINGLE:
+	{
+		_STARPU_DISP("MPI%s MPI_THREAD_SINGLE; MPI does not have multi-thread support, this might cause problems. The application can make calls to StarPU-MPI functions, but not call directly MPI Communication functions.\n", msg);
+		break;
+	}
+	}
 }
 }
 
 
 static void *_starpu_mpi_progress_thread_func(void *arg)
 static void *_starpu_mpi_progress_thread_func(void *arg)
 {
 {
 	struct _starpu_mpi_argc_argv *argc_argv = (struct _starpu_mpi_argc_argv *) arg;
 	struct _starpu_mpi_argc_argv *argc_argv = (struct _starpu_mpi_argc_argv *) arg;
-	int flag;
 
 
-	MPI_Initialized(&flag);
-	_STARPU_DEBUG("MPI_Initialized %d\n", flag);
-	if (flag == 0)
+	if (argc_argv->initialize_mpi)
 	{
 	{
 		int thread_support;
 		int thread_support;
-                _STARPU_DEBUG("Calling MPI_Init_thread\n");
+		_STARPU_DEBUG("Calling MPI_Init_thread\n");
 		if (MPI_Init_thread(argc_argv->argc, argc_argv->argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS)
 		if (MPI_Init_thread(argc_argv->argc, argc_argv->argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS)
 		{
 		{
 			_STARPU_ERROR("MPI_Init_thread failed\n");
 			_STARPU_ERROR("MPI_Init_thread failed\n");
-                }
+		}
 		_starpu_mpi_print_thread_level_support(thread_support, "_Init_thread level =");
 		_starpu_mpi_print_thread_level_support(thread_support, "_Init_thread level =");
-        }
+	}
 	else
 	else
 	{
 	{
-	     int provided;
-	     MPI_Query_thread(&provided);
-	     _starpu_mpi_print_thread_level_support(provided, " has been initialized with");
+		int provided;
+		MPI_Query_thread(&provided);
+		_starpu_mpi_print_thread_level_support(provided, " has been initialized with");
 	}
 	}
 
 
 	/* notify the main thread that the progression thread is ready */
 	/* notify the main thread that the progression thread is ready */
@@ -754,7 +782,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 
 		if (block)
 		if (block)
 		{
 		{
-                        _STARPU_MPI_DEBUG("NO MORE REQUESTS TO HANDLE\n");
+			_STARPU_MPI_DEBUG("NO MORE REQUESTS TO HANDLE\n");
 			if (barrier_running)
 			if (barrier_running)
 				/* Tell mpi_barrier */
 				/* Tell mpi_barrier */
 				_STARPU_PTHREAD_COND_SIGNAL(&cond_finished);
 				_STARPU_PTHREAD_COND_SIGNAL(&cond_finished);
@@ -775,7 +803,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 			/* handling a request is likely to block for a while
 			/* handling a request is likely to block for a while
 			 * (on a sync_data_with_mem call), we want to let the
 			 * (on a sync_data_with_mem call), we want to let the
 			 * application submit requests in the meantime, so we
 			 * application submit requests in the meantime, so we
-			 * release the lock.  */
+			 * release the lock. */
 			_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 			_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 			_starpu_mpi_handle_new_request(req);
 			_starpu_mpi_handle_new_request(req);
 			_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 			_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
@@ -784,13 +812,13 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 
 	STARPU_ASSERT(_starpu_mpi_req_list_empty(detached_requests));
 	STARPU_ASSERT(_starpu_mpi_req_list_empty(detached_requests));
 	STARPU_ASSERT(_starpu_mpi_req_list_empty(new_requests));
 	STARPU_ASSERT(_starpu_mpi_req_list_empty(new_requests));
-        STARPU_ASSERT(posted_requests == 0);
+	STARPU_ASSERT(posted_requests == 0);
 
 
-        if (flag == 0)
+	if (argc_argv->initialize_mpi)
 	{
 	{
-                _STARPU_MPI_DEBUG("Calling MPI_Finalize()\n");
-                MPI_Finalize();
-        }
+		_STARPU_MPI_DEBUG("Calling MPI_Finalize()\n");
+		MPI_Finalize();
+	}
 
 
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
 
@@ -835,12 +863,12 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 
 
 	TRACE_MPI_BARRIER(rank, worldsize, random_number);
 	TRACE_MPI_BARRIER(rank, worldsize, random_number);
 
 
-        _STARPU_MPI_DEBUG("unique key %x\n", random_number);
+	_STARPU_MPI_DEBUG("unique key %x\n", random_number);
 #endif
 #endif
 }
 }
 
 
 static
 static
-int _starpu_mpi_initialize(int *argc, char ***argv)
+int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi)
 {
 {
 	_STARPU_PTHREAD_MUTEX_INIT(&mutex, NULL);
 	_STARPU_PTHREAD_MUTEX_INIT(&mutex, NULL);
 	_STARPU_PTHREAD_COND_INIT(&cond_progression, NULL);
 	_STARPU_PTHREAD_COND_INIT(&cond_progression, NULL);
@@ -850,12 +878,13 @@ int _starpu_mpi_initialize(int *argc, char ***argv)
 	_STARPU_PTHREAD_MUTEX_INIT(&detached_requests_mutex, NULL);
 	_STARPU_PTHREAD_MUTEX_INIT(&detached_requests_mutex, NULL);
 	detached_requests = _starpu_mpi_req_list_new();
 	detached_requests = _starpu_mpi_req_list_new();
 
 
-        _STARPU_PTHREAD_MUTEX_INIT(&mutex_posted_requests, NULL);
+	_STARPU_PTHREAD_MUTEX_INIT(&mutex_posted_requests, NULL);
 
 
 	struct _starpu_mpi_argc_argv *argc_argv = malloc(sizeof(struct _starpu_mpi_argc_argv));
 	struct _starpu_mpi_argc_argv *argc_argv = malloc(sizeof(struct _starpu_mpi_argc_argv));
+	argc_argv->initialize_mpi = initialize_mpi;
 	argc_argv->argc = argc;
 	argc_argv->argc = argc;
 	argc_argv->argv = argv;
 	argc_argv->argv = argv;
-	_STARPU_PTHREAD_CREATE(&progress_thread, NULL, _starpu_mpi_progress_thread_func, argc_argv);
+	_STARPU_PTHREAD_CREATE("MPI progress", &progress_thread, NULL, _starpu_mpi_progress_thread_func, argc_argv);
 
 
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	while (!running)
 	while (!running)
@@ -875,25 +904,25 @@ int _starpu_mpi_initialize(int *argc, char ***argv)
 
 
 	_starpu_mpi_add_sync_point_in_fxt();
 	_starpu_mpi_add_sync_point_in_fxt();
 	_starpu_mpi_comm_amounts_init(MPI_COMM_WORLD);
 	_starpu_mpi_comm_amounts_init(MPI_COMM_WORLD);
-	_starpu_mpi_tables_init(MPI_COMM_WORLD);
+	_starpu_mpi_cache_init(MPI_COMM_WORLD);
 	return 0;
 	return 0;
 }
 }
 
 
-int starpu_mpi_init(int *argc, char ***argv)
+int starpu_mpi_init(int *argc, char ***argv, int initialize_mpi)
 {
 {
-        return _starpu_mpi_initialize(argc, argv);
+	return _starpu_mpi_initialize(argc, argv, initialize_mpi);
 }
 }
 
 
 int starpu_mpi_initialize(void)
 int starpu_mpi_initialize(void)
 {
 {
-        return _starpu_mpi_initialize(NULL, NULL);
+	return _starpu_mpi_initialize(NULL, NULL, 0);
 }
 }
 
 
 int starpu_mpi_initialize_extended(int *rank, int *world_size)
 int starpu_mpi_initialize_extended(int *rank, int *world_size)
 {
 {
 	int ret;
 	int ret;
 
 
-        ret = _starpu_mpi_initialize(NULL, NULL);
+	ret = _starpu_mpi_initialize(NULL, NULL, 1);
 	if (ret == 0)
 	if (ret == 0)
 	{
 	{
 		_STARPU_DEBUG("Calling MPI_Comm_rank\n");
 		_STARPU_DEBUG("Calling MPI_Comm_rank\n");
@@ -908,7 +937,7 @@ int starpu_mpi_shutdown(void)
 	void *value;
 	void *value;
 	int rank, world_size;
 	int rank, world_size;
 
 
-	/* We need to get the  rank before calling MPI_Finalize to pass to _starpu_mpi_comm_amounts_display() */
+	/* We need to get the rank before calling MPI_Finalize to pass to _starpu_mpi_comm_amounts_display() */
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 
 
@@ -930,7 +959,7 @@ int starpu_mpi_shutdown(void)
 
 
 	_starpu_mpi_comm_amounts_display(rank);
 	_starpu_mpi_comm_amounts_display(rank);
 	_starpu_mpi_comm_amounts_free();
 	_starpu_mpi_comm_amounts_free();
-	_starpu_mpi_tables_free(world_size);
+	_starpu_mpi_cache_free(world_size);
 
 
 	return 0;
 	return 0;
 }
 }

+ 5 - 7
mpi/src/starpu_mpi_collective.c

@@ -34,6 +34,7 @@ void _callback_collective(void *arg)
 	if (callback_arg->nb == callback_arg->count)
 	if (callback_arg->nb == callback_arg->count)
 	{
 	{
 		callback_arg->callback(callback_arg->arg);
 		callback_arg->callback(callback_arg->arg);
+		free(callback_arg);
 	}
 	}
 }
 }
 
 
@@ -46,9 +47,6 @@ int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, i
 
 
 	MPI_Comm_rank(comm, &rank);
 	MPI_Comm_rank(comm, &rank);
 
 
-#ifdef STARPU_DEVEL
-#warning TODO: callback_arg needs to be free-ed
-#endif
 	callback_func = _callback_collective;
 	callback_func = _callback_collective;
 	callback_arg = malloc(sizeof(struct _callback_arg));
 	callback_arg = malloc(sizeof(struct _callback_arg));
 	callback_arg->count = 0;
 	callback_arg->count = 0;
@@ -64,7 +62,7 @@ int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, i
 
 
 	if (callback_arg)
 	if (callback_arg)
 	{
 	{
-		for(x = 0; x < count ;  x++)
+		for(x = 0; x < count ; x++)
 		{
 		{
 			if (data_handles[x])
 			if (data_handles[x])
 			{
 			{
@@ -83,7 +81,7 @@ int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, i
 		}
 		}
 	}
 	}
 
 
-	for(x = 0; x < count ;  x++)
+	for(x = 0; x < count ; x++)
 	{
 	{
 		if (data_handles[x])
 		if (data_handles[x])
 		{
 		{
@@ -132,7 +130,7 @@ int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, in
 
 
 	if (callback_arg)
 	if (callback_arg)
 	{
 	{
-		for(x = 0; x < count ;  x++)
+		for(x = 0; x < count ; x++)
 		{
 		{
 			if (data_handles[x])
 			if (data_handles[x])
 			{
 			{
@@ -151,7 +149,7 @@ int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, in
 		}
 		}
 	}
 	}
 
 
-	for(x = 0; x < count ;  x++)
+	for(x = 0; x < count ; x++)
 	{
 	{
 		if (data_handles[x])
 		if (data_handles[x])
 		{
 		{

+ 65 - 17
mpi/src/starpu_mpi_datatype.c

@@ -17,13 +17,14 @@
 
 
 #include <starpu_mpi_datatype.h>
 #include <starpu_mpi_datatype.h>
 
 
-typedef int (*handle_to_datatype_func)(starpu_data_handle_t, MPI_Datatype *);
+typedef void (*handle_to_datatype_func)(starpu_data_handle_t, MPI_Datatype *);
+typedef void (*handle_free_datatype_func)(MPI_Datatype *);
 
 
 /*
 /*
  * 	Matrix
  * 	Matrix
  */
  */
 
 
-static int handle_to_datatype_matrix(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+static void handle_to_datatype_matrix(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
 {
 {
 	int ret;
 	int ret;
 
 
@@ -37,15 +38,13 @@ static int handle_to_datatype_matrix(starpu_data_handle_t data_handle, MPI_Datat
 
 
 	ret = MPI_Type_commit(datatype);
 	ret = MPI_Type_commit(datatype);
 	STARPU_ASSERT(ret == MPI_SUCCESS);
 	STARPU_ASSERT(ret == MPI_SUCCESS);
-
-	return 0;
 }
 }
 
 
 /*
 /*
  * 	Block
  * 	Block
  */
  */
 
 
-static int handle_to_datatype_block(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+static void handle_to_datatype_block(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
 {
 {
 	int ret;
 	int ret;
 
 
@@ -68,15 +67,13 @@ static int handle_to_datatype_block(starpu_data_handle_t data_handle, MPI_Dataty
 
 
 	ret = MPI_Type_commit(datatype);
 	ret = MPI_Type_commit(datatype);
 	STARPU_ASSERT(ret == MPI_SUCCESS);
 	STARPU_ASSERT(ret == MPI_SUCCESS);
-
-	return 0;
 }
 }
 
 
 /*
 /*
  * 	Vector
  * 	Vector
  */
  */
 
 
-static int handle_to_datatype_vector(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+static void handle_to_datatype_vector(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
 {
 {
 	int ret;
 	int ret;
 
 
@@ -88,15 +85,13 @@ static int handle_to_datatype_vector(starpu_data_handle_t data_handle, MPI_Datat
 
 
 	ret = MPI_Type_commit(datatype);
 	ret = MPI_Type_commit(datatype);
 	STARPU_ASSERT(ret == MPI_SUCCESS);
 	STARPU_ASSERT(ret == MPI_SUCCESS);
-
-	return 0;
 }
 }
 
 
 /*
 /*
  * 	Variable
  * 	Variable
  */
  */
 
 
-static int handle_to_datatype_variable(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+static void handle_to_datatype_variable(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
 {
 {
 	int ret;
 	int ret;
 
 
@@ -107,8 +102,6 @@ static int handle_to_datatype_variable(starpu_data_handle_t data_handle, MPI_Dat
 
 
 	ret = MPI_Type_commit(datatype);
 	ret = MPI_Type_commit(datatype);
 	STARPU_ASSERT(ret == MPI_SUCCESS);
 	STARPU_ASSERT(ret == MPI_SUCCESS);
-
-	return 0;
 }
 }
 
 
 /*
 /*
@@ -127,21 +120,76 @@ static handle_to_datatype_func handle_to_datatype_funcs[STARPU_MAX_INTERFACE_ID]
 	[STARPU_MULTIFORMAT_INTERFACE_ID] = NULL,
 	[STARPU_MULTIFORMAT_INTERFACE_ID] = NULL,
 };
 };
 
 
-int starpu_mpi_handle_to_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+void _starpu_mpi_handle_allocate_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype, int *user_datatype)
 {
 {
 	enum starpu_data_interface_id id = starpu_handle_get_interface_id(data_handle);
 	enum starpu_data_interface_id id = starpu_handle_get_interface_id(data_handle);
 
 
-	if (id <= STARPU_MULTIFORMAT_INTERFACE_ID)
+	if (id < STARPU_MAX_INTERFACE_ID)
 	{
 	{
 		handle_to_datatype_func func = handle_to_datatype_funcs[id];
 		handle_to_datatype_func func = handle_to_datatype_funcs[id];
 		STARPU_ASSERT(func);
 		STARPU_ASSERT(func);
 		func(data_handle, datatype);
 		func(data_handle, datatype);
-		return 0;
+		*user_datatype = 0;
 	}
 	}
 	else
 	else
 	{
 	{
 		/* The datatype is not predefined by StarPU */
 		/* The datatype is not predefined by StarPU */
 		*datatype = MPI_BYTE;
 		*datatype = MPI_BYTE;
-		return 1;
+		*user_datatype = 1;
+	}
+}
+
+static void _starpu_mpi_handle_free_simple_datatype(MPI_Datatype *datatype)
+{
+	MPI_Type_free(datatype);
+}
+
+static void _starpu_mpi_handle_free_complex_datatype(MPI_Datatype *datatype)
+{
+	int num_ints, num_adds, num_datatypes, combiner, i;
+	int *array_of_ints;
+	MPI_Aint *array_of_adds;
+	MPI_Datatype *array_of_datatypes;
+
+	MPI_Type_get_envelope(*datatype, &num_ints, &num_adds, &num_datatypes, &combiner);
+	if (combiner != MPI_COMBINER_NAMED)
+	{
+		array_of_ints = (int *) malloc(num_ints * sizeof(int));
+		array_of_adds = (MPI_Aint *) malloc(num_adds * sizeof(MPI_Aint));
+		array_of_datatypes = (MPI_Datatype *) malloc(num_datatypes * sizeof(MPI_Datatype));
+		MPI_Type_get_contents(*datatype, num_ints, num_adds, num_datatypes, array_of_ints, array_of_adds, array_of_datatypes);
+		for(i=0 ; i<num_datatypes ; i++)
+		{
+			_starpu_mpi_handle_free_complex_datatype(&array_of_datatypes[i]);
+		}
+		MPI_Type_free(datatype);
+		free(array_of_ints);
+		free(array_of_adds);
+		free(array_of_datatypes);
+	}
+}
+
+static handle_free_datatype_func handle_free_datatype_funcs[STARPU_MAX_INTERFACE_ID] =
+{
+	[STARPU_MATRIX_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
+	[STARPU_BLOCK_INTERFACE_ID]	= _starpu_mpi_handle_free_complex_datatype,
+	[STARPU_VECTOR_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
+	[STARPU_CSR_INTERFACE_ID]	= NULL,
+	[STARPU_BCSR_INTERFACE_ID]	= NULL,
+	[STARPU_VARIABLE_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
+	[STARPU_VOID_INTERFACE_ID]      = NULL,
+	[STARPU_MULTIFORMAT_INTERFACE_ID] = NULL,
+};
+
+void _starpu_mpi_handle_free_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+{
+	enum starpu_data_interface_id id = starpu_handle_get_interface_id(data_handle);
+
+	if (id < STARPU_MAX_INTERFACE_ID)
+	{
+		handle_free_datatype_func func = handle_free_datatype_funcs[id];
+		STARPU_ASSERT(func);
+		func(datatype);
 	}
 	}
+	/* else the datatype is not predefined by StarPU */
 }
 }

+ 2 - 1
mpi/src/starpu_mpi_datatype.h

@@ -24,7 +24,8 @@
 extern "C" {
 extern "C" {
 #endif
 #endif
 
 
-int starpu_mpi_handle_to_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype);
+void _starpu_mpi_handle_allocate_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype, int *user_datatype);
+void _starpu_mpi_handle_free_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype);
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }

+ 118 - 76
mpi/src/starpu_mpi_insert_task.c

@@ -35,22 +35,22 @@ struct _starpu_data_entry
 	void *data;
 	void *data;
 };
 };
 
 
-static struct _starpu_data_entry **sent_data = NULL;
-static struct _starpu_data_entry **received_data = NULL;
-static int cache_enabled=1;
+static struct _starpu_data_entry **_cache_sent_data = NULL;
+static struct _starpu_data_entry **_cache_received_data = NULL;
+static int _cache_enabled=1;
 
 
-void _starpu_mpi_tables_init(MPI_Comm comm)
+void _starpu_mpi_cache_init(MPI_Comm comm)
 {
 {
 	int nb_nodes;
 	int nb_nodes;
 	int i;
 	int i;
 
 
-	cache_enabled = starpu_get_env_number("STARPU_MPI_CACHE");
-	if (cache_enabled == -1)
+	_cache_enabled = starpu_get_env_number("STARPU_MPI_CACHE");
+	if (_cache_enabled == -1)
 	{
 	{
-		cache_enabled = 1;
+		_cache_enabled = 1;
 	}
 	}
 
 
-	if (cache_enabled == 0)
+	if (_cache_enabled == 0)
 	{
 	{
 		if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU MPI Communication cache is disabled\n");
 		if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU MPI Communication cache is disabled\n");
 		return;
 		return;
@@ -58,36 +58,119 @@ void _starpu_mpi_tables_init(MPI_Comm comm)
 
 
 	MPI_Comm_size(comm, &nb_nodes);
 	MPI_Comm_size(comm, &nb_nodes);
 	_STARPU_MPI_DEBUG("Initialising htable for cache\n");
 	_STARPU_MPI_DEBUG("Initialising htable for cache\n");
-	sent_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
-	for(i=0 ; i<nb_nodes ; i++) sent_data[i] = NULL;
-	received_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
-	for(i=0 ; i<nb_nodes ; i++) received_data[i] = NULL;
+	_cache_sent_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
+	for(i=0 ; i<nb_nodes ; i++) _cache_sent_data[i] = NULL;
+	_cache_received_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
+	for(i=0 ; i<nb_nodes ; i++) _cache_received_data[i] = NULL;
 }
 }
 
 
-void _starpu_mpi_tables_free(int world_size)
+void _starpu_mpi_cache_empty_tables(int world_size)
 {
 {
 	int i;
 	int i;
 
 
-	if (cache_enabled == 0) return;
+	if (_cache_enabled == 0) return;
 
 
 	_STARPU_MPI_DEBUG("Clearing htable for cache\n");
 	_STARPU_MPI_DEBUG("Clearing htable for cache\n");
 
 
 	for(i=0 ; i<world_size ; i++)
 	for(i=0 ; i<world_size ; i++)
 	{
 	{
 		struct _starpu_data_entry *entry, *tmp;
 		struct _starpu_data_entry *entry, *tmp;
-		HASH_ITER(hh, sent_data[i], entry, tmp)
+		HASH_ITER(hh, _cache_sent_data[i], entry, tmp)
 		{
 		{
-			HASH_DEL(sent_data[i], entry);
+			HASH_DEL(_cache_sent_data[i], entry);
 			free(entry);
 			free(entry);
 		}
 		}
-		HASH_ITER(hh, received_data[i], entry, tmp)
+		HASH_ITER(hh, _cache_received_data[i], entry, tmp)
 		{
 		{
-			HASH_DEL(received_data[i], entry);
+			HASH_DEL(_cache_received_data[i], entry);
 			free(entry);
 			free(entry);
 		}
 		}
 	}
 	}
-	free(sent_data);
-	free(received_data);
+}
+
+void _starpu_mpi_cache_free(int world_size)
+{
+	if (_cache_enabled == 0) return;
+
+	_starpu_mpi_cache_empty_tables(world_size);
+	free(_cache_sent_data);
+	free(_cache_received_data);
+}
+
+void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
+{
+	int nb_nodes;
+
+	if (_cache_enabled == 0) return;
+
+	MPI_Comm_size(comm, &nb_nodes);
+	_starpu_mpi_cache_empty_tables(nb_nodes);
+}
+
+void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
+{
+	struct _starpu_data_entry *avail;
+	int i, nb_nodes;
+
+	if (_cache_enabled == 0) return;
+
+	MPI_Comm_size(comm, &nb_nodes);
+	for(i=0 ; i<nb_nodes ; i++)
+	{
+		HASH_FIND_PTR(_cache_sent_data[i], &data_handle, avail);
+		if (avail)
+		{
+			_STARPU_MPI_DEBUG("Clearing send cache for data %p\n", data_handle);
+			HASH_DEL(_cache_sent_data[i], avail);
+		}
+		HASH_FIND_PTR(_cache_received_data[i], &data_handle, avail);
+		if (avail)
+		{
+			_STARPU_MPI_DEBUG("Clearing send cache for data %p\n", data_handle);
+			HASH_DEL(_cache_received_data[i], avail);
+		}
+	}
+}
+
+static
+void *_starpu_mpi_already_received(starpu_data_handle_t data, int mpi_rank)
+{
+	if (_cache_enabled == 0) return NULL;
+
+	struct _starpu_data_entry *already_received;
+	HASH_FIND_PTR(_cache_received_data[mpi_rank], &data, already_received);
+	if (already_received == NULL)
+	{
+		struct _starpu_data_entry *entry = (struct _starpu_data_entry *)malloc(sizeof(*entry));
+		entry->data = data;
+		HASH_ADD_PTR(_cache_received_data[mpi_rank], data, entry);
+	}
+	else
+	{
+		_STARPU_MPI_DEBUG("Do not receive data %p from node %d as it is already available\n", data, mpi_rank);
+	}
+	return already_received;
+}
+
+static
+void *_starpu_mpi_already_sent(starpu_data_handle_t data, int dest)
+{
+	if (_cache_enabled == 0) return NULL;
+
+	struct _starpu_data_entry *already_sent;
+	HASH_FIND_PTR(_cache_sent_data[dest], &data, already_sent);
+	if (already_sent == NULL)
+	{
+		struct _starpu_data_entry *entry = (struct _starpu_data_entry *)malloc(sizeof(*entry));
+		entry->data = data;
+		HASH_ADD_PTR(_cache_sent_data[dest], data, entry);
+		_STARPU_MPI_DEBUG("Noting that data %p has already been sent to %d\n", data, dest);
+	}
+	else
+	{
+		_STARPU_MPI_DEBUG("Do not send data %p to node %d as it has already been sent\n", data, dest);
+	}
+	return already_sent;
 }
 }
 
 
 static
 static
@@ -150,47 +233,6 @@ int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_access
 }
 }
 
 
 static
 static
-void *_starpu_mpi_already_received(starpu_data_handle_t data, int mpi_rank)
-{
-	if (cache_enabled == 0) return NULL;
-
-	struct _starpu_data_entry *already_received;
-	HASH_FIND_PTR(received_data[mpi_rank], &data, already_received);
-	if (already_received == NULL)
-	{
-		struct _starpu_data_entry *entry = (struct _starpu_data_entry *)malloc(sizeof(*entry));
-		entry->data = data;
-		HASH_ADD_PTR(received_data[mpi_rank], data, entry);
-	}
-	else
-	{
-		_STARPU_MPI_DEBUG("Do not receive data %p from node %d as it is already available\n", data, mpi_rank);
-	}
-	return already_received;
-}
-
-static
-void *_starpu_mpi_already_sent(starpu_data_handle_t data, int dest)
-{
-	if (cache_enabled == 0) return NULL;
-
-	struct _starpu_data_entry *already_sent;
-	HASH_FIND_PTR(sent_data[dest], &data, already_sent);
-	if (already_sent == NULL)
-	{
-		struct _starpu_data_entry *entry = (struct _starpu_data_entry *)malloc(sizeof(*entry));
-		entry->data = data;
-		HASH_ADD_PTR(sent_data[dest], data, entry);
-		_STARPU_MPI_DEBUG("Noting that data %p has already been sent to %d\n", data, dest);
-	}
-	else
-	{
-		_STARPU_MPI_DEBUG("Do not send data %p to node %d as it has already been sent\n", data, dest);
-	}
-	return already_sent;
-}
-
-static
 void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int dest, int do_execute, MPI_Comm comm)
 void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int dest, int do_execute, MPI_Comm comm)
 {
 {
 	if (data && mode & STARPU_R)
 	if (data && mode & STARPU_R)
@@ -266,9 +308,9 @@ void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum s
 
 
 void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int do_execute, MPI_Comm comm)
 void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int do_execute, MPI_Comm comm)
 {
 {
-	if (cache_enabled)
+	if (_cache_enabled)
 	{
 	{
-		if (mode & STARPU_W)
+		if (mode & STARPU_W || mode & STARPU_REDUX)
 		{
 		{
 			if (do_execute)
 			if (do_execute)
 			{
 			{
@@ -278,11 +320,11 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
 				for(n=0 ; n<size ; n++)
 				for(n=0 ; n<size ; n++)
 				{
 				{
 					struct _starpu_data_entry *already_sent;
 					struct _starpu_data_entry *already_sent;
-					HASH_FIND_PTR(sent_data[n], &data, already_sent);
+					HASH_FIND_PTR(_cache_sent_data[n], &data, already_sent);
 					if (already_sent)
 					if (already_sent)
 					{
 					{
 						_STARPU_MPI_DEBUG("Clearing send cache for data %p\n", data);
 						_STARPU_MPI_DEBUG("Clearing send cache for data %p\n", data);
-						HASH_DEL(sent_data[n], already_sent);
+						HASH_DEL(_cache_sent_data[n], already_sent);
 					}
 					}
 				}
 				}
 			}
 			}
@@ -290,14 +332,14 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
 			{
 			{
 				int mpi_rank = starpu_data_get_rank(data);
 				int mpi_rank = starpu_data_get_rank(data);
 				struct _starpu_data_entry *already_received;
 				struct _starpu_data_entry *already_received;
-				HASH_FIND_PTR(received_data[mpi_rank], &data, already_received);
+				HASH_FIND_PTR(_cache_received_data[mpi_rank], &data, already_received);
 				if (already_received)
 				if (already_received)
 				{
 				{
 #ifdef STARPU_DEVEL
 #ifdef STARPU_DEVEL
 #  warning TODO: Somebody else will write to the data, so discard our cached copy if any. starpu_mpi could just remember itself.
 #  warning TODO: Somebody else will write to the data, so discard our cached copy if any. starpu_mpi could just remember itself.
 #endif
 #endif
 					_STARPU_MPI_DEBUG("Clearing receive cache for data %p\n", data);
 					_STARPU_MPI_DEBUG("Clearing receive cache for data %p\n", data);
-					HASH_DEL(received_data[mpi_rank], already_received);
+					HASH_DEL(_cache_received_data[mpi_rank], already_received);
 					starpu_data_invalidate_submit(data);
 					starpu_data_invalidate_submit(data);
 				}
 				}
 			}
 			}
@@ -324,7 +366,7 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 	int me, do_execute, xrank, nb_nodes;
 	int me, do_execute, xrank, nb_nodes;
 	size_t *size_on_nodes;
 	size_t *size_on_nodes;
 	size_t arg_buffer_size = 0;
 	size_t arg_buffer_size = 0;
-	char *arg_buffer;
+	char *arg_buffer = NULL;
 	int dest=0, inconsistent_execute;
 	int dest=0, inconsistent_execute;
 	int current_data = 0;
 	int current_data = 0;
 
 
@@ -339,8 +381,11 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 	va_start(varg_list, codelet);
 	va_start(varg_list, codelet);
 	arg_buffer_size = _starpu_insert_task_get_arg_size(varg_list);
 	arg_buffer_size = _starpu_insert_task_get_arg_size(varg_list);
 
 
-	va_start(varg_list, codelet);
-	_starpu_codelet_pack_args(arg_buffer_size, &arg_buffer, varg_list);
+	if (arg_buffer_size)
+	{
+		va_start(varg_list, codelet);
+		_starpu_codelet_pack_args(arg_buffer_size, &arg_buffer, varg_list);
+	}
 
 
 	/* Find out whether we are to execute the data because we own the data to be written to. */
 	/* Find out whether we are to execute the data because we own the data to be written to. */
 	inconsistent_execute = 0;
 	inconsistent_execute = 0;
@@ -437,13 +482,13 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 				xrank = i;
 				xrank = i;
 			}
 			}
 		}
 		}
-		free(size_on_nodes);
 		if (xrank != -1)
 		if (xrank != -1)
 		{
 		{
 			_STARPU_MPI_DEBUG("Node %d is having the most R data\n", xrank);
 			_STARPU_MPI_DEBUG("Node %d is having the most R data\n", xrank);
 			do_execute = 1;
 			do_execute = 1;
 		}
 		}
 	}
 	}
+	free(size_on_nodes);
 
 
 	STARPU_ASSERT_MSG(do_execute != -1, "StarPU needs to see a W or a REDUX data which will tell it where to execute the task");
 	STARPU_ASSERT_MSG(do_execute != -1, "StarPU needs to see a W or a REDUX data which will tell it where to execute the task");
 
 
@@ -452,7 +497,6 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 		if (xrank == -1)
 		if (xrank == -1)
 		{
 		{
 			_STARPU_MPI_DEBUG("Different tasks are owning W data. Needs to specify which one is to execute the codelet, using STARPU_EXECUTE_ON_NODE or STARPU_EXECUTE_ON_DATA\n");
 			_STARPU_MPI_DEBUG("Different tasks are owning W data. Needs to specify which one is to execute the codelet, using STARPU_EXECUTE_ON_NODE or STARPU_EXECUTE_ON_DATA\n");
-			free(size_on_nodes);
 			return -EINVAL;
 			return -EINVAL;
 		}
 		}
 		else
 		else
@@ -665,13 +709,11 @@ void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t da
 	tag = starpu_data_get_tag(data_handle);
 	tag = starpu_data_get_tag(data_handle);
 	if (rank == -1)
 	if (rank == -1)
 	{
 	{
-		fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
-		STARPU_ABORT();
+		_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
 	}
 	}
 	if (tag == -1)
 	if (tag == -1)
 	{
 	{
-		fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
-		STARPU_ABORT();
+		_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
 	}
 	}
 	MPI_Comm_rank(comm, &me);
 	MPI_Comm_rank(comm, &me);
 
 

+ 2 - 2
mpi/src/starpu_mpi_insert_task.h

@@ -23,8 +23,8 @@
 extern "C" {
 extern "C" {
 #endif
 #endif
 
 
-void _starpu_mpi_tables_init(MPI_Comm comm);
-void _starpu_mpi_tables_free(int world_size);
+void _starpu_mpi_cache_init(MPI_Comm comm);
+void _starpu_mpi_cache_free(int world_size);
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }

+ 3 - 3
mpi/src/starpu_mpi_private.h

@@ -77,7 +77,7 @@ LIST_TYPE(_starpu_mpi_req,
 	MPI_Datatype datatype;
 	MPI_Datatype datatype;
 	void *ptr;
 	void *ptr;
 	size_t count;
 	size_t count;
-	int needs_unpacking;
+	int user_datatype;
 
 
 	/* who are we talking to ? */
 	/* who are we talking to ? */
 	int srcdst;
 	int srcdst;
@@ -91,8 +91,8 @@ LIST_TYPE(_starpu_mpi_req,
 	int *flag;
 	int *flag;
 
 
 	int ret;
 	int ret;
-	pthread_mutex_t req_mutex;
-	pthread_cond_t req_cond;
+	_starpu_pthread_mutex_t req_mutex;
+	_starpu_pthread_cond_t req_cond;
 
 
 	enum _starpu_mpi_request_type request_type; /* 0 send, 1 recv */
 	enum _starpu_mpi_request_type request_type; /* 0 send, 1 recv */
 
 

+ 32 - 9
mpi/tests/Makefile.am

@@ -17,16 +17,33 @@
 CC=$(MPICC)
 CC=$(MPICC)
 CCLD=$(MPICC)
 CCLD=$(MPICC)
 
 
-if STARPU_MPI_CHECK
+if STARPU_HAVE_WINDOWS
+LOADER_BIN		=
+else
+loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+LOADER			=	loader
+LOADER_BIN		=	$(abs_top_builddir)/mpi/tests/$(LOADER)
+loader_SOURCES		=	../../tests/loader.c
+endif
+
+if STARPU_QUICK_CHECK
+MPI			=	$(MPIEXEC) -np 2
+else
+MPI			=	$(MPIEXEC) -np 4
+endif
+
 if STARPU_HAVE_AM111
 if STARPU_HAVE_AM111
-LOG_COMPILER	 	=	$(MPIEXEC) -np 4
+TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
+LOG_COMPILER	 	=	$(MPI) $(LOADER_BIN)
 else
 else
-TESTS_ENVIRONMENT 	=	$(MPIEXEC) -np 4
+TESTS_ENVIRONMENT 	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
 endif
 endif
-TESTS			=	$(check_PROGRAMS)
+
+if STARPU_MPI_CHECK
+TESTS			=	$(starpu_mpi_TESTS)
 endif
 endif
 
 
-check_PROGRAMS =
+check_PROGRAMS = $(LOADER) $(starpu_mpi_TESTS)
 
 
 BUILT_SOURCES =
 BUILT_SOURCES =
 
 
@@ -49,14 +66,14 @@ endif
 
 
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS)
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS)
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
-AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/src -I$(top_builddir)/src
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/src -I$(top_builddir)/src -I$(top_srcdir)/examples/
 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)
 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)
 
 
 ########################
 ########################
 # Unit testcases       #
 # Unit testcases       #
 ########################
 ########################
 
 
-check_PROGRAMS +=				\
+starpu_mpi_TESTS =				\
 	pingpong				\
 	pingpong				\
 	mpi_test				\
 	mpi_test				\
 	mpi_isend				\
 	mpi_isend				\
@@ -77,7 +94,8 @@ check_PROGRAMS +=				\
 	insert_task_owner_data			\
 	insert_task_owner_data			\
 	multiple_send				\
 	multiple_send				\
 	mpi_scatter_gather			\
 	mpi_scatter_gather			\
-	mpi_reduction
+	mpi_reduction				\
+	user_defined_datatype
 
 
 noinst_PROGRAMS =				\
 noinst_PROGRAMS =				\
 	pingpong				\
 	pingpong				\
@@ -100,7 +118,8 @@ noinst_PROGRAMS =				\
 	insert_task_owner_data			\
 	insert_task_owner_data			\
 	multiple_send				\
 	multiple_send				\
 	mpi_scatter_gather			\
 	mpi_scatter_gather			\
-	mpi_reduction
+	mpi_reduction				\
+	user_defined_datatype
 
 
 mpi_isend_LDADD =					\
 mpi_isend_LDADD =					\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
@@ -144,6 +163,8 @@ mpi_scatter_gather_LDADD =			\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 mpi_reduction_LDADD =			\
 mpi_reduction_LDADD =			\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+user_defined_datatype_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 
 
 ring_SOURCES = ring.c
 ring_SOURCES = ring.c
 ring_async_SOURCES = ring_async.c
 ring_async_SOURCES = ring_async.c
@@ -155,6 +176,8 @@ ring_async_implicit_SOURCES += ring_kernel.cu
 endif
 endif
 mpi_reduction_SOURCES = mpi_reduction.c
 mpi_reduction_SOURCES = mpi_reduction.c
 mpi_reduction_SOURCES += mpi_reduction_kernels.c
 mpi_reduction_SOURCES += mpi_reduction_kernels.c
+user_defined_datatype_SOURCES = user_defined_datatype.c
+user_defined_datatype_SOURCES += $(top_srcdir)/examples/interface/complex_interface.c
 
 
 showcheck:
 showcheck:
 	-cat $(TEST_LOGS) /dev/null
 	-cat $(TEST_LOGS) /dev/null

+ 6 - 1
mpi/tests/block_interface.c

@@ -43,7 +43,7 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(&argc, &argv);
+	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	/* Node 0 will allocate a big block and only register an inner part of
 	/* Node 0 will allocate a big block and only register an inner part of
@@ -132,6 +132,11 @@ int main(int argc, char **argv)
 	FPRINTF(stdout, "Rank %d is done\n", rank);
 	FPRINTF(stdout, "Rank %d is done\n", rank);
 	fflush(stdout);
 	fflush(stdout);
 
 
+	if (rank == 0 || rank == 1)
+	{
+		starpu_data_unregister(block_handle);
+		free(block);
+	}
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
 
 

+ 7 - 1
mpi/tests/block_interface_pinned.c

@@ -43,7 +43,7 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(&argc, &argv);
+	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	/* Node 0 will allocate a big block and only register an inner part of
 	/* Node 0 will allocate a big block and only register an inner part of
@@ -132,6 +132,12 @@ int main(int argc, char **argv)
 
 
 	}
 	}
 
 
+	if (rank == 0 || rank == 1)
+	{
+		starpu_data_unregister(block_handle);
+		starpu_free(block);
+	}
+
 	FPRINTF(stdout, "Rank %d is done\n", rank);
 	FPRINTF(stdout, "Rank %d is done\n", rank);
 	fflush(stdout);
 	fflush(stdout);
 
 

+ 60 - 58
mpi/tests/insert_task.c

@@ -23,15 +23,15 @@ void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 	unsigned *x = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *x = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
 	unsigned *y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
 
-        FPRINTF(stdout, "VALUES: %u %u\n", *x, *y);
-        *x = (*x + *y) / 2;
+	FPRINTF(stdout, "VALUES: %u %u\n", *x, *y);
+	*x = (*x + *y) / 2;
 }
 }
 
 
 struct starpu_codelet mycodelet =
 struct starpu_codelet mycodelet =
 {
 {
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.cpu_funcs = {func_cpu, NULL},
-        .nbuffers = 2,
+	.nbuffers = 2,
 	.modes = {STARPU_RW, STARPU_R}
 	.modes = {STARPU_RW, STARPU_R}
 };
 };
 
 
@@ -41,99 +41,101 @@ struct starpu_codelet mycodelet =
 /* Returns the MPI node number where data indexes index is */
 /* Returns the MPI node number where data indexes index is */
 int my_distrib(int x, int y, int nb_nodes)
 int my_distrib(int x, int y, int nb_nodes)
 {
 {
-        return x % nb_nodes;
+	return x % nb_nodes;
 }
 }
 
 
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-        int rank, size, x, y;
-        int value=0, ret;
-        unsigned matrix[X][Y];
-        starpu_data_handle_t data_handles[X][Y];
+	int rank, size, x, y;
+	int value=0, ret;
+	unsigned matrix[X][Y];
+	starpu_data_handle_t data_handles[X][Y];
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(&argc, &argv);
+	ret = starpu_mpi_init(&argc, &argv, 1);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
-        for(x = 0; x < X; x++)
+	for(x = 0; x < X; x++)
 	{
 	{
-                for (y = 0; y < Y; y++)
+		for (y = 0; y < Y; y++)
 		{
 		{
-                        matrix[x][y] = (rank+1)*10 + value;
-                        value++;
-                }
-        }
+			matrix[x][y] = (rank+1)*10 + value;
+			value++;
+		}
+	}
 #if 0
 #if 0
-        for(x = 0; x < X; x++) {
-                FPRINTF(stdout, "[%d] ", rank);
-                for (y = 0; y < Y; y++) {
-                        FPRINTF(stdout, "%3d ", matrix[x][y]);
-                }
-                FPRINTF(stdout, "\n");
-        }
+	for(x = 0; x < X; x++)
+	{
+		FPRINTF(stdout, "[%d] ", rank);
+		for (y = 0; y < Y; y++)
+		{
+			FPRINTF(stdout, "%3d ", matrix[x][y]);
+		}
+		FPRINTF(stdout, "\n");
+	}
 #endif
 #endif
 
 
-        for(x = 0; x < X; x++)
+	for(x = 0; x < X; x++)
 	{
 	{
-                for (y = 0; y < Y; y++)
+		for (y = 0; y < Y; y++)
 		{
 		{
-                        int mpi_rank = my_distrib(x, y, size);
-                        if (mpi_rank == rank)
+			int mpi_rank = my_distrib(x, y, size);
+			if (mpi_rank == rank)
 			{
 			{
-                                //FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
-                                starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
-                        }
-                        else
+				//FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+				starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
+			}
+			else
 			{
 			{
-                                /* I don't own that index, but will need it for my computations */
-                                //FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
-                                starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
-                        }
-                        if (data_handles[x][y])
+				/* I don't own that index, but will need it for my computations */
+				//FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+				starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
+			}
+			if (data_handles[x][y])
 			{
 			{
-                                starpu_data_set_rank(data_handles[x][y], mpi_rank);
-                                starpu_data_set_tag(data_handles[x][y], (y*X)+x);
+				starpu_data_set_rank(data_handles[x][y], mpi_rank);
+				starpu_data_set_tag(data_handles[x][y], (y*X)+x);
 			}
 			}
-                }
-        }
+		}
+	}
 
 
-        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1][1], STARPU_R, data_handles[0][1], 0);
+	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1][1], STARPU_R, data_handles[0][1], 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
-        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
+	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
-        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0][1], STARPU_R, data_handles[0][0], 0);
+	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0][1], STARPU_R, data_handles[0][0], 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
-        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
+	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 
 
-        FPRINTF(stderr, "Waiting ...\n");
-        starpu_task_wait_for_all();
+	FPRINTF(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
 
 
-        for(x = 0; x < X; x++)
+	for(x = 0; x < X; x++)
 	{
 	{
-                for (y = 0; y < Y; y++)
+		for (y = 0; y < Y; y++)
 		{
 		{
-                        if (data_handles[x][y])
-                                starpu_data_unregister(data_handles[x][y]);
-                }
-        }
+			if (data_handles[x][y])
+				starpu_data_unregister(data_handles[x][y]);
+		}
+	}
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
 
 
 #if 0
 #if 0
-        for(x = 0; x < X; x++)
+	for(x = 0; x < X; x++)
 	{
 	{
-                FPRINTF(stdout, "[%d] ", rank);
-                for (y = 0; y < Y; y++)
+		FPRINTF(stdout, "[%d] ", rank);
+		for (y = 0; y < Y; y++)
 		{
 		{
-                        FPRINTF(stdout, "%3d ", matrix[x][y]);
-                }
-                FPRINTF(stdout, "\n");
-        }
+			FPRINTF(stdout, "%3d ", matrix[x][y]);
+		}
+		FPRINTF(stdout, "\n");
+	}
 #endif
 #endif
 
 
 	return 0;
 	return 0;

+ 72 - 70
mpi/tests/insert_task_block.c

@@ -25,137 +25,139 @@ void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 	int ny = (int)STARPU_MATRIX_GET_NY(descr[0]);
 	int ny = (int)STARPU_MATRIX_GET_NY(descr[0]);
 	int ld = (int)STARPU_MATRIX_GET_LD(descr[0]);
 	int ld = (int)STARPU_MATRIX_GET_LD(descr[0]);
 
 
-        int i, j;
-        unsigned sum=0;
+	int i, j;
+	unsigned sum=0;
 
 
 	for (i = 0; i < nx; i++)
 	for (i = 0; i < nx; i++)
 	{
 	{
 		for (j = 0; j < ny; j++)
 		for (j = 0; j < ny; j++)
 		{
 		{
-                        sum += matrix[i+j*ld];
-                }
-        }
+			sum += matrix[i+j*ld];
+		}
+	}
 	for (i = 0; i < nx; i++)
 	for (i = 0; i < nx; i++)
 	{
 	{
 		for (j = 0; j < ny; j++)
 		for (j = 0; j < ny; j++)
 		{
 		{
-                        matrix[i+j*ld] = sum;///(nx*ny);
-                }
-        }
+			matrix[i+j*ld] = sum;///(nx*ny);
+		}
+	}
 }
 }
 
 
 struct starpu_codelet mycodelet =
 struct starpu_codelet mycodelet =
 {
 {
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.cpu_funcs = {func_cpu, NULL},
-        .nbuffers = 1,
+	.nbuffers = 1,
 	.modes = {STARPU_RW}
 	.modes = {STARPU_RW}
 };
 };
 
 
-#define SIZE       6
-#define BLOCKS     3
+#define SIZE 6
+#define BLOCKS 3
 
 
 /* Returns the MPI node number where data indexes index is */
 /* Returns the MPI node number where data indexes index is */
 int my_distrib(int x, int y, int nb_nodes)
 int my_distrib(int x, int y, int nb_nodes)
 {
 {
-        return x % nb_nodes;
+	return x % nb_nodes;
 }
 }
 
 
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-        int rank, size, x, y;
-        int ret, value=0;
-        unsigned matrix[SIZE*SIZE];
-        starpu_data_handle_t data_handles[SIZE][SIZE];
+	int rank, size, x, y;
+	int ret, value=0;
+	unsigned matrix[SIZE*SIZE];
+	starpu_data_handle_t data_handles[SIZE][SIZE];
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(&argc, &argv);
+	ret = starpu_mpi_init(&argc, &argv, 1);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
-        for(x = 0; x < SIZE; x++)
+	for(x = 0; x < SIZE; x++)
 	{
 	{
-                for (y = 0; y < SIZE; y++)
+		for (y = 0; y < SIZE; y++)
 		{
 		{
-                        matrix[x+y*SIZE] = rank*100 + value;
-                        value++;
-                }
-        }
+			matrix[x+y*SIZE] = rank*100 + value;
+			value++;
+		}
+	}
 #if 1
 #if 1
-        for(x = 0; x < SIZE; x++) {
-                FPRINTF(stdout, "[%d] ", rank);
-                for (y = 0; y < SIZE; y++) {
-                        FPRINTF(stdout, "%3u ", matrix[x+y*SIZE]);
-                }
-                FPRINTF(stdout, "\n");
-        }
+	for(x = 0; x < SIZE; x++)
+	{
+		FPRINTF(stdout, "[%d] ", rank);
+		for (y = 0; y < SIZE; y++)
+		{
+			FPRINTF(stdout, "%3u ", matrix[x+y*SIZE]);
+		}
+		FPRINTF(stdout, "\n");
+	}
 #endif
 #endif
 
 
-        for(x = 0; x < BLOCKS ;  x++)
+	for(x = 0; x < BLOCKS ; x++)
 	{
 	{
-                for (y = 0; y < BLOCKS; y++)
+		for (y = 0; y < BLOCKS; y++)
 		{
 		{
-                        int mpi_rank = my_distrib(x, y, size);
-                        if (mpi_rank == rank)
+			int mpi_rank = my_distrib(x, y, size);
+			if (mpi_rank == rank)
 			{
 			{
-                                //FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
-                                starpu_matrix_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[((SIZE/BLOCKS)*x) + ((SIZE/BLOCKS)*y) * SIZE]),
-                                                            SIZE, SIZE/BLOCKS, SIZE/BLOCKS, sizeof(unsigned));
-                        }
-                        else
+				//FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+				starpu_matrix_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[((SIZE/BLOCKS)*x) + ((SIZE/BLOCKS)*y) * SIZE]),
+							    SIZE, SIZE/BLOCKS, SIZE/BLOCKS, sizeof(unsigned));
+			}
+			else
 			{
 			{
-                                /* I don't own that index, but will need it for my computations */
-                                //FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
-                                starpu_matrix_data_register(&data_handles[x][y], -1, (uintptr_t)&(matrix[((SIZE/BLOCKS)*x) + ((SIZE/BLOCKS)*y) * SIZE]),
-                                                            SIZE, SIZE/BLOCKS, SIZE/BLOCKS, sizeof(unsigned));
-                        }
-                        if (data_handles[x][y])
+				/* I don't own that index, but will need it for my computations */
+				//FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+				starpu_matrix_data_register(&data_handles[x][y], -1, (uintptr_t)&(matrix[((SIZE/BLOCKS)*x) + ((SIZE/BLOCKS)*y) * SIZE]),
+							    SIZE, SIZE/BLOCKS, SIZE/BLOCKS, sizeof(unsigned));
+			}
+			if (data_handles[x][y])
 			{
 			{
-                                starpu_data_set_rank(data_handles[x][y], mpi_rank);
-                                starpu_data_set_tag(data_handles[x][y], (y*BLOCKS)+x);
+				starpu_data_set_rank(data_handles[x][y], mpi_rank);
+				starpu_data_set_tag(data_handles[x][y], (y*BLOCKS)+x);
 			}
 			}
-                }
-        }
+		}
+	}
 
 
-        for(x = 0; x < BLOCKS; x++)
+	for(x = 0; x < BLOCKS; x++)
 	{
 	{
-                for (y = 0; y < BLOCKS; y++)
+		for (y = 0; y < BLOCKS; y++)
 		{
 		{
-                        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+			ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
 						     STARPU_RW, data_handles[x][y],
 						     STARPU_RW, data_handles[x][y],
 						     0);
 						     0);
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+		}
+	}
 
 
-                }
-        }
-
-        FPRINTF(stderr, "Waiting ...\n");
-        starpu_task_wait_for_all();
+	FPRINTF(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
 
 
-        for(x = 0; x < BLOCKS; x++)
+	for(x = 0; x < BLOCKS; x++)
 	{
 	{
-                for (y = 0; y < BLOCKS; y++)
+		for (y = 0; y < BLOCKS; y++)
 		{
 		{
-                        if (data_handles[x][y])
-                                starpu_data_unregister(data_handles[x][y]);
-                }
-        }
+			if (data_handles[x][y])
+				starpu_data_unregister(data_handles[x][y]);
+		}
+	}
 
 
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
 
 
 #if 1
 #if 1
-        for(x = 0; x < SIZE; x++)
+	for(x = 0; x < SIZE; x++)
 	{
 	{
-                FPRINTF(stdout, "[%d] ", rank);
-                for (y = 0; y < SIZE; y++) {
-                        FPRINTF(stdout, "%3u ", matrix[x+y*SIZE]);
-                }
-                FPRINTF(stdout, "\n");
-        }
+		FPRINTF(stdout, "[%d] ", rank);
+		for (y = 0; y < SIZE; y++)
+		{
+			FPRINTF(stdout, "%3u ", matrix[x+y*SIZE]);
+		}
+		FPRINTF(stdout, "\n");
+	}
 #endif
 #endif
 
 
 	return 0;
 	return 0;

+ 18 - 13
mpi/tests/insert_task_cache.c

@@ -35,7 +35,7 @@ struct starpu_codelet mycodelet =
 {
 {
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.cpu_funcs = {func_cpu, NULL},
-        .nbuffers = 2,
+	.nbuffers = 2,
 	.modes = {STARPU_RW, STARPU_R}
 	.modes = {STARPU_RW, STARPU_R}
 };
 };
 
 
@@ -44,15 +44,15 @@ struct starpu_codelet mycodelet =
 /* Returns the MPI node number where data indexes index is */
 /* Returns the MPI node number where data indexes index is */
 int my_distrib(int x)
 int my_distrib(int x)
 {
 {
-        return x;
+	return x;
 }
 }
 
 
 void test_cache(int rank, int size, int enabled, size_t *comm_amount)
 void test_cache(int rank, int size, int enabled, size_t *comm_amount)
 {
 {
-        int i;
-        int ret;
+	int i;
+	int ret;
 	unsigned v[2][N];
 	unsigned v[2][N];
-        starpu_data_handle_t data_handles[2];
+	starpu_data_handle_t data_handles[2];
 	char *string;
 	char *string;
 
 
 	string = malloc(50);
 	string = malloc(50);
@@ -61,10 +61,10 @@ void test_cache(int rank, int size, int enabled, size_t *comm_amount)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL);
+	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
-        for(i = 0; i < 2; i++)
+	for(i = 0; i < 2; i++)
 	{
 	{
 		int mpi_rank = my_distrib(i);
 		int mpi_rank = my_distrib(i);
 		if (mpi_rank == rank)
 		if (mpi_rank == rank)
@@ -80,30 +80,31 @@ void test_cache(int rank, int size, int enabled, size_t *comm_amount)
 		}
 		}
 		starpu_data_set_rank(data_handles[i], mpi_rank);
 		starpu_data_set_rank(data_handles[i], mpi_rank);
 		starpu_data_set_tag(data_handles[i], i);
 		starpu_data_set_tag(data_handles[i], i);
-        }
+	}
 
 
-        for(i = 0; i < 5; i++)
+	for(i = 0; i < 5; i++)
 	{
 	{
 		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0], STARPU_R, data_handles[1], 0);
 		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0], STARPU_R, data_handles[1], 0);
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 	}
 	}
 
 
-        for(i = 0; i < 5; i++)
+	for(i = 0; i < 5; i++)
 	{
 	{
 		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1], STARPU_R, data_handles[0], 0);
 		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1], STARPU_R, data_handles[0], 0);
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 	}
 	}
 
 
-        starpu_task_wait_for_all();
+	starpu_task_wait_for_all();
 
 
-        for(i = 0; i < 2; i++)
+	for(i = 0; i < 2; i++)
 	{
 	{
 		starpu_data_unregister(data_handles[i]);
 		starpu_data_unregister(data_handles[i]);
-        }
+	}
 
 
 	starpu_mpi_comm_amounts_retrieve(comm_amount);
 	starpu_mpi_comm_amounts_retrieve(comm_amount);
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
+	free(string);
 }
 }
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
@@ -137,6 +138,10 @@ int main(int argc, char **argv)
 	else
 	else
 		result = 1;
 		result = 1;
 
 
+	free(comm_amount_without_cache);
+	free(comm_amount_with_cache);
+	free(string);
+
 	MPI_Finalize();
 	MPI_Finalize();
 	return !result;
 	return !result;
 }
 }

+ 52 - 49
mpi/tests/insert_task_owner.c

@@ -23,7 +23,7 @@ void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 	int node;
 	int node;
 	int rank;
 	int rank;
 
 
-        starpu_codelet_unpack_args(_args, &node);
+	starpu_codelet_unpack_args(_args, &node);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	FPRINTF(stderr, "Expected node: %d - Actual node: %d\n", node, rank);
 	FPRINTF(stderr, "Expected node: %d - Actual node: %d\n", node, rank);
 
 
@@ -34,7 +34,7 @@ struct starpu_codelet mycodelet_r_w =
 {
 {
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.cpu_funcs = {func_cpu, NULL},
-        .nbuffers = 2,
+	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_W}
 	.modes = {STARPU_R, STARPU_W}
 };
 };
 
 
@@ -42,7 +42,7 @@ struct starpu_codelet mycodelet_rw_r =
 {
 {
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.cpu_funcs = {func_cpu, NULL},
-        .nbuffers = 2,
+	.nbuffers = 2,
 	.modes = {STARPU_RW, STARPU_R}
 	.modes = {STARPU_RW, STARPU_R}
 };
 };
 
 
@@ -50,7 +50,7 @@ struct starpu_codelet mycodelet_rw_rw =
 {
 {
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.cpu_funcs = {func_cpu, NULL},
-        .nbuffers = 2,
+	.nbuffers = 2,
 	.modes = {STARPU_RW, STARPU_RW}
 	.modes = {STARPU_RW, STARPU_RW}
 };
 };
 
 
@@ -58,7 +58,7 @@ struct starpu_codelet mycodelet_w_r =
 {
 {
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.cpu_funcs = {func_cpu, NULL},
-        .nbuffers = 2,
+	.nbuffers = 2,
 	.modes = {STARPU_W, STARPU_R}
 	.modes = {STARPU_W, STARPU_R}
 };
 };
 
 
@@ -66,109 +66,112 @@ struct starpu_codelet mycodelet_r_r =
 {
 {
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.cpu_funcs = {func_cpu, NULL},
-        .nbuffers = 2,
+	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_R}
 	.modes = {STARPU_R, STARPU_R}
 };
 };
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-        int ret, rank, size, err, node;
-        int x0=32, x1=23;
-        starpu_data_handle_t data_handlesx0;
-        starpu_data_handle_t data_handlesx1;
+	int ret, rank, size, err, node;
+	int x0=32, x1=23;
+	starpu_data_handle_t data_handlesx0;
+	starpu_data_handle_t data_handlesx1;
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(&argc, &argv);
+	ret = starpu_mpi_init(&argc, &argv, 1);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
-        if (rank == 0)
+	if (rank != 0 && rank != 1) goto end;
+
+	if (rank == 0)
 	{
 	{
-                starpu_variable_data_register(&data_handlesx0, 0, (uintptr_t)&x0, sizeof(x0));
-                starpu_data_set_rank(data_handlesx0, rank);
+		starpu_variable_data_register(&data_handlesx0, 0, (uintptr_t)&x0, sizeof(x0));
+		starpu_data_set_rank(data_handlesx0, rank);
 		starpu_data_set_tag(data_handlesx0, 0);
 		starpu_data_set_tag(data_handlesx0, 0);
-                starpu_variable_data_register(&data_handlesx1, -1, (uintptr_t)NULL, sizeof(int));
-                starpu_data_set_rank(data_handlesx1, 1);
+		starpu_variable_data_register(&data_handlesx1, -1, (uintptr_t)NULL, sizeof(int));
+		starpu_data_set_rank(data_handlesx1, 1);
 		starpu_data_set_tag(data_handlesx1, 1);
 		starpu_data_set_tag(data_handlesx1, 1);
-        }
-        else if (rank == 1)
+	}
+	else if (rank == 1)
 	{
 	{
-                starpu_variable_data_register(&data_handlesx1, 0, (uintptr_t)&x1, sizeof(x1));
-                starpu_data_set_rank(data_handlesx1, rank);
+		starpu_variable_data_register(&data_handlesx1, 0, (uintptr_t)&x1, sizeof(x1));
+		starpu_data_set_rank(data_handlesx1, rank);
 		starpu_data_set_tag(data_handlesx1, 1);
 		starpu_data_set_tag(data_handlesx1, 1);
-                starpu_variable_data_register(&data_handlesx0, -1, (uintptr_t)NULL, sizeof(int));
-                starpu_data_set_rank(data_handlesx0, 0);
+		starpu_variable_data_register(&data_handlesx0, -1, (uintptr_t)NULL, sizeof(int));
+		starpu_data_set_rank(data_handlesx0, 0);
 		starpu_data_set_tag(data_handlesx0, 0);
 		starpu_data_set_tag(data_handlesx0, 0);
-        }
-
-	if (rank != 0 && rank != 1) goto end;
+	}
 
 
 	node = starpu_data_get_rank(data_handlesx1);
 	node = starpu_data_get_rank(data_handlesx1);
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_w,
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_w,
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_R, data_handlesx0, STARPU_W, data_handlesx1,
 				     STARPU_R, data_handlesx0, STARPU_W, data_handlesx1,
 				     0);
 				     0);
-        assert(err == 0);
+	assert(err == 0);
 
 
 	node = starpu_data_get_rank(data_handlesx0);
 	node = starpu_data_get_rank(data_handlesx0);
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_r,
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_r,
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_RW, data_handlesx0, STARPU_R, data_handlesx1,
 				     STARPU_RW, data_handlesx0, STARPU_R, data_handlesx1,
 				     0);
 				     0);
-        assert(err == 0);
+	assert(err == 0);
 
 
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1,
 				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1,
 				     0);
 				     0);
-        assert(err == -EINVAL);
+	assert(err == -EINVAL);
 
 
 	node = 1;
 	node = 1;
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
 				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
 				     0);
 				     0);
-        assert(err == 0);
+	assert(err == 0);
 
 
 	node = 0;
 	node = 0;
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
 				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
 				     0);
 				     0);
-        assert(err == 0);
+	assert(err == 0);
 
 
 	node = 0;
 	node = 0;
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_r,
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_r,
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_R, data_handlesx0, STARPU_R, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
 				     STARPU_R, data_handlesx0, STARPU_R, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
 				     0);
 				     0);
-        assert(err == 0);
+	assert(err == 0);
 
 
-        /* Here the value specified by the property STARPU_EXECUTE_ON_NODE is
-           going to overwrite the node even though the data model clearly specifies
-           which node is going to execute the codelet */
+	/* Here the value specified by the property STARPU_EXECUTE_ON_NODE is
+	   going to overwrite the node even though the data model clearly specifies
+	   which node is going to execute the codelet */
 	node = 0;
 	node = 0;
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_w,
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_w,
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_R, data_handlesx0, STARPU_W, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
 				     STARPU_R, data_handlesx0, STARPU_W, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
 				     0);
 				     0);
-        assert(err == 0);
+	assert(err == 0);
 
 
-        /* Here the value specified by the property STARPU_EXECUTE_ON_NODE is
-           going to overwrite the node even though the data model clearly specifies
-           which node is going to execute the codelet */
+	/* Here the value specified by the property STARPU_EXECUTE_ON_NODE is
+	   going to overwrite the node even though the data model clearly specifies
+	   which node is going to execute the codelet */
 	node = 0;
 	node = 0;
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_w_r,
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_w_r,
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_W, data_handlesx0, STARPU_R, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
 				     STARPU_W, data_handlesx0, STARPU_R, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
 				     0);
 				     0);
-        assert(err == 0);
+	assert(err == 0);
 
 
-end:
 	fprintf(stderr, "Waiting ...\n");
 	fprintf(stderr, "Waiting ...\n");
-        starpu_task_wait_for_all();
+	starpu_task_wait_for_all();
+	starpu_data_unregister(data_handlesx0);
+	starpu_data_unregister(data_handlesx1);
+
+end:
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
 
 

+ 48 - 47
mpi/tests/insert_task_owner2.c

@@ -25,66 +25,66 @@ void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 	int *x2 = (int *)STARPU_VARIABLE_GET_PTR(descr[2]);
 	int *x2 = (int *)STARPU_VARIABLE_GET_PTR(descr[2]);
 	int *y = (int *)STARPU_VARIABLE_GET_PTR(descr[3]);
 	int *y = (int *)STARPU_VARIABLE_GET_PTR(descr[3]);
 
 
-//        FPRINTF(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
-//
-//        *x2 = 45;
-//        *y = 144;
-//
-        FPRINTF(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
-        *y = (*x0 + *x1) * 100;
-        *x1 = 12;
-        *x2 = 24;
-        *x0 = 36;
-        FPRINTF(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
+	//FPRINTF(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
+	//*x2 = 45;
+	//*y = 144;
+
+	FPRINTF(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
+	*y = (*x0 + *x1) * 100;
+	*x1 = 12;
+	*x2 = 24;
+	*x0 = 36;
+	FPRINTF(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
 }
 }
 
 
 struct starpu_codelet mycodelet =
 struct starpu_codelet mycodelet =
 {
 {
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.cpu_funcs = {func_cpu, NULL},
-        .nbuffers = 4,
+	.nbuffers = 4,
 	.modes = {STARPU_R, STARPU_RW, STARPU_W, STARPU_W}
 	.modes = {STARPU_R, STARPU_RW, STARPU_W, STARPU_W}
 };
 };
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-        int rank, size, err;
-        int x[3], y=0;
-        int i, ret;
-        starpu_data_handle_t data_handles[4];
+	int rank, size, err;
+	int x[3], y=0;
+	int i, ret;
+	starpu_data_handle_t data_handles[4];
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(&argc, &argv);
+	ret = starpu_mpi_init(&argc, &argv, 1);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
-        if (rank == 0)
+	if (rank == 0)
 	{
 	{
-                for(i=0 ; i<3 ; i++)
+		for(i=0 ; i<3 ; i++)
 		{
 		{
-                        x[i] = 10*(i+1);
-                        starpu_variable_data_register(&data_handles[i], 0, (uintptr_t)&x[i], sizeof(x[i]));
-                }
-                y = -1;
-                starpu_variable_data_register(&data_handles[3], -1, (uintptr_t)NULL, sizeof(int));
-        }
-        else if (rank == 1)
+			x[i] = 10*(i+1);
+			starpu_variable_data_register(&data_handles[i], 0, (uintptr_t)&x[i], sizeof(x[i]));
+		}
+		y = -1;
+		starpu_variable_data_register(&data_handles[3], -1, (uintptr_t)NULL, sizeof(int));
+	}
+	else if (rank == 1)
 	{
 	{
-                for(i=0 ; i<3 ; i++)
+		for(i=0 ; i<3 ; i++)
 		{
 		{
-                        x[i] = -1;
-                        starpu_variable_data_register(&data_handles[i], -1, (uintptr_t)NULL, sizeof(int));
-                }
-                y=200;
-                starpu_variable_data_register(&data_handles[3], 0, (uintptr_t)&y, sizeof(int));
-        } else
+			x[i] = -1;
+			starpu_variable_data_register(&data_handles[i], -1, (uintptr_t)NULL, sizeof(int));
+		}
+		y=200;
+		starpu_variable_data_register(&data_handles[3], 0, (uintptr_t)&y, sizeof(int));
+	}
+	else
 	{
 	{
-                for(i=0 ; i<4 ; i++)
-                        starpu_variable_data_register(&data_handles[i], -1, (uintptr_t)NULL, sizeof(int));
+		for(i=0 ; i<4 ; i++)
+			starpu_variable_data_register(&data_handles[i], -1, (uintptr_t)NULL, sizeof(int));
 	}
 	}
-        FPRINTF(stderr, "[%d][init] VALUES: %d %d %d %d\n", rank, x[0], x[1], x[2], y);
+	FPRINTF(stderr, "[%d][init] VALUES: %d %d %d %d\n", rank, x[0], x[1], x[2], y);
 
 
 	for(i=0 ; i<3 ; i++)
 	for(i=0 ; i<3 ; i++)
 	{
 	{
@@ -94,23 +94,24 @@ int main(int argc, char **argv)
 	starpu_data_set_rank(data_handles[3], 1);
 	starpu_data_set_rank(data_handles[3], 1);
 	starpu_data_set_tag(data_handles[3], 3);
 	starpu_data_set_tag(data_handles[3], 3);
 
 
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
-                                     STARPU_R, data_handles[0], STARPU_RW, data_handles[1],
-                                     STARPU_W, data_handles[2],
-                                     STARPU_W, data_handles[3],
-                                     STARPU_EXECUTE_ON_NODE, 1, 0);
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+				     STARPU_R, data_handles[0], STARPU_RW, data_handles[1],
+				     STARPU_W, data_handles[2],
+				     STARPU_W, data_handles[3],
+				     STARPU_EXECUTE_ON_NODE, 1, 0);
 	STARPU_CHECK_RETURN_VALUE(err, "starpu_mpi_insert_task");
 	STARPU_CHECK_RETURN_VALUE(err, "starpu_mpi_insert_task");
-        starpu_task_wait_for_all();
+	starpu_task_wait_for_all();
 
 
-        int *values = malloc(4 * sizeof(int *));
-        for(i=0 ; i<4 ; i++)
+	int *values = malloc(4 * sizeof(int *));
+	for(i=0 ; i<4 ; i++)
 	{
 	{
-                starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[i], 0, NULL, NULL);
-		if (rank == 0) {
+		starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[i], 0, NULL, NULL);
+		if (rank == 0)
+		{
 			starpu_data_acquire(data_handles[i], STARPU_R);
 			starpu_data_acquire(data_handles[i], STARPU_R);
 			values[i] = *((int *)starpu_handle_get_local_ptr(data_handles[i]));
 			values[i] = *((int *)starpu_handle_get_local_ptr(data_handles[i]));
 		}
 		}
-        }
+	}
         FPRINTF(stderr, "[%d][local ptr] VALUES: %d %d %d %d\n", rank, values[0], values[1], values[2], values[3]);
         FPRINTF(stderr, "[%d][local ptr] VALUES: %d %d %d %d\n", rank, values[0], values[1], values[2], values[3]);
         FPRINTF(stderr, "[%d][end] VALUES: %d %d %d %d\n", rank, x[0], x[1], x[2], y);
         FPRINTF(stderr, "[%d][end] VALUES: %d %d %d %d\n", rank, x[0], x[1], x[2], y);
 
 

+ 26 - 21
mpi/tests/insert_task_owner_data.c

@@ -31,68 +31,73 @@ struct starpu_codelet mycodelet =
 {
 {
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.cpu_funcs = {func_cpu, NULL},
-        .nbuffers = 2,
+	.nbuffers = 2,
 	.modes = {STARPU_RW, STARPU_RW}
 	.modes = {STARPU_RW, STARPU_RW}
 };
 };
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-        int rank, size, err;
-        int x[2];
-        int ret, i;
-        starpu_data_handle_t data_handles[2];
+	int rank, size, err;
+	int x[2];
+	int ret, i;
+	starpu_data_handle_t data_handles[2];
 	int values[2];
 	int values[2];
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(&argc, &argv);
+	ret = starpu_mpi_init(&argc, &argv, 1);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
-        if (rank == 0)
+	if (rank == 0)
 	{
 	{
 		x[0] = 11;
 		x[0] = 11;
 		starpu_variable_data_register(&data_handles[0], 0, (uintptr_t)&x[0], sizeof(x[0]));
 		starpu_variable_data_register(&data_handles[0], 0, (uintptr_t)&x[0], sizeof(x[0]));
 		starpu_variable_data_register(&data_handles[1], -1, (uintptr_t)NULL, sizeof(x[1]));
 		starpu_variable_data_register(&data_handles[1], -1, (uintptr_t)NULL, sizeof(x[1]));
-        }
-        else if (rank == 1)
+	}
+	else if (rank == 1)
 	{
 	{
 		x[1] = 12;
 		x[1] = 12;
 		starpu_variable_data_register(&data_handles[0], -1, (uintptr_t)NULL, sizeof(x[0]));
 		starpu_variable_data_register(&data_handles[0], -1, (uintptr_t)NULL, sizeof(x[0]));
 		starpu_variable_data_register(&data_handles[1], 0, (uintptr_t)&x[1], sizeof(x[1]));
 		starpu_variable_data_register(&data_handles[1], 0, (uintptr_t)&x[1], sizeof(x[1]));
-        }
+	}
 	else
 	else
 	{
 	{
 		starpu_variable_data_register(&data_handles[0], -1, (uintptr_t)NULL, sizeof(x[0]));
 		starpu_variable_data_register(&data_handles[0], -1, (uintptr_t)NULL, sizeof(x[0]));
 		starpu_variable_data_register(&data_handles[1], -1, (uintptr_t)NULL, sizeof(x[1]));
 		starpu_variable_data_register(&data_handles[1], -1, (uintptr_t)NULL, sizeof(x[1]));
-        }
+	}
 
 
 	starpu_data_set_rank(data_handles[0], 0);
 	starpu_data_set_rank(data_handles[0], 0);
 	starpu_data_set_tag(data_handles[0], 0);
 	starpu_data_set_tag(data_handles[0], 0);
 	starpu_data_set_rank(data_handles[1], 1);
 	starpu_data_set_rank(data_handles[1], 1);
 	starpu_data_set_tag(data_handles[1], 1);
 	starpu_data_set_tag(data_handles[1], 1);
 
 
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
-                                     STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
-                                     STARPU_EXECUTE_ON_DATA, data_handles[1],
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+				     STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
+				     STARPU_EXECUTE_ON_DATA, data_handles[1],
 				     0);
 				     0);
-        assert(err == 0);
-        starpu_task_wait_for_all();
+	assert(err == 0);
+	starpu_task_wait_for_all();
 
 
-        for(i=0 ; i<2 ; i++)
+	for(i=0 ; i<2 ; i++)
 	{
 	{
-                starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[i], 0, NULL, NULL);
-		if (rank == 0) {
+		starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[i], 0, NULL, NULL);
+		if (rank == 0)
+		{
 			starpu_data_acquire(data_handles[i], STARPU_R);
 			starpu_data_acquire(data_handles[i], STARPU_R);
 			values[i] = *((int *)starpu_handle_get_local_ptr(data_handles[i]));
 			values[i] = *((int *)starpu_handle_get_local_ptr(data_handles[i]));
+			starpu_data_release(data_handles[i]);
 		}
 		}
-        }
-        FPRINTF(stderr, "[%d][local ptr] VALUES: %d %d\n", rank, values[0], values[1]);
+	}
+	FPRINTF(stderr, "[%d][local ptr] VALUES: %d %d\n", rank, values[0], values[1]);
 	ret = 0;
 	ret = 0;
 	if (rank == 0 && (values[0] != 12 || values[1] != 144))
 	if (rank == 0 && (values[0] != 12 || values[1] != 144))
 		ret = EXIT_FAILURE;
 		ret = EXIT_FAILURE;
 
 
+	starpu_data_unregister(data_handles[0]);
+	starpu_data_unregister(data_handles[1]);
+
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
 
 

+ 6 - 2
mpi/tests/mpi_detached_tag.c

@@ -18,7 +18,11 @@
 #include <starpu_mpi.h>
 #include <starpu_mpi.h>
 #include "helper.h"
 #include "helper.h"
 
 
-#define NITER	2048
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
 #define SIZE	16
 #define SIZE	16
 
 
 float *tab;
 float *tab;
@@ -43,7 +47,7 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL);
+	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	tab = malloc(SIZE*sizeof(float));
 	tab = malloc(SIZE*sizeof(float));

+ 6 - 2
mpi/tests/mpi_irecv.c

@@ -18,7 +18,11 @@
 #include <starpu_mpi.h>
 #include <starpu_mpi.h>
 #include "helper.h"
 #include "helper.h"
 
 
-#define NITER	2048
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
 #define SIZE	16
 #define SIZE	16
 
 
 float *tab;
 float *tab;
@@ -43,7 +47,7 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL);
+	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	tab = malloc(SIZE*sizeof(float));
 	tab = malloc(SIZE*sizeof(float));

+ 9 - 5
mpi/tests/mpi_irecv_detached.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -19,14 +19,18 @@
 #include <common/utils.h>
 #include <common/utils.h>
 #include "helper.h"
 #include "helper.h"
 
 
-#define NITER	2048
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
 #define SIZE	16
 #define SIZE	16
 
 
 float *tab;
 float *tab;
 starpu_data_handle_t tab_handle;
 starpu_data_handle_t tab_handle;
 
 
-static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
-static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
+static _starpu_pthread_mutex_t mutex = _STARPU_PTHREAD_MUTEX_INITIALIZER;
+static _starpu_pthread_cond_t cond = _STARPU_PTHREAD_COND_INITIALIZER;
 
 
 void callback(void *arg __attribute__((unused)))
 void callback(void *arg __attribute__((unused)))
 {
 {
@@ -58,7 +62,7 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL);
+	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	tab = malloc(SIZE*sizeof(float));
 	tab = malloc(SIZE*sizeof(float));

+ 6 - 2
mpi/tests/mpi_isend.c

@@ -18,7 +18,11 @@
 #include <starpu_mpi.h>
 #include <starpu_mpi.h>
 #include "helper.h"
 #include "helper.h"
 
 
-#define NITER	2048
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
 #define SIZE	16
 #define SIZE	16
 
 
 float *tab;
 float *tab;
@@ -43,7 +47,7 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL);
+	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	tab = malloc(SIZE*sizeof(float));
 	tab = malloc(SIZE*sizeof(float));

+ 21 - 13
mpi/tests/mpi_isend_detached.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -20,21 +20,22 @@
 #include <pthread.h>
 #include <pthread.h>
 #include "helper.h"
 #include "helper.h"
 
 
-#define NITER	2048
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
 #define SIZE	16
 #define SIZE	16
 
 
-static float *tab;
-static starpu_data_handle_t tab_handle;
+static _starpu_pthread_mutex_t mutex = _STARPU_PTHREAD_MUTEX_INITIALIZER;
+static _starpu_pthread_cond_t cond = _STARPU_PTHREAD_COND_INITIALIZER;
 
 
-static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
-static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
-
-void callback(void *arg __attribute__((unused)))
+void callback(void *arg)
 {
 {
-	unsigned *sent = arg;
+	unsigned *completed = arg;
 
 
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
-	*sent = 1;
+	*completed = 1;
 	_STARPU_PTHREAD_COND_SIGNAL(&cond);
 	_STARPU_PTHREAD_COND_SIGNAL(&cond);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 }
 }
@@ -42,6 +43,8 @@ void callback(void *arg __attribute__((unused)))
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
 	int ret, rank, size;
 	int ret, rank, size;
+	float *tab;
+	starpu_data_handle_t tab_handle;
 
 
 	MPI_Init(NULL, NULL);
 	MPI_Init(NULL, NULL);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
@@ -58,7 +61,7 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL);
+	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	tab = malloc(SIZE*sizeof(float));
 	tab = malloc(SIZE*sizeof(float));
@@ -83,8 +86,13 @@ int main(int argc, char **argv)
 		}
 		}
 		else
 		else
 		{
 		{
-			MPI_Status status;
-			starpu_mpi_recv(tab_handle, other_rank, loop, MPI_COMM_WORLD, &status);
+			int received = 0;
+			starpu_mpi_irecv_detached(tab_handle, other_rank, loop, MPI_COMM_WORLD, callback, &received);
+
+			_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+			while (!received)
+				_STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
+			_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 		}
 		}
 	}
 	}
 
 

+ 12 - 12
mpi/tests/mpi_reduction.c

@@ -65,16 +65,16 @@ int my_distrib(int x, int nb_nodes)
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
 	int my_rank, size, x, y, i;
 	int my_rank, size, x, y, i;
-        long int *vector;
+	long int *vector;
 	long int dot, sum=0;
 	long int dot, sum=0;
-        starpu_data_handle_t *handles;
+	starpu_data_handle_t *handles;
 	starpu_data_handle_t dot_handle;
 	starpu_data_handle_t dot_handle;
 
 
 	int nb_elements, step, loops;
 	int nb_elements, step, loops;
 
 
 	int ret = starpu_init(NULL);
 	int ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(&argc, &argv);
+	ret = starpu_mpi_init(&argc, &argv, 1);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
@@ -84,7 +84,7 @@ int main(int argc, char **argv)
 	loops = 5;
 	loops = 5;
 
 
 	vector = (long int *) malloc(nb_elements*sizeof(vector[0]));
 	vector = (long int *) malloc(nb_elements*sizeof(vector[0]));
-        for(x = 0; x < nb_elements; x+=step)
+	for(x = 0; x < nb_elements; x+=step)
 	{
 	{
 		int mpi_rank = my_distrib(x/step, size);
 		int mpi_rank = my_distrib(x/step, size);
 		if (mpi_rank == my_rank)
 		if (mpi_rank == my_rank)
@@ -94,7 +94,7 @@ int main(int argc, char **argv)
 				vector[x+y] = x+y+1;
 				vector[x+y] = x+y+1;
 			}
 			}
 		}
 		}
-        }
+	}
 	if (my_rank == 0) {
 	if (my_rank == 0) {
 		dot = 14;
 		dot = 14;
 		sum = (nb_elements * (nb_elements + 1)) / 2;
 		sum = (nb_elements * (nb_elements + 1)) / 2;
@@ -109,7 +109,7 @@ int main(int argc, char **argv)
 
 
 
 
 	handles = (starpu_data_handle_t *) malloc(nb_elements*sizeof(handles[0]));
 	handles = (starpu_data_handle_t *) malloc(nb_elements*sizeof(handles[0]));
-        for(x = 0; x < nb_elements; x+=step)
+	for(x = 0; x < nb_elements; x+=step)
 	{
 	{
 		int mpi_rank = my_distrib(x/step, size);
 		int mpi_rank = my_distrib(x/step, size);
 		if (mpi_rank == my_rank)
 		if (mpi_rank == my_rank)
@@ -146,10 +146,10 @@ int main(int argc, char **argv)
 		starpu_mpi_insert_task(MPI_COMM_WORLD, &display_codelet, STARPU_R, dot_handle, 0);
 		starpu_mpi_insert_task(MPI_COMM_WORLD, &display_codelet, STARPU_R, dot_handle, 0);
 	}
 	}
 
 
-        fprintf(stderr, "Waiting ...\n");
-        starpu_task_wait_for_all();
+	fprintf(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
 
 
-        for(x = 0; x < nb_elements; x+=step)
+	for(x = 0; x < nb_elements; x+=step)
 	{
 	{
 		if (handles[x]) starpu_data_unregister(handles[x]);
 		if (handles[x]) starpu_data_unregister(handles[x]);
 	}
 	}
@@ -165,10 +165,10 @@ int main(int argc, char **argv)
 
 
 	if (my_rank == 0)
 	if (my_rank == 0)
 	{
 	{
-                fprintf(stderr, "[%d] sum=%ld\n", my_rank, sum);
-                fprintf(stderr, "[%d] dot=%ld\n", my_rank, dot);
+		fprintf(stderr, "[%d] sum=%ld\n", my_rank, sum);
+		fprintf(stderr, "[%d] dot=%ld\n", my_rank, dot);
 		fprintf(stderr, "%s when computing reduction\n", (sum == dot) ? "Success" : "Error");
 		fprintf(stderr, "%s when computing reduction\n", (sum == dot) ? "Success" : "Error");
-        }
+	}
 
 
 	return 0;
 	return 0;
 }
 }

+ 15 - 15
mpi/tests/mpi_scatter_gather.c

@@ -19,7 +19,7 @@
 /* Returns the MPI node number where data indexes index is */
 /* Returns the MPI node number where data indexes index is */
 int my_distrib(int x, int y, int nb_nodes)
 int my_distrib(int x, int y, int nb_nodes)
 {
 {
-        return (x+y) % nb_nodes;
+	return (x+y) % nb_nodes;
 }
 }
 
 
 void cpu_codelet(void *descr[], void *_args)
 void cpu_codelet(void *descr[], void *_args)
@@ -32,7 +32,7 @@ void cpu_codelet(void *descr[], void *_args)
 	float factor;
 	float factor;
 
 
 	block = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
 	block = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
-        starpu_codelet_unpack_args(_args, &rank);
+	starpu_codelet_unpack_args(_args, &rank);
 	factor = block[0];
 	factor = block[0];
 
 
 	//fprintf(stderr,"rank %d factor %f\n", rank, factor);
 	//fprintf(stderr,"rank %d factor %f\n", rank, factor);
@@ -68,9 +68,9 @@ void rcallback(void *arg __attribute__((unused)))
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-        int rank, nodes;
+	int rank, nodes;
 	float ***bmat = NULL;
 	float ***bmat = NULL;
-        starpu_data_handle_t *data_handles;
+	starpu_data_handle_t *data_handles;
 
 
 	unsigned i,j,x,y;
 	unsigned i,j,x,y;
 
 
@@ -81,7 +81,7 @@ int main(int argc, char **argv)
 
 
 	int ret = starpu_init(NULL);
 	int ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(&argc, &argv);
+	ret = starpu_mpi_init(&argc, &argv, 1);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
 	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
@@ -135,10 +135,10 @@ int main(int argc, char **argv)
 #endif
 #endif
 
 
 	/* Allocate data handles and register data to StarPU */
 	/* Allocate data handles and register data to StarPU */
-        data_handles = malloc(nblocks*nblocks*sizeof(starpu_data_handle_t *));
-        for(x = 0; x < nblocks ;  x++)
+	data_handles = malloc(nblocks*nblocks*sizeof(starpu_data_handle_t *));
+	for(x = 0; x < nblocks ; x++)
 	{
 	{
-                for (y = 0; y < nblocks; y++)
+		for (y = 0; y < nblocks; y++)
 		{
 		{
 			int mpi_rank = my_distrib(x, y, nodes);
 			int mpi_rank = my_distrib(x, y, nodes);
 			if (rank == 0)
 			if (rank == 0)
@@ -158,19 +158,19 @@ int main(int argc, char **argv)
 				/* I know it's useless to allocate anything for this */
 				/* I know it's useless to allocate anything for this */
 				data_handles[x+y*nblocks] = NULL;
 				data_handles[x+y*nblocks] = NULL;
 			}
 			}
-                        if (data_handles[x+y*nblocks])
+			if (data_handles[x+y*nblocks])
 			{
 			{
-                                starpu_data_set_rank(data_handles[x+y*nblocks], mpi_rank);
-                                starpu_data_set_tag(data_handles[x+y*nblocks], (y*nblocks)+x);
+				starpu_data_set_rank(data_handles[x+y*nblocks], mpi_rank);
+				starpu_data_set_tag(data_handles[x+y*nblocks], (y*nblocks)+x);
 			}
 			}
-                }
-        }
+		}
+	}
 
 
 	/* Scatter the matrix among the nodes */
 	/* Scatter the matrix among the nodes */
 	starpu_mpi_scatter_detached(data_handles, nblocks*nblocks, 0, MPI_COMM_WORLD, scallback, "scatter", NULL, NULL);
 	starpu_mpi_scatter_detached(data_handles, nblocks*nblocks, 0, MPI_COMM_WORLD, scallback, "scatter", NULL, NULL);
 
 
 	/* Calculation */
 	/* Calculation */
-	for(x = 0; x < nblocks*nblocks ;  x++)
+	for(x = 0; x < nblocks*nblocks ; x++)
 	{
 	{
 		if (data_handles[x])
 		if (data_handles[x])
 		{
 		{
@@ -222,7 +222,7 @@ int main(int argc, char **argv)
 #endif
 #endif
 
 
 	// Free memory
 	// Free memory
-        free(data_handles);
+	free(data_handles);
 	if (rank == 0)
 	if (rank == 0)
 	{
 	{
 		for(x=0 ; x<nblocks ; x++)
 		for(x=0 ; x<nblocks ; x++)

+ 8 - 3
mpi/tests/mpi_test.c

@@ -18,7 +18,12 @@
 #include <starpu_mpi.h>
 #include <starpu_mpi.h>
 #include "helper.h"
 #include "helper.h"
 
 
-#define NITER	2048
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
+
 #define SIZE	16
 #define SIZE	16
 
 
 float *tab;
 float *tab;
@@ -43,7 +48,7 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL);
+	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	tab = malloc(SIZE*sizeof(float));
 	tab = malloc(SIZE*sizeof(float));
@@ -60,7 +65,7 @@ int main(int argc, char **argv)
 
 
 		if ((loop % 2) == (rank%2))
 		if ((loop % 2) == (rank%2))
 		{
 		{
-                        starpu_mpi_isend(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
+			starpu_mpi_isend(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
 		}
 		}
 		else
 		else
 		{
 		{

+ 41 - 36
mpi/tests/multiple_send.c

@@ -22,15 +22,15 @@
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
 	int ret, rank, size;
 	int ret, rank, size;
-        unsigned send[2] = {42, 11};
-        unsigned recv[2] = {33, 33};
-        starpu_mpi_req req[2];
-        starpu_data_handle_t send_handle[2];
-        starpu_data_handle_t recv_handle[2];
+	unsigned send[2] = {42, 11};
+	unsigned recv[2] = {33, 33};
+	starpu_mpi_req req[2];
+	starpu_data_handle_t send_handle[2];
+	starpu_data_handle_t recv_handle[2];
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(&argc, &argv);
+	ret = starpu_mpi_init(&argc, &argv, 1);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
@@ -40,8 +40,8 @@ int main(int argc, char **argv)
 		if (rank == 0)
 		if (rank == 0)
 			FPRINTF(stderr, "We need at least 2 processes.\n");
 			FPRINTF(stderr, "We need at least 2 processes.\n");
 
 
-                starpu_mpi_shutdown();
-                starpu_shutdown();
+		starpu_mpi_shutdown();
+		starpu_shutdown();
 		return STARPU_TEST_SKIPPED;
 		return STARPU_TEST_SKIPPED;
 	}
 	}
 
 
@@ -50,42 +50,47 @@ int main(int argc, char **argv)
 	starpu_variable_data_register(&recv_handle[0], 0, (uintptr_t)&recv[0], sizeof(unsigned));
 	starpu_variable_data_register(&recv_handle[0], 0, (uintptr_t)&recv[0], sizeof(unsigned));
 	starpu_variable_data_register(&recv_handle[1], 0, (uintptr_t)&recv[1], sizeof(unsigned));
 	starpu_variable_data_register(&recv_handle[1], 0, (uintptr_t)&recv[1], sizeof(unsigned));
 
 
-        if (rank == 0)
+	if (rank == 0)
 	{
 	{
-                starpu_mpi_isend(send_handle[0], &(req[0]), 1, 12, MPI_COMM_WORLD);
-                starpu_mpi_isend(send_handle[1], &(req[1]), 1, 13, MPI_COMM_WORLD);
-        }
-        else if (rank == 1)
+		starpu_mpi_isend(send_handle[0], &(req[0]), 1, 12, MPI_COMM_WORLD);
+		starpu_mpi_isend(send_handle[1], &(req[1]), 1, 13, MPI_COMM_WORLD);
+	}
+	else if (rank == 1)
 	{
 	{
-                starpu_mpi_irecv(recv_handle[0], &(req[0]), 0, 12, MPI_COMM_WORLD);
-                starpu_mpi_irecv(recv_handle[1], &(req[1]), 0, 13, MPI_COMM_WORLD);
-        }
+		starpu_mpi_irecv(recv_handle[0], &(req[0]), 0, 12, MPI_COMM_WORLD);
+		starpu_mpi_irecv(recv_handle[1], &(req[1]), 0, 13, MPI_COMM_WORLD);
+	}
 
 
-        if (rank == 0 || rank == 1)
+	if (rank == 0 || rank == 1)
 	{
 	{
-                int nb_req=2;
-                while (nb_req)
+		int nb_req=2;
+		while (nb_req)
 		{
 		{
-                        int r=0;
-                        for(r=0 ; r<2 ; r++)
+			int r=0;
+			for(r=0 ; r<2 ; r++)
 			{
 			{
-                                if (req[r])
+				if (req[r])
 				{
 				{
-                                        int finished = 0;
-                                        MPI_Status status;
-                                        starpu_mpi_test(&req[r], &finished, &status);
-                                        STARPU_ASSERT(finished != -1);
-                                        if (finished)
+					int finished = 0;
+					MPI_Status status;
+					starpu_mpi_test(&req[r], &finished, &status);
+					STARPU_ASSERT(finished != -1);
+					if (finished)
 					{
 					{
-                                                FPRINTF(stderr, "[%d] Request %d finished\n", rank, r);
-                                                req[r] = NULL;
-                                                nb_req--;
-                                        }
-                                }
-                        }
-                }
-        }
-        FPRINTF(stderr, "[%d] All requests finished\n", rank);
+						FPRINTF(stderr, "[%d] Request %d finished\n", rank, r);
+						req[r] = NULL;
+						nb_req--;
+					}
+				}
+			}
+		}
+	}
+	FPRINTF(stderr, "[%d] All requests finished\n", rank);
+
+	starpu_data_unregister(send_handle[0]);
+	starpu_data_unregister(send_handle[1]);
+	starpu_data_unregister(recv_handle[0]);
+	starpu_data_unregister(recv_handle[1]);
 
 
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();

+ 10 - 3
mpi/tests/pingpong.c

@@ -18,7 +18,12 @@
 #include <starpu_mpi.h>
 #include <starpu_mpi.h>
 #include "helper.h"
 #include "helper.h"
 
 
-#define NITER	2048
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
+
 #define SIZE	16
 #define SIZE	16
 
 
 float *tab;
 float *tab;
@@ -43,7 +48,7 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL);
+	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	tab = malloc(SIZE*sizeof(float));
 	tab = malloc(SIZE*sizeof(float));
@@ -69,9 +74,11 @@ int main(int argc, char **argv)
 		}
 		}
 	}
 	}
 
 
+	starpu_data_unregister(tab_handle);
+	free(tab);
+
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
-
 	MPI_Finalize();
 	MPI_Finalize();
 
 
 	return 0;
 	return 0;

+ 2 - 1
mpi/tests/ring.c

@@ -79,7 +79,7 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL);
+	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
 	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
@@ -119,6 +119,7 @@ int main(int argc, char **argv)
 		}
 		}
 	}
 	}
 
 
+	starpu_data_unregister(token_handle);
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
 
 

+ 2 - 1
mpi/tests/ring_async.c

@@ -79,7 +79,7 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL);
+	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
 	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
@@ -123,6 +123,7 @@ int main(int argc, char **argv)
 		}
 		}
 	}
 	}
 
 
+	starpu_data_unregister(token_handle);
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
 
 

+ 4 - 3
mpi/tests/ring_async_implicit.c

@@ -65,7 +65,7 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL);
+	ret = starpu_mpi_init(NULL, NULL, 1);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
@@ -118,13 +118,14 @@ int main(int argc, char **argv)
 
 
 	starpu_task_wait_for_all();
 	starpu_task_wait_for_all();
 
 
+	starpu_data_unregister(token_handle);
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
 
 
 	if (rank == last_rank)
 	if (rank == last_rank)
 	{
 	{
-                FPRINTF(stderr, "[%d] token = %u == %u * %d ?\n", rank, token, nloops, size);
-                STARPU_ASSERT(token == nloops*size);
+		FPRINTF(stderr, "[%d] token = %u == %u * %d ?\n", rank, token, nloops, size);
+		STARPU_ASSERT(token == nloops*size);
 	}
 	}
 
 
 	return 0;
 	return 0;

+ 97 - 0
mpi/tests/user_defined_datatype.c

@@ -0,0 +1,97 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <interface/complex_interface.h>
+#include <interface/complex_codelet.h>
+
+void display_double_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	double *foo = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	fprintf(stderr, "foo = %f\n", *foo);
+}
+
+struct starpu_codelet double_display =
+{
+	.cpu_funcs = {display_double_codelet, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_R}
+};
+
+void test_handle(starpu_data_handle_t handle, struct starpu_codelet *codelet, int rank)
+{
+	starpu_data_set_rank(handle, 1);
+	starpu_data_set_tag(handle, 42);
+
+	if (rank == 0)
+	{
+		starpu_insert_task(codelet, STARPU_R, handle, 0);
+	}
+	starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, handle, 0, NULL, NULL);
+	if (rank == 0)
+	{
+		starpu_insert_task(codelet, STARPU_R, handle, 0);
+	}
+}
+
+int main(int argc, char **argv)
+{
+	int rank, nodes;
+	int ret;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
+
+	if (nodes < 2)
+	{
+		fprintf(stderr, "This program needs at least 2 nodes\n");
+		ret = 77;
+	}
+	else
+	{
+		double real[2] = {0.0, 0.0};
+		double imaginary[2] = {0.0, 0.0};
+		double foo=8;
+		starpu_data_handle_t handle_complex;
+		starpu_data_handle_t handle_var;
+
+		if (rank == 1)
+		{
+			foo = 42;
+			real[0] = 12.0;
+			real[1] = 45.0;
+			imaginary[0] = 7.0;
+			imaginary[1] = 42.0;
+		}
+		starpu_complex_data_register(&handle_complex, 0, real, imaginary, 2);
+		starpu_variable_data_register(&handle_var, 0, (uintptr_t)&foo, sizeof(double));
+
+		test_handle(handle_var, &double_display, rank);
+		test_handle(handle_complex, &cl_display, rank);
+
+		starpu_data_unregister(handle_complex);
+		starpu_data_unregister(handle_var);
+	}
+	starpu_task_wait_for_all();
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return 0;
+}

+ 1 - 0
socl/examples/Makefile.am

@@ -63,6 +63,7 @@ SOCL_EXAMPLES +=		\
 basic_basic_SOURCES = basic/basic.c
 basic_basic_SOURCES = basic/basic.c
 clinfo_clinfo_SOURCES = clinfo/clinfo.c
 clinfo_clinfo_SOURCES = clinfo/clinfo.c
 matmul_matmul_SOURCES = matmul/matmul.c
 matmul_matmul_SOURCES = matmul/matmul.c
+matmul_matmul_LDADD = -lm
 mansched_mansched_SOURCES = mansched/mansched.c
 mansched_mansched_SOURCES = mansched/mansched.c
 
 
 #mandelbrot_mandelbrot_CPPFLAGS = $(AM_CPPFLAGS) $(AM_CFLAGS)
 #mandelbrot_mandelbrot_CPPFLAGS = $(AM_CPPFLAGS) $(AM_CFLAGS)

+ 3 - 3
socl/examples/clinfo/clinfo.c

@@ -288,9 +288,9 @@ main(void) {
 
 
                GET_STRING(CL_DEVICE_NAME, "  Name:\t\t\t\t\t\t %s\n", 256);
                GET_STRING(CL_DEVICE_NAME, "  Name:\t\t\t\t\t\t %s\n", 256);
                GET_STRING(CL_DEVICE_VENDOR, "  Vendor:\t\t\t\t\t %s\n", 256);
                GET_STRING(CL_DEVICE_VENDOR, "  Vendor:\t\t\t\t\t %s\n", 256);
-               GET_STRING(CL_DRIVER_VERSION, "  Driver version:\t\t\t\t %s\n", 10);
-               GET_STRING(CL_DEVICE_PROFILE, "  Profile:\t\t\t\t\t %s\n", 30);
-               GET_STRING(CL_DEVICE_VERSION, "  Version:\t\t\t\t\t %s\n", 50);
+               GET_STRING(CL_DRIVER_VERSION, "  Driver version:\t\t\t\t %s\n", 256);
+               GET_STRING(CL_DEVICE_PROFILE, "  Profile:\t\t\t\t\t %s\n", 256);
+               GET_STRING(CL_DEVICE_VERSION, "  Version:\t\t\t\t\t %s\n", 256);
                GET_STRING(CL_DEVICE_EXTENSIONS, "  Extensions:\t\t\t\t\t %s\n", 4096);
                GET_STRING(CL_DEVICE_EXTENSIONS, "  Extensions:\t\t\t\t\t %s\n", 4096);
 
 
                printf("\n");
                printf("\n");

+ 54 - 59
socl/src/cl_createbuffer.c

@@ -54,6 +54,8 @@ soclCreateBuffer(cl_context   context,
                void *       host_ptr,
                void *       host_ptr,
                cl_int *     errcode_ret) CL_API_SUFFIX__VERSION_1_0
                cl_int *     errcode_ret) CL_API_SUFFIX__VERSION_1_0
 {
 {
+   cl_mem mem;
+
    if (errcode_ret != NULL)
    if (errcode_ret != NULL)
       *errcode_ret = CL_SUCCESS;
       *errcode_ret = CL_SUCCESS;
 
 
@@ -81,68 +83,61 @@ soclCreateBuffer(cl_context   context,
       return NULL;
       return NULL;
    }
    }
 
 
-   {
-      cl_mem mem;
 
 
-      //Alloc cl_mem structure
-      mem = (cl_mem)gc_entity_alloc(sizeof(struct _cl_mem), release_callback_memobject);
-      if (mem == NULL) {
+   //Alloc cl_mem structure
+   mem = (cl_mem)gc_entity_alloc(sizeof(struct _cl_mem), release_callback_memobject);
+   if (mem == NULL) {
+      if (errcode_ret != NULL)
+         *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+      return NULL;
+   }
+   
+   mem->ptr = NULL;
+   mem->map_count = 0;
+   gc_entity_store(&mem->context, context);
+   mem->flags = flags;
+   mem->size = size;
+   mem->host_ptr = host_ptr;
+
+   #ifdef DEBUG
+   static int id = 0;
+   mem->id = id++;
+   #endif
+
+   mem_object_store(mem);
+
+   //TODO: we shouldn't allocate the buffer ourselves. StarPU allocates it if a NULL pointer is given
+
+   // If not MEM_USE_HOST_PTR, we need to alloc the buffer ourselves
+   if (!(flags & CL_MEM_USE_HOST_PTR)) {
+      mem->ptr = valloc(size);
+      if (mem->ptr == NULL) {
          if (errcode_ret != NULL)
          if (errcode_ret != NULL)
-            *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+            *errcode_ret = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+         free(mem);
          return NULL;
          return NULL;
       }
       }
-      
-      mem->ptr = NULL;
-      mem->map_count = 0;
-      gc_entity_store(&mem->context, context);
-      mem->flags = flags;
-      mem->size = size;
-      mem->host_ptr = host_ptr;
-
-      #ifdef DEBUG
-      static int id = 0;
-      mem->id = id++;
-      #endif
-
-      mem_object_store(mem);
-
-      //TODO: we shouldn't allocate the buffer ourselves. StarPU allocates it if a NULL pointer is given
-
-      // If not MEM_USE_HOST_PTR, we need to alloc the buffer ourselves
-      if (!(flags & CL_MEM_USE_HOST_PTR)) {
-         mem->ptr = valloc(size);
-         if (mem->ptr == NULL) {
-            if (errcode_ret != NULL)
-               *errcode_ret = CL_MEM_OBJECT_ALLOCATION_FAILURE;
-            free(mem);
-            return NULL;
-         }
-         //The buffer doesn't contain meaningful data
-         mem->scratch = 1;
-      }
-      else {
-         //The buffer may contain meaningful data
-         mem->scratch = 0;
-         mem->ptr = host_ptr;
-      }
-
-      // Access mode
-      if (flags & CL_MEM_READ_ONLY)
-         mem->mode = CL_MEM_READ_ONLY;
-      else if (flags & CL_MEM_WRITE_ONLY)
-         mem->mode = CL_MEM_WRITE_ONLY;
-      else
-         mem->mode = CL_MEM_READ_WRITE;
-
-      // Perform data copy if necessary
-      if (flags & CL_MEM_COPY_HOST_PTR)
-         memcpy(mem->ptr, host_ptr, size);
-      
-      // Create StarPU buffer (on home node? what's this?)
-      starpu_variable_data_register(&mem->handle, 0, (uintptr_t)mem->ptr, size); 
-
-      DEBUG_MSG("[Buffer %d] Initialized (cl_mem %p handle %p)\n", mem->id, mem, mem->handle);
-      
-      return mem;
+      //The buffer doesn't contain meaningful data
+      mem->scratch = 1;
+   }
+   else {
+      //The buffer may contain meaningful data
+      mem->scratch = 0;
+      mem->ptr = host_ptr;
    }
    }
+
+   // Access mode
+   mem->mode = flags & CL_MEM_READ_ONLY  ? CL_MEM_READ_ONLY :
+               flags & CL_MEM_WRITE_ONLY ? CL_MEM_WRITE_ONLY : CL_MEM_READ_WRITE;
+
+   // Perform data copy if necessary
+   if (flags & CL_MEM_COPY_HOST_PTR)
+      memcpy(mem->ptr, host_ptr, size);
+   
+   // Create StarPU buffer (on home node? what's this?)
+   starpu_variable_data_register(&mem->handle, 0, (uintptr_t)mem->ptr, size); 
+
+   DEBUG_MSG("[Buffer %d] Initialized (cl_mem %p handle %p)\n", mem->id, mem, mem->handle);
+   
+   return mem;
 }
 }

+ 0 - 0
socl/src/cl_enqueuendrangekernel.c


Some files were not shown because too many files changed in this diff