Browse Source

new merge with trunk, heft & dmda important changes

Andra Hugo 13 years ago
parent
commit
cab880bfd5
100 changed files with 3267 additions and 1420 deletions
  1. 3 1
      .gitignore
  2. 9 11
      AUTHORS
  3. 6 1
      ChangeLog
  4. 24 0
      Makefile.am
  5. 5 3
      README
  6. 159 0
      build-aux/pmccabe.css
  7. 907 0
      build-aux/pmccabe2html
  8. 81 32
      configure.ac
  9. 2 1
      doc/Makefile.am
  10. 7 8
      doc/chapters/advanced-api.texi
  11. 18 10
      doc/chapters/advanced-examples.texi
  12. 42 14
      doc/chapters/basic-api.texi
  13. 16 16
      doc/chapters/basic-examples.texi
  14. 1 1
      doc/chapters/benchmarks.texi
  15. 234 109
      doc/chapters/configuration.texi
  16. 0 1
      doc/chapters/fdl-1.3.texi
  17. 10 10
      doc/chapters/installing.texi
  18. 2 2
      doc/chapters/introduction.texi
  19. 39 10
      doc/chapters/mpi-support.texi
  20. 131 5
      doc/chapters/perf-feedback.texi
  21. 83 5
      doc/chapters/perf-optimization.texi
  22. 6 7
      doc/chapters/scaling-vector-example.texi
  23. 2 3
      doc/chapters/using.texi
  24. 1 1
      doc/chapters/vector_scal_cpu.texi
  25. 7 2
      doc/starpu.texi
  26. 2 3
      examples/basic_examples/vector_scal.c
  27. 1 1
      examples/basic_examples/vector_scal_c.c
  28. 1 1
      examples/basic_examples/vector_scal_cpu.c
  29. 1 1
      examples/basic_examples/vector_scal_cpu_icc.icc
  30. 1 1
      examples/basic_examples/vector_scal_cpu_template.h
  31. 1 1
      examples/basic_examples/vector_scal_cuda.cu
  32. 1 1
      examples/basic_examples/vector_scal_opencl.c
  33. 16 4
      examples/cholesky/cholesky.h
  34. 10 6
      examples/cholesky/cholesky_grain_tag.c
  35. 18 10
      examples/cholesky/cholesky_implicit.c
  36. 10 6
      examples/cholesky/cholesky_tag.c
  37. 8 6
      examples/cholesky/cholesky_tile_tag.c
  38. 37 107
      examples/filters/custom_mf/custom_interface.c
  39. 34 20
      examples/interface/complex.c
  40. 10 6
      examples/interface/complex_codelet.h
  41. 21 73
      examples/interface/complex_interface.c
  42. 19 11
      examples/matvecmult/matvecmult.c
  43. 7 5
      examples/mult/xgemm.c
  44. 1 1
      examples/openmp/vector_scal.c
  45. 1 1
      examples/spmd/vector_scal_spmd.c
  46. 5 3
      examples/stencil/stencil.c
  47. 6 2
      include/starpu.h
  48. 4 0
      include/starpu_config.h.in
  49. 1 0
      include/starpu_data.h
  50. 5 0
      include/starpu_data_interfaces.h
  51. 1 1
      include/starpu_perfmodel.h
  52. 5 5
      include/starpu_scheduler.h
  53. 3 3
      include/starpu_util.h
  54. 21 9
      mpi/examples/Makefile.am
  55. 14 14
      mpi/examples/cholesky/mpi_cholesky.c
  56. 7 7
      mpi/examples/cholesky/mpi_cholesky.h
  57. 2 2
      mpi/examples/cholesky/mpi_cholesky_codelets.c
  58. 13 13
      mpi/examples/cholesky/mpi_cholesky_distributed.c
  59. 1 1
      mpi/examples/cholesky/mpi_cholesky_kernels.c
  60. 44 10
      mpi/examples/complex/mpi_complex.c
  61. 13 13
      mpi/examples/mpi_lu/plu_example.c
  62. 54 55
      mpi/examples/mpi_lu/plu_solve.c
  63. 1 1
      mpi/examples/mpi_lu/pxlu.c
  64. 12 14
      mpi/examples/mpi_lu/pxlu_kernels.c
  65. 61 61
      mpi/examples/stencil/stencil5.c
  66. 5 1
      mpi/include/starpu_mpi.h
  67. 181 152
      mpi/src/starpu_mpi.c
  68. 5 7
      mpi/src/starpu_mpi_collective.c
  69. 65 17
      mpi/src/starpu_mpi_datatype.c
  70. 2 1
      mpi/src/starpu_mpi_datatype.h
  71. 118 76
      mpi/src/starpu_mpi_insert_task.c
  72. 2 2
      mpi/src/starpu_mpi_insert_task.h
  73. 3 3
      mpi/src/starpu_mpi_private.h
  74. 32 9
      mpi/tests/Makefile.am
  75. 6 1
      mpi/tests/block_interface.c
  76. 7 1
      mpi/tests/block_interface_pinned.c
  77. 60 58
      mpi/tests/insert_task.c
  78. 72 70
      mpi/tests/insert_task_block.c
  79. 18 13
      mpi/tests/insert_task_cache.c
  80. 52 49
      mpi/tests/insert_task_owner.c
  81. 48 47
      mpi/tests/insert_task_owner2.c
  82. 26 21
      mpi/tests/insert_task_owner_data.c
  83. 6 2
      mpi/tests/mpi_detached_tag.c
  84. 6 2
      mpi/tests/mpi_irecv.c
  85. 9 5
      mpi/tests/mpi_irecv_detached.c
  86. 6 2
      mpi/tests/mpi_isend.c
  87. 21 13
      mpi/tests/mpi_isend_detached.c
  88. 12 12
      mpi/tests/mpi_reduction.c
  89. 15 15
      mpi/tests/mpi_scatter_gather.c
  90. 8 3
      mpi/tests/mpi_test.c
  91. 41 36
      mpi/tests/multiple_send.c
  92. 10 3
      mpi/tests/pingpong.c
  93. 2 1
      mpi/tests/ring.c
  94. 2 1
      mpi/tests/ring_async.c
  95. 4 3
      mpi/tests/ring_async_implicit.c
  96. 97 0
      mpi/tests/user_defined_datatype.c
  97. 1 0
      socl/examples/Makefile.am
  98. 3 3
      socl/examples/clinfo/clinfo.c
  99. 54 59
      socl/src/cl_createbuffer.c
  100. 0 0
      socl/src/cl_enqueuendrangekernel.c

+ 3 - 1
.gitignore

@@ -4,6 +4,7 @@
 /autom4te.cache
 /autom4te.cache
 /libtool
 /libtool
 /aclocal.m4
 /aclocal.m4
+/build
 /build-aux
 /build-aux
 /GPATH
 /GPATH
 /GRTAGS
 /GRTAGS
@@ -28,7 +29,7 @@ starpu.log
 /tests/datawizard/handle_to_pointer
 /tests/datawizard/handle_to_pointer
 /tests/datawizard/data_lookup
 /tests/datawizard/data_lookup
 /doc/stamp-vti
 /doc/stamp-vti
-/doc/version.texi
+/doc/chapters/version.texi
 /examples/basic_examples/block
 /examples/basic_examples/block
 /examples/basic_examples/hello_world
 /examples/basic_examples/hello_world
 /examples/basic_examples/mult
 /examples/basic_examples/mult
@@ -289,3 +290,4 @@ starpu.log
 /gcc-plugin/tests/opencl
 /gcc-plugin/tests/opencl
 /gcc-plugin/tests/registered
 /gcc-plugin/tests/registered
 /gcc-plugin/tests/warn-unregistered
 /gcc-plugin/tests/warn-unregistered
+/cyclomatic-complexity.html

+ 9 - 11
AUTHORS

@@ -1,19 +1,17 @@
 Cédric Augonnet <cedric.augonnet@inria.fr>
 Cédric Augonnet <cedric.augonnet@inria.fr>
-Nicolas Collin <nicolas.collin@inria.fr>
+William Braik <wbraik@gmail.com>
 Jérôme Clet-Ortega <jerome.clet-ortega@labri.fr>
 Jérôme Clet-Ortega <jerome.clet-ortega@labri.fr>
 Nicolas Collin <nicolas.collin@inria.fr>
 Nicolas Collin <nicolas.collin@inria.fr>
+Yann Courtois <yann.courtois33@gmail.com>
+Jean-Marie Couteyen <jm.couteyen@gmail.com>
 Nathalie Furmento <nathalie.furmento@labri.fr>
 Nathalie Furmento <nathalie.furmento@labri.fr>
+David Gómez <david_gomez1380@yahoo.com.mx>
 Sylvain Henry <sylvain.henry@inria.fr>
 Sylvain Henry <sylvain.henry@inria.fr>
+Mehdi Juhoor <mjuhoor@gmail.com>
+Antoine Lucas <antoine.lucas.33@gmail.com>
+Nguyen Quôc-Dinh <nguyen.quocdinh@gmail.com>
 Cyril Roelandt <cyril.roelandt@inria.fr>
 Cyril Roelandt <cyril.roelandt@inria.fr>
+Anthony Roy <theanthony33@gmail.com>
 François Tessier <francois.tessier@inria.fr>
 François Tessier <francois.tessier@inria.fr>
 Samuel Thibault <samuel.thibault@labri.fr>
 Samuel Thibault <samuel.thibault@labri.fr>
-Pierre André Wacrenier <wacrenier@labri.fr>
-William Braik <wbraik@gmail.com>
-Yann Courtois <yann.courtois33@gmail.com>
-Jean-Marie Couteyen <jm.couteyen@gmail.com>
-Mehdi Juhoor <mjuhoor@gmail.com>
-Anthony Roy <theanthony33@gmail.com>
-David Gómez <david_gomez1380@yahoo.com.mx>
-Nguyen Quôc Dinh <nguyen.quocdinh@gmail.com>
-Antoine Lucas <antoine.lucas.33@gmail.com>
-
+Pierre-André Wacrenier <wacrenier@labri.fr>

+ 6 - 1
ChangeLog

@@ -56,6 +56,11 @@ New features:
         - When exchanging user-defined data interfaces, the size of
         - When exchanging user-defined data interfaces, the size of
 	  the data is the size returned by the pack operation, i.e
 	  the data is the size returned by the pack operation, i.e
 	  data with dynamic size can now be exchanged with StarPU-MPI.
 	  data with dynamic size can now be exchanged with StarPU-MPI.
+  * Add experimental simgrid support, to simulate execution with various
+    number of CPUs, GPUs, amount of memory, etc.
+  * Add support for OpenCL simulators (which provide simulated execution time)
+  * Add support for Temanejo, a task graph debugger
+  * Theoretical bound lp output now includes data transfer time.
 
 
 Changes:
 Changes:
   * Fix the block filter functions.
   * Fix the block filter functions.
@@ -80,6 +85,7 @@ Changes:
   * Cell:
   * Cell:
     - It is no longer possible to enable the cell support via the
     - It is no longer possible to enable the cell support via the
       gordon driver
       gordon driver
+  * Fix data transfer arrows in paje traces
 
 
 Small changes:
 Small changes:
   * STARPU_NCPU should now be used instead of STARPU_NCPUS. STARPU_NCPUS is
   * STARPU_NCPU should now be used instead of STARPU_NCPUS. STARPU_NCPUS is
@@ -91,7 +97,6 @@ Small changes:
   * Fix forcing calibration of never-calibrated archs.
   * Fix forcing calibration of never-calibrated archs.
   * CUDA applications are no longer compiled with the "-arch sm_13"
   * CUDA applications are no longer compiled with the "-arch sm_13"
     option. It is specifically added to applications which need it.
     option. It is specifically added to applications which need it.
-  * Documentation is not built if necessary tools are missing
 
 
 StarPU 1.0.3 (svn revision 7379)
 StarPU 1.0.3 (svn revision 7379)
 ==============================================
 ==============================================

+ 24 - 0
Makefile.am

@@ -124,3 +124,27 @@ showcheck:
 
 
 ctags-local:
 ctags-local:
 	$(CTAGS) -R -I LIST_TYPE
 	$(CTAGS) -R -I LIST_TYPE
+
+
+# Cyclomatic complexity reports.
+
+# The pmccabe tool, see <http://www.parisc-linux.org/~bame/pmccabe/>.
+PMCCABE = pmccabe
+
+VC_URL = "https://gforge.inria.fr/scm/viewvc.php/trunk/%FILENAME%?view=markup&root=starpu"
+
+# Generate a cyclomatic complexity report.  Note that examples and tests are
+# excluded because they're not particularly relevant, and more importantly
+# they all have a function called `main', which clobbers the report.
+cyclomatic-complexity.html:
+	$(PMCCABE)								\
+	  `find \( -name examples -o -name tests -o -path ./tools/dev/experimental \) -prune -o -name \*.c` \
+	  | sort -nr								\
+	  | $(AWK) -f ${top_srcdir}/build-aux/pmccabe2html			\
+		   -v lang=html -v name="$(PACKAGE_NAME)"			\
+		   -v vcurl=$(VC_URL)						\
+		   -v url="$(PACKAGE_URL)"					\
+		   -v css=${top_srcdir}/build-aux/pmccabe.css			\
+		   -v cut_dir=${top_srcdir}/					\
+		   > $@-tmp
+	mv $@-tmp $@

+ 5 - 3
README

@@ -31,7 +31,7 @@ executed as efficiently as possible.
 +------------------------
 +------------------------
 | I.b. What StarPU is not
 | I.b. What StarPU is not
 
 
-StarPU is not a new language, and it does not extends existing languages either.
+StarPU is not a new language, and it does not extend existing languages either.
 StarPU does not help to write computation kernels.
 StarPU does not help to write computation kernels.
 
 
 +---------------------------------
 +---------------------------------
@@ -76,11 +76,13 @@ advantage of their specificities in a portable fashion.
    units according to the machine topology. For more details on hwloc, see
    units according to the machine topology. For more details on hwloc, see
    http://www.open-mpi.org/projects/hwloc/ .
    http://www.open-mpi.org/projects/hwloc/ .
 
 
- * To build the StarPU-Top tool the following are also required:
-   * libqt4 >= 4.7
+ * To build the StarPU-Top tool the following packages (along with
+   their development files) are also required:
+   * libqt4-dev >= 4.7
    * libqt4-network
    * libqt4-network
    * libqt4-opengl
    * libqt4-opengl
    * libqt4-sql
    * libqt4-sql
+   * qt4-qmake
 
 
 ++=====================++
 ++=====================++
 || III. Getting StarPU ||
 || III. Getting StarPU ||

+ 159 - 0
build-aux/pmccabe.css

@@ -0,0 +1,159 @@
+body {
+    font-family: Helvetica, sans-serif;
+}
+
+.page_title {
+    font: 18pt Georgia, serif;
+    color: darkred;
+}
+
+.section_title {
+    font: 14pt Georgia, serif;
+    color: darkred;
+}
+
+.report_timestamp {
+    color: darkred;
+    font-weight: bold;
+}
+
+.function_src {
+    text-align: left;
+    background: white;
+}
+
+.resume_table {
+}
+
+.resume_header_entry {
+    color: black;
+}
+
+.resume_number_entry {
+    color: darkred;
+    font-weight: bold;
+    text-align: right;
+}
+
+.ranges_table {
+    border-spacing: 0px;
+    border-bottom: solid 2px black;
+    border-top: solid 2px black;
+    border-left: solid 2px black;
+    border-right: solid 2px black;
+}
+
+.ranges_header_entry {
+    padding: 5px;
+    border-bottom: solid 1px black;
+    font-size: 1em;
+    font-weight: bold;
+    color: darkred;
+    text-align: left;
+}
+
+.ranges_entry {
+}
+
+.ranges_entry_simple {
+    background: #87ff75;
+}
+
+.ranges_entry_moderate {
+    background: #fffc60;
+}
+
+.ranges_entry_high {
+    background: #ff5a5d;
+}
+
+.ranges_entry_untestable {
+    background: #993300
+}
+
+
+.function_table {
+    border-spacing: 0px;
+    border-bottom: solid 2px black;
+    border-top: solid 2px black;
+    border-left: solid 2px black;
+    border-right: solid 2px black;
+}
+
+.function_table_caption {
+    font-size: 1.1em;
+    font-weight: bold;
+    color: black;
+    padding: 5px;
+}
+
+.function_table_header {
+}
+
+
+.function_table_header_entry {
+    padding: 5px;
+    border-bottom: solid 1px black;
+    font-size: 1em;
+    font-weight: bold;
+    color: darkred;
+    text-align: left;
+}
+
+.function_entry {
+}
+
+
+.function_entry_simple {
+    background: #87ff75;
+}
+
+.function_entry_moderate {
+    background: #fffc60;
+}
+
+.function_entry_high {
+    background: #ff5a5d;
+}
+
+.function_entry_untestable {
+    background: #993300
+}
+
+
+.function_entry_name {
+    font-size: 1em;
+    text-align: left;
+    font-weight: bold;
+    text-valign: top;
+
+    border-top: solid 1px black;
+    padding: 3px;
+}
+
+.function_entry_cyclo {
+    font-size: 1em;
+    text-align: right;
+    text-valign: top;
+
+    border-top: solid 1px black;
+    padding: 3px;
+}
+
+.function_entry_number {
+    font-size: 1em;
+    text-align: right;
+    text-valign: top;
+
+    border-top: solid 1px black;
+    padding: 3px;
+}
+
+.function_entry_filename {
+    font-size: 1em;
+    text-align: left;
+    text-valign: top;
+
+    border-top: solid 1px black;
+    padding: 3px;
+}

+ 907 - 0
build-aux/pmccabe2html

@@ -0,0 +1,907 @@
+# pmccabe2html - AWK script to convert pmccabe output to html       -*- awk -*-
+
+# Copyright (C) 2007-2012 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# Written by Jose E. Marchesi <jemarch@gnu.org>.
+# Adapted for gnulib by Simon Josefsson <simon@josefsson.org>.
+# Added support for C++ by Giuseppe Scrivano <gscrivano@gnu.org>.
+
+# Typical Invocation is from a Makefile.am:
+#
+# CYCLO_SOURCES = ${top_srcdir}/src/*.[ch]
+#
+# cyclo-$(PACKAGE).html: $(CYCLO_SOURCES)
+# 	$(PMCCABE) $(CYCLO_SOURCES) \
+# 		| sort -nr \
+# 		| $(AWK) -f ${top_srcdir}/build-aux/pmccabe2html \
+# 			-v lang=html -v name="$(PACKAGE_NAME)" \
+# 			-v vcurl="http://git.savannah.gnu.org/gitweb/?p=$(PACKAGE).git;a=blob;f=%FILENAME%;hb=HEAD" \
+# 			-v url="http://www.gnu.org/software/$(PACKAGE)/" \
+# 			-v css=${top_srcdir}/build-aux/pmccabe.css \
+# 			-v cut_dir=${top_srcdir}/ \
+# 			> $@-tmp
+# 	mv $@-tmp $@
+#
+# The variables available are:
+#   lang     output language, either 'html' or 'wiki'
+#   name     project name
+#   url      link to project's home page
+#   vcurl    URL to version controlled source code browser,
+#            a %FILENAME% in the string is replaced with the relative
+#            source filename
+#   css      CSS stylesheet filename, included verbatim in HTML output
+#   css_url  link to CSS stylesheet, an URL
+
+# Prologue & configuration
+BEGIN {
+    section_global_stats_p = 1
+    section_function_cyclo_p = 1
+
+    # "html" or "wiki"
+    package_name = name
+    output_lang = lang
+
+    # General Options
+    cyclo_simple_max = 10
+    cyclo_moderate_max = 20
+    cyclo_high_max = 50
+    source_file_link_tmpl = vcurl
+
+    # HTML options
+    if (url != "")
+    {
+	html_prolog = "<a href=\"" url "\">Back to " package_name " Homepage</a><br/><br/>"
+    }
+    html_epilog = "<hr color=\"black\" size=\"2\"/> \
+Copyright (c) 2007, 2008 Free Software Foundation, Inc."
+    html_doctype = "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \
+\"http://www.w3.org/TR/html401/loose.dtd\">"
+    html_comment = "<!-- Generated by gnulib's pmccabe2html at " systime() " -->"
+    html_title = "Cyclomatic Complexity report for " package_name
+
+    # Wiki options
+    wiki_prolog = "{{Note|This page has been automatically generated}}"
+    wiki_epilog = ""
+
+    # Internal variables
+    nfuncs = 0;
+}
+
+# Functions
+
+function build_stats()
+{
+    # Maximum modified cyclo
+    for (fcn in mcyclo)
+    {
+        num_of_functions++
+        if (mcyclo[fcn] > max_mcyclo)
+        {
+            max_mcyclo = mcyclo[fcn]
+        }
+
+        if (mcyclo[fcn] > cyclo_high_max)
+        {
+            num_of_untestable_functions++
+        }
+        else if (mcyclo[fcn] > cyclo_moderate_max)
+        {
+            num_of_high_functions++
+        }
+        else if (mcyclo[fcn] > cyclo_simple_max)
+        {
+            num_of_moderate_functions++
+        }
+        else
+        {
+            num_of_simple_functions++
+        }
+    }
+}
+
+function html_fnc_table_complete (caption)
+{
+    html_fnc_table(caption, 1, 1, 0, 1, 1, 0, 1)
+}
+
+function html_fnc_table_abbrev (caption)
+{
+    html_fnc_table(caption, 1, 1, 0, 0, 1, 0, 0)
+}
+
+
+function html_fnc_table (caption,
+                         fname_p,
+                         mcyclo_p,
+                         cyclo_p,
+                         num_statements_p,
+                         num_lines_p,
+                         first_line_p,
+                         file_p)
+{
+    print "<table width=\"90%\" class=\"function_table\" cellpadding=\"0\" cellspacing=\"0\">"
+    if (caption != "")
+    {
+        print "<caption class=\"function_table_caption\">" caption "</caption>"
+    }
+    html_fnc_header(fname_p,
+                    mcyclo_p,
+                    cyclo_p,
+                    num_statements_p,
+                    num_lines_p,
+                    first_line_p,
+                    file_p)
+    for (nfnc = 1; nfnc <= nfuncs; nfnc++)
+    {
+        html_fnc(nfnc,
+                 fname_p,
+                 mcyclo_p,
+                 cyclo_p,
+                 num_statements_p,
+                 num_lines_p,
+                 first_line_p,
+                 file_p)
+    }
+    print "</table>"
+}
+
+function html_header ()
+{
+    print html_doctype
+    print "<html>"
+    print html_comment
+    print "<head>"
+    print "<title>" html_title "</title>"
+    print ""
+    print "<meta name=\"description\" content=\"" html_title "\">"
+    print "<meta name=\"keywords\" content=\"" html_title "\">"
+    print "<meta name=\"resource-type\" content=\"document\">"
+    print "<meta name=\"distribution\" content=\"global\">"
+    print "<meta name=\"Generator\" content=\"pmccabe2html\">"
+    print "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">"
+    print "<script language=\"javascript\" type=\"text/javascript\">"
+    print "function show_hide(idCapa, idButton, fuerzaVisibilidad)\
+{\
+        var button = document.getElementById(idButton);\
+	var capa = document.getElementById(idCapa);\
+	if (capa)\
+	{\
+		if (fuerzaVisibilidad && fuerzaVisibilidad!=\"\") {\
+			if (fuerzaVisibilidad==\"visible\") capa.style.display=\"\";\
+			else capa.style.display=\"none\";\
+		}\
+		else\
+		{\
+			if (capa.style.display == \"none\") {\
+				capa.style.display = \"\";\
+                                button.innerHTML = \"&uarr;\";\
+			} else {\
+				capa.style.display = \"none\";\
+                                button.innerHTML = \"&darr;\";     \
+			}\
+		}\
+	}\
+}"
+    print "</script>"
+
+
+    if (css_url != "")
+    {
+        print "<link rel=\"stylesheet\" href=\"" css_url "\" type =\"text/css\" media=\"screen\"/>"
+    }
+    if (css != "")
+    {
+        print "<style type =\"text/css\" media=\"screen\">"
+	print "<!--"
+        while ((getline cssline < css) > 0)
+        {
+	    print cssline
+	}
+        print "-->"
+	print "</style />"
+	close(css)
+    }
+    print "</head>"
+    print "<body lang=\"en\" bgcolor=\"#FFFFFF\" text=\"#000000\" link=\"#0000FF\" \
+vlink=\"#800080\" alink=\"#FF0000\">"
+}
+
+function html_footer ()
+{
+    print "</body>"
+    print "</html>"
+}
+
+function html_fnc_header (fname_p,
+                          mcyclo_p,
+                          cyclo_p,
+                          num_statements_p,
+                          num_lines_p,
+                          first_line_p,
+                          file_p)
+{
+    print "<tr class=\"function_table_header\">"
+    if (fname_p)
+    {
+        # Function name
+        print "<td class=\"function_table_header_entry\">"
+        print ""
+        print "</td>"
+
+        print "<td class=\"function_table_header_entry\">"
+        print "Function Name"
+        print "</td>"
+    }
+    if (mcyclo_p)
+    {
+        # Modified cyclo
+        print "<td class=\"function_table_header_entry\">"
+        print "Modified Cyclo"
+        print "</td>"
+    }
+    if (cyclo_p)
+    {
+        # Cyclo
+        print "<td class=\"function_table_header_entry\">"
+        print "Cyclomatic"
+        print "<br/>"
+        print "Complexity"
+        print "</td>"
+    }
+    if (num_statements_p)
+    {
+        print "<td class=\"function_table_header_entry\">"
+        print "Number of"
+        print "<br/>"
+        print "Statements"
+        print "</td>"
+    }
+    if (num_lines_p)
+    {
+        print "<td class=\"function_table_header_entry\">"
+        print "Number of"
+        print "<br/>"
+        print "Lines"
+        print "</td>"
+    }
+    if (first_line_p)
+    {
+        print "<td class=\"function_table_header_entry\">"
+        print "First Line"
+        print "</td>"
+    }
+    if (file_p)
+    {
+        print "<td class=\"function_table_header_entry\">"
+        print "Source File"
+        print "</td>"
+
+    }
+    print "</tr>"
+}
+
+function html_fnc (nfun,
+                   fname_p,
+                   mcyclo_p,
+                   cyclo_p,
+                   num_statements_p,
+                   num_lines_p,
+                   first_line_p,
+                   file_p)
+{
+    fname = fnames[nfun]
+
+    # Function name
+    trclass = "function_entry_simple"
+    if (mcyclo[nfun] > cyclo_high_max)
+    {
+        trclass="function_entry_untestable"
+    }
+    else if (mcyclo[nfun] > cyclo_moderate_max)
+    {
+        trclass="function_entry_high"
+    }
+    else if (mcyclo[nfun] > cyclo_simple_max)
+    {
+        trclass="function_entry_moderate"
+    }
+
+    print "<tr class=\"" trclass "\">"
+    if (fname_p)
+    {
+        print "<td class=\"function_entry_filename\">"
+        if (file_p && mcyclo[nfun] > cyclo_simple_max)
+        {
+            print "<a href=\"javascript:void(0);\" title=\"show/hide function source\" onClick=\"javascript:show_hide('" fname "_src', '" fname "_button')\">\
+<span id=\"" fname "_button\">&darr;</span></a>"
+        }
+        else
+        {
+            print "&nbsp;"
+        }
+        print "</td>"
+
+        print "<td class=\"function_entry_name\">"
+        print fname
+        print "</td>"
+    }
+    if (mcyclo_p)
+    {
+        # Modified cyclo
+        print "<td class=\"function_entry_cyclo\">"
+        print mcyclo[nfun]
+        print "</td>"
+    }
+    if (cyclo_p)
+    {
+        # Cyclo
+        print "<td class=\"function_entry_cyclo\">"
+        print cyclo[nfun]
+        print "</td>"
+    }
+    if (num_statements_p)
+    {
+        # Number of statements
+        print "<td class=\"function_entry_number\">"
+        print num_statements[nfun]
+        print "</td>"
+    }
+    if (num_lines_p)
+    {
+        # Number of lines
+        print "<td class=\"function_entry_number\">"
+        print num_lines[nfun]
+        print "</td>"
+    }
+    if (first_line_p)
+    {
+        # First line
+        print "<td class=\"function_entry_number\">"
+        print first_line[nfun]
+        print "</td>"
+    }
+    if (file_p)
+    {
+        href = ""
+        if (source_file_link_tmpl != "")
+        {
+            # Get href target
+            href = source_file_link_tmpl
+            sub(/%FILENAME%/, file[nfun], href)
+        }
+
+        # Source file
+        print "<td class=\"function_entry_filename\">"
+        if (href != "")
+        {
+            print "<a href=\"" href "\">" file[nfun] "</a>"
+        }
+        else
+        {
+            print file[nfun]
+        }
+
+        print "</td>"
+
+
+        print "</tr>"
+
+        if (mcyclo[nfun] > cyclo_simple_max)
+        {
+            print "<tr>"
+
+            num_columns = 1;
+            if (fname_p) { num_columns++ }
+            if (mcyclo_p) { num_columns++ }
+            if (cyclo_p) { num_columns++ }
+            if (num_statements_p) { num_columns++ }
+            if (num_lines_p) { num_columns++ }
+            if (first_line_p) { num_columns++ }
+            if (file_p) { num_columns++ }
+
+            print "<td colspan=\"" num_columns "\" height=\"0\">"
+            print "<div id=\"" fname "_src\" class=\"function_src\" style=\"position: relative; display: none;\">"
+            print "<pre class=\"function_src\">"
+
+            while ((getline codeline < (fname nfun "_fn.txt")) > 0)
+            {
+                sub(/\\</, "&lt;", codeline)
+                sub(/\\>/, "&gt;", codeline)
+                sub(/&/, "&amp;", codeline)
+
+                print codeline
+            }
+            close(fname nfun "_fn.txt")
+            system("rm " "'" fname "'" nfun "_fn.txt")
+            print "</pre>"
+            print "</div>"
+            print "</td>"
+            print "</tr>"
+        }
+
+    }
+}
+
+function html_global_stats ()
+{
+    print "<div class=\"section_title\">Summary</div>"
+
+    print "<table class=\"summary_table\">"
+    # Total number of functions
+    print "<tr>"
+    print "<td class=\"summary_header_entry\">"
+    print "Total number of functions"
+    print "</td>"
+    print "<td class=\"summary_number_entry\">"
+    print num_of_functions
+    print "</td>"
+    print "</tr>"
+    # Number of simple functions
+    print "<tr>"
+    print "<td class=\"summary_header_entry\">"
+    print "Number of low risk functions"
+    print "</td>"
+    print "<td class=\"summary_number_entry\">"
+    print num_of_simple_functions
+    print "</td>"
+    print "</tr>"
+    # Number of moderate functions
+    print "<tr>"
+    print "<td class=\"summary_header_entry\">"
+    print "Number of moderate risk functions"
+    print "</td>"
+    print "<td class=\"summary_number_entry\">"
+    print num_of_moderate_functions
+    print "</td>"
+    print "</tr>"
+    # Number of high functions
+    print "<tr>"
+    print "<td class=\"summary_header_entry\">"
+    print "Number of high risk functions"
+    print "</td>"
+    print "<td class=\"summary_number_entry\">"
+    print num_of_high_functions
+    print "</td>"
+    print "</tr>"
+    # Number of untestable functions
+    print "<tr>"
+    print "<td class=\"summary_header_entry\">"
+    print "Number of untestable functions"
+    print "</td>"
+    print "<td class=\"summary_number_entry\">"
+    print num_of_untestable_functions
+    print "</td>"
+    print "</tr>"
+    print "</table>"
+    print "<br/>"
+}
+
+function html_function_cyclo ()
+{
+    print "<div class=\"section_title\">Details for all functions</div>"
+
+    print "<table class=\"ranges_table\">"
+    print "<tr>"
+    print "<td class=\"ranges_header_entry\">"
+    print "&nbsp;"
+    print "</td>"
+    print "<td class=\"ranges_header_entry\">"
+    print "Cyclomatic Complexity"
+    print "</td>"
+    print "<td class=\"ranges_header_entry\">"
+    print "Risk Evaluation"
+    print "</td>"
+    print "</tr>"
+    # Simple
+    print "<tr>"
+    print "<td class=\"ranges_entry_simple\">"
+    print "&nbsp;"
+    print "</td>"
+    print "<td class=\"ranges_entry\">"
+    print "0 - " cyclo_simple_max
+    print "</td>"
+    print "<td class=\"ranges_entry\">"
+    print "Simple module, without much risk"
+    print "</td>"
+    print "</tr>"
+    # Moderate
+    print "<tr>"
+    print "<td class=\"ranges_entry_moderate\">"
+    print "&nbsp;"
+    print "</td>"
+    print "<td class=\"ranges_entry\">"
+    print cyclo_simple_max + 1 " - " cyclo_moderate_max
+    print "</td>"
+    print "<td class=\"ranges_entry\">"
+    print "More complex module, moderate risk"
+    print "</td>"
+    print "</tr>"
+    # High
+    print "<tr>"
+    print "<td class=\"ranges_entry_high\">"
+    print "&nbsp;"
+    print "</td>"
+    print "<td class=\"ranges_entry\">"
+    print cyclo_moderate_max + 1 " - " cyclo_high_max
+    print "</td>"
+    print "<td class=\"ranges_entry\">"
+    print "Complex module, high risk"
+    print "</td>"
+    print "</tr>"
+    # Untestable
+    print "<tr>"
+    print "<td class=\"ranges_entry_untestable\">"
+    print "&nbsp;"
+    print "</td>"
+    print "<td class=\"ranges_entry\">"
+    print "greater than " cyclo_high_max
+    print "</td>"
+    print "<td class=\"ranges_entry\">"
+    print "Untestable module, very high risk"
+    print "</td>"
+    print "</tr>"
+    print "</table>"
+    print "<br/>"
+    html_fnc_table_complete("")
+}
+
+function wiki_global_stats ()
+{
+    print "{| class=\"cyclo_summary_table\""
+    # Total number of functions
+    print "|-"
+    print "| class=\"cyclo_summary_header_entry\" | Total number of functions"
+    print "| class=\"cyclo_summary_number_entry\" |" num_of_functions
+    # Number of simple functions
+    print "|-"
+    print "| class=\"cyclo_summary_header_entry\" | Number of low risk functions"
+    print "| class=\"cyclo_summary_number_entry\" |" num_of_simple_functions
+    # Number of moderate functions
+    print "|-"
+    print "| class=\"cyclo_summary_header_entry\" | Number of moderate risk functions"
+    print "| class=\"cyclo_summary_number_entry\" |" num_of_moderate_functions
+    # Number of high functions
+    print "|-"
+    print "| class=\"cyclo_summary_header_entry\" | Number of high risk functions"
+    print "| class=\"cyclo_summary_number_entry\" |" num_of_high_functions
+    # Number of untestable functions
+    print "|-"
+    print "| class=\"cyclo_summary_header_entry\" | Number of untestable functions"
+    print "| class=\"cyclo_summary_number_entry\" |" num_of_untestable_functions
+    print "|}"
+}
+
+function wiki_function_cyclo ()
+{
+    print "==Details for all functions=="
+
+    print "Used ranges:"
+
+    print "{| class =\"cyclo_ranges_table\""
+    print "|-"
+    print "| class=\"cyclo_ranges_header_entry\" | "
+    print "| class=\"cyclo_ranges_header_entry\" | Cyclomatic Complexity"
+    print "| class=\"cyclo_ranges_header_entry\" | Risk Evaluation"
+    # Simple
+    print "|-"
+    print "| class=\"cyclo_ranges_entry_simple\" | "
+    print "| class=\"cyclo_ranges_entry\" | 0 - " cyclo_simple_max
+    print "| class=\"cyclo_ranges_entry\" | Simple module, without much risk"
+    # Moderate
+    print "|-"
+    print "| class=\"cyclo_ranges_entry_moderate\" | "
+    print "| class=\"cyclo_ranges_entry\" |" cyclo_simple_max + 1 " - " cyclo_moderate_max
+    print "| class=\"cyclo_ranges_entry\" | More complex module, moderate risk"
+    # High
+    print "|-"
+    print "| class=\"cyclo_ranges_entry_high\" | "
+    print "| class=\"cyclo_ranges_entry\" |" cyclo_moderate_max + 1 " - " cyclo_high_max
+    print "| class=\"cyclo_ranges_entry\" | Complex module, high risk"
+    # Untestable
+    print "|-"
+    print "| class=\"cyclo_ranges_entry_untestable\" | "
+    print "| class=\"cyclo_ranges_entry\" | greater than " cyclo_high_max
+    print "| class=\"cyclo_ranges_entry\" | Untestable module, very high risk"
+    print "|}"
+
+    print ""
+    print ""
+    wiki_fnc_table_complete("")
+}
+
+function wiki_fnc_table_complete (caption)
+{
+    wiki_fnc_table(caption, 1, 1, 0, 1, 1, 0, 1)
+}
+
+function wiki_fnc_table_abbrev (caption)
+{
+    wiki_fnc_table(caption, 1, 0, 0, 0, 0, 0, 0)
+}
+
+function wiki_fnc_table (caption,
+                         fname_p,
+                         mcyclo_p,
+                         cyclo_p,
+                         num_statements_p,
+                         num_lines_p,
+                         first_line_p,
+                         file_p)
+{
+    print "{| width=\"90%\" class=\"cyclo_function_table\" cellpadding=\"0\" cellspacing=\"0\">"
+    if (caption != "")
+    {
+        print "|+" caption
+    }
+    wiki_fnc_header(fname_p,
+                    mcyclo_p,
+                    cyclo_p,
+                    num_statements_p,
+                    num_lines_p,
+                    first_line_p,
+                    file_p)
+    for (nfnc = 1; nfnc <= nfuncs; nfnc++)
+    {
+        wiki_fnc(nfnc,
+                 fname_p,
+                 mcyclo_p,
+                 cyclo_p,
+                 num_statements_p,
+                 num_lines_p,
+                 first_line_p,
+                 file_p)
+    }
+    print "|}"
+}
+
+function wiki_fnc_header (fname_p,
+                          mcyclo_p,
+                          cyclo_p,
+                          num_statements_p,
+                          num_lines_p,
+                          first_line_p,
+                          file_p)
+{
+    if (fname_p)
+    {
+        # Function name
+        print "! class=\"cyclo_function_table_header_entry\" | Function Name"
+    }
+    if (mcyclo_p)
+    {
+        # Modified cyclo
+        print "! class=\"cyclo_function_table_header_entry\" | Modified Cyclo"
+    }
+    if (cyclo_p)
+    {
+        # Cyclo
+        print "! class=\"cyclo_function_table_header_entry\" | Cyclomatic Complexity"
+    }
+    if (num_statements_p)
+    {
+        print "! class=\"cyclo_function_table_header_entry\" | Number of Statements"
+    }
+    if (num_lines_p)
+    {
+        print "! class=\"cyclo_function_table_header_entry\" | Number of Lines"
+    }
+    if (first_line_p)
+    {
+        print "! class=\"cyclo_function_table_header_entry\" | First Line"
+    }
+    if (file_p)
+    {
+        print "! class=\"cyclo_function_table_header_entry\" | Source File"
+    }
+}
+
+function wiki_fnc (nfnc,
+                   fname_p,
+                   mcyclo_p,
+                   cyclo_p,
+                   num_statements_p,
+                   num_lines_p,
+                   first_line_p,
+                   file_p)
+{
+   fname = fnames[nfnc]
+
+    # Function name
+    trclass = "cyclo_function_entry_simple"
+    if (mcyclo[nfnc] > cyclo_high_max)
+    {
+        trclass="cyclo_function_entry_untestable"
+    }
+    else if (mcyclo[nfnc] > cyclo_moderate_max)
+    {
+        trclass="cyclo_function_entry_high"
+    }
+    else if (mcyclo[nfnc] > cyclo_simple_max)
+    {
+        trclass="cyclo_function_entry_moderate"
+    }
+
+    print "|- class=\"" trclass "\""
+    if (fname_p)
+    {
+        print "| class=\"cyclo_function_entry_name\" |" fname
+    }
+    if (mcyclo_p)
+    {
+        # Modified cyclo
+        print "| class=\"cyclo_function_entry_cyclo\" |" mcyclo[nfnc]
+    }
+    if (cyclo_p)
+    {
+        # Cyclo
+        print "| class=\"cyclo_function_entry_cyclo\" |" cyclo[nfnc]
+    }
+    if (num_statements_p)
+    {
+        # Number of statements
+        print "| class=\"cyclo_function_entry_number\" |" num_statements[nfnc]
+    }
+    if (num_lines_p)
+    {
+        # Number of lines
+        print "| class=\"cyclo_function_entry_number\" |" num_lines[nfnc]
+    }
+    if (first_line_p)
+    {
+        # First line
+        print "| class=\"cyclo_function_entry_number\" |" first_line[nfnc]
+    }
+    if (file_p)
+    {
+        href = ""
+        if (source_file_link_tmpl != "")
+        {
+            # Get href target
+            href = source_file_link_tmpl
+            sub(/%FILENAME%/, file[nfnc], href)
+        }
+
+        # Source file
+        print "| class=\"cyclo_function_entry_filename\" |" \
+            ((href != "") ? "[" href " " file[nfnc] "]" : "[" file[nfnc] "]")
+    }
+}
+
+# Scan data from a line
+{
+    function_name = $7
+
+    nfuncs++;
+    fnames[nfuncs] = function_name
+    mcyclo[nfuncs] = $1
+    cyclo[nfuncs] = $2
+    num_statements[nfuncs] = $3
+    first_line[nfuncs] = $4
+    num_lines[nfuncs] = $5
+
+    # Build the filename from the file_spec ($6)
+    begin_util_path = index($6, cut_dir)
+    tmpfilename = substr($6, begin_util_path + length(cut_dir))
+    sub(/\([0-9]+\):/, "", tmpfilename)
+    file[nfuncs] = tmpfilename
+
+    if (mcyclo[nfuncs] > cyclo_simple_max)
+    {
+        # Extract function contents to a fn_txt file
+        filepath = $6
+
+        sub(/\([0-9]+\):/, "", filepath)
+        num_line = 0
+
+        while ((getline codeline < filepath) > 0)
+        {
+            num_line++;
+            if ((num_line >= first_line[nfuncs]) &&
+                (num_line < first_line[nfuncs] + num_lines[nfuncs]))
+            {
+                print codeline > (function_name nfuncs "_fn.txt")
+            }
+        }
+        close (function_name nfuncs "_fn.txt")
+        close(filepath)
+    }
+
+    # Initial values for statistics variables
+    num_of_functions = 0
+    max_mcyclo = 0
+    max_function_length = 0
+    num_of_simple_functions = 0
+    num_of_moderate_functions = 0
+    num_of_high_functions = 0
+    num_of_untestable_functions = 0
+}
+
+# Epilogue
+END {
+    # Print header (only for html)
+    if (output_lang == "html")
+    {
+        html_header()
+    }
+
+    # Print prolog
+    if ((output_lang == "html") &&
+        (html_prolog != ""))
+    {
+        print html_prolog
+    }
+    if ((output_lang == "wiki") &&
+        (wiki_prolog != ""))
+    {
+        print wiki_prolog
+    }
+
+    if (output_lang == "html")
+    {
+        print "<div class=\"page_title\">" package_name " Cyclomatic Complexity Report</div>"
+        print "<p>Report generated at: <span class=\"report_timestamp\">" strftime() "</div></p>"
+    }
+    if (output_lang == "wiki")
+    {
+        print "==" package_name " Cyclomatic Complexity Report=="
+        print "Report generated at: '''" strftime() "'''"
+    }
+
+    if (section_global_stats_p)
+    {
+        build_stats()
+
+        if (output_lang == "html")
+        {
+            html_global_stats()
+        }
+        if (output_lang == "wiki")
+        {
+            wiki_global_stats()
+        }
+    }
+    if (section_function_cyclo_p)
+    {
+        if (output_lang == "html")
+        {
+            html_function_cyclo()
+        }
+        if (output_lang == "wiki")
+        {
+            wiki_function_cyclo()
+        }
+    }
+
+    # Print epilog
+    if ((output_lang == "html") &&
+        (html_epilog != ""))
+    {
+        print html_epilog
+    }
+    if ((output_lang == "wiki") &&
+        (wiki_epilog != ""))
+    {
+        print wiki_epilog
+    }
+
+    # Print footer (html only)
+    if (output_lang == "html")
+    {
+        html_footer()
+    }
+}
+
+# End of pmccabe2html

+ 81 - 32
configure.ac

@@ -16,7 +16,8 @@
 #
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 
-AC_INIT([StarPU],1.1.0, [starpu-devel@lists.gforge.inria.fr], starpu)
+AC_INIT([StarPU], [1.1.0], [starpu-devel@lists.gforge.inria.fr],
+  [starpu], [http://runtime.bordeaux.inria.fr/StarPU/])
 AC_CONFIG_SRCDIR(include/starpu.h)
 AC_CONFIG_SRCDIR(include/starpu.h)
 AC_CONFIG_AUX_DIR([build-aux])
 AC_CONFIG_AUX_DIR([build-aux])
 
 
@@ -188,6 +189,7 @@ AC_ARG_ENABLE(quick-check, [AS_HELP_STRING([--enable-quick-check],
 if  test x$enable_quick_check = xyes; then
 if  test x$enable_quick_check = xyes; then
 	AC_DEFINE(STARPU_QUICK_CHECK, [1], [enable quick check])
 	AC_DEFINE(STARPU_QUICK_CHECK, [1], [enable quick check])
 fi
 fi
+AM_CONDITIONAL([STARPU_QUICK_CHECK], [test "x$enable_quick_check" = "xyes"])
 
 
 AC_CHECK_HEADERS([malloc.h], [AC_DEFINE([STARPU_HAVE_MALLOC_H], [1], [Define to 1 if you have the <malloc.h> header file.])])
 AC_CHECK_HEADERS([malloc.h], [AC_DEFINE([STARPU_HAVE_MALLOC_H], [1], [Define to 1 if you have the <malloc.h> header file.])])
 
 
@@ -411,7 +413,12 @@ AC_DEFUN([STARPU_CHECK_CUDA],
 	CPPFLAGS="${SAVED_CPPFLAGS}"
 	CPPFLAGS="${SAVED_CPPFLAGS}"
 	unset STARPU_CUDA_LDFLAGS
 	unset STARPU_CUDA_LDFLAGS
     else
     else
-	if test x$starpu_windows != xyes ; then
+	# nvcc is a wrapper around GCC, and calls it with the -dumpspecs
+	# option, which is GCC specific. If $CC does not support -dumpspecs, we
+	# should let nvcc choose another compiler (by default, gcc, if it is
+	# installed). If gcc is not installed, the build will probably fail.
+	$CC -dumpspecs >/dev/null 2>&1
+	if test $? -eq 0 -a x$starpu_windows != xyes; then
 	    NVCCFLAGS="${NVCCFLAGS} -ccbin \${CC}"
 	    NVCCFLAGS="${NVCCFLAGS} -ccbin \${CC}"
 	fi
 	fi
 	if test "$__cuda_include_dir" != "no"; then
 	if test "$__cuda_include_dir" != "no"; then
@@ -803,8 +810,55 @@ AC_DEFINE_UNQUOTED(STARPU_MAXGORDONDEVS, [1], [maximum number of GORDON devices]
 #                                                                             #
 #                                                                             #
 ###############################################################################
 ###############################################################################
 
 
+AC_ARG_ENABLE(opencl-simulator, [AS_HELP_STRING([--enable-opencl-simulator],
+				[Enable the use of an OpenCL simulator])],
+				enable_opencl_simulator=$enableval, enable_opencl_simulator=no)
+if test x$enable_opencl_simulator = xyes; then
+	enable_simgrid=yes
+	AC_DEFINE(STARPU_OPENCL_SIMULATOR, 1, [Define this to enable using an OpenCL simulator])
+fi
+
+AC_ARG_ENABLE(simgrid, [AS_HELP_STRING([--enable-simgrid],
+			[Enable simulating execution in simgrid])],
+			enable_simgrid=$enableval, enable_simgrid=no)
+if test x$enable_simgrid = xyes ; then
+	OLD_CLAGS=$CFLAGS
+	OLD_LDFLAGS=$LDFLAGS
+	if test -n "$SIMGRID_CFLAGS" ; then
+		CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
+	fi
+	if test -n "$SIMGRID_LIBS" ; then
+		LDFLAGS="$SIMGRID_LIBS $LDFLAGS"
+	fi
+	AC_HAVE_LIBRARY([simgrid], [],
+		[
+			AC_MSG_ERROR(Simgrid support needs simgrid installed)
+		]
+	)
+	AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
+		    		[[#include <msg/msg.h>]],
+				[[msg_host_t foo; ]]
+			    )],
+	                 [],
+	                 [
+			   AC_MSG_ERROR(StarPU needs a version of Simgrid which defines the type msg_host_t (should be any version >= 3.8.1))
+		         ])
+	if test -z "$SIMGRID_LIBS" ; then
+		SIMGRID_LIBS=-lsimgrid
+	fi
+	CLAGS=$OLD_CFLAGS
+	LDFLAGS=$OLD_LDFLAGS
+	AC_DEFINE(STARPU_SIMGRID, 1, [Define this to enable simgrid execution])
+	# Avoid the starpu top thread compilation
+	enable_starpu_top=no
+	# We won't bind or detect anything
+	with_hwloc=no
+	# In simgrid, it's much better to let workers block than spinlock
+	enable_blocking=yes
+fi
+
 AC_MSG_CHECKING(whether blocking drivers should be disabled)
 AC_MSG_CHECKING(whether blocking drivers should be disabled)
-AC_ARG_ENABLE(blocking-drivers, [AS_HELP_STRING([--disable-blocking-drivers], [disable blocking drivers])],
+AC_ARG_ENABLE(blocking-drivers, [AS_HELP_STRING([--enable-blocking-drivers], [enable blocking drivers])],
 				enable_blocking=$enableval, enable_blocking=no)
 				enable_blocking=$enableval, enable_blocking=no)
 AC_MSG_RESULT($enable_blocking)
 AC_MSG_RESULT($enable_blocking)
 
 
@@ -826,7 +880,9 @@ AC_MSG_RESULT($enable_debug)
 
 
 if test x$enable_debug = xyes; then
 if test x$enable_debug = xyes; then
 	CFLAGS="$CFLAGS -O0"
 	CFLAGS="$CFLAGS -O0"
-	AC_DEFINE(STARPU_SPINLOCK_CHECK, [1], [check spinlock use])
+	if test x$enable_simgrid != xyes; then
+		AC_DEFINE(STARPU_SPINLOCK_CHECK, [1], [check spinlock use])
+	fi
 else
 else
 	CFLAGS="-O3 $CFLAGS"
 	CFLAGS="-O3 $CFLAGS"
 fi
 fi
@@ -841,16 +897,6 @@ if test x$enable_fast = xyes; then
 	AC_DEFINE(STARPU_NO_ASSERT, [1], [disable assertions])
 	AC_DEFINE(STARPU_NO_ASSERT, [1], [disable assertions])
 fi
 fi
 
 
-AC_MSG_CHECKING(whether memory status should be displayed)
-AC_ARG_ENABLE(memory-status, [AS_HELP_STRING([--enable-memory-status],
-			     [display memory status at the end of execution])],
-			     enable_memory_status=$enableval, enable_memory_status=no)
-AC_MSG_RESULT($enable_memory_status)
-if test x$enable_memory_status = xyes; then
-        AC_DEFINE(STARPU_MEMORY_STATUS, [1], [display memory status])
-fi
-
-
 AC_MSG_CHECKING(whether debug messages should be displayed)
 AC_MSG_CHECKING(whether debug messages should be displayed)
 AC_ARG_ENABLE(verbose, [AS_HELP_STRING([--enable-verbose],
 AC_ARG_ENABLE(verbose, [AS_HELP_STRING([--enable-verbose],
 			[display verbose debug messages])],
 			[display verbose debug messages])],
@@ -860,7 +906,6 @@ if test x$enable_verbose = xyes; then
 	AC_DEFINE(STARPU_VERBOSE, [1], [display verbose debug messages])
 	AC_DEFINE(STARPU_VERBOSE, [1], [display verbose debug messages])
 fi
 fi
 
 
-
 AC_MSG_CHECKING(whether coverage testing should be enabled)
 AC_MSG_CHECKING(whether coverage testing should be enabled)
 AC_ARG_ENABLE(coverage, [AS_HELP_STRING([--enable-coverage],
 AC_ARG_ENABLE(coverage, [AS_HELP_STRING([--enable-coverage],
 			[enable coverage checking])],
 			[enable coverage checking])],
@@ -873,7 +918,6 @@ if test x$enable_coverage = xyes; then
 	LDFLAGS="${LDFLAGS} --coverage"
 	LDFLAGS="${LDFLAGS} --coverage"
 fi
 fi
 
 
-
 # shall we use FxT to generate trace of the execution ?
 # shall we use FxT to generate trace of the execution ?
 AC_MSG_CHECKING(whether FxT traces should be generated)
 AC_MSG_CHECKING(whether FxT traces should be generated)
 AC_ARG_WITH(fxt, [AS_HELP_STRING([--with-fxt[=<dir>]], [generate fxt traces])],
 AC_ARG_WITH(fxt, [AS_HELP_STRING([--with-fxt[=<dir>]], [generate fxt traces])],
@@ -957,15 +1001,25 @@ AC_ARG_ENABLE(stats, [AS_HELP_STRING([--enable-stats],
 			enable_stats=$enableval, enable_stats=no)
 			enable_stats=$enableval, enable_stats=no)
 AC_MSG_RESULT($enable_stats)
 AC_MSG_RESULT($enable_stats)
 AC_SUBST(STATS, $enable_stats)
 AC_SUBST(STATS, $enable_stats)
-AC_SUBST(STARPU_DATA_STATS, $enable_stats)
-
+AC_SUBST(STARPU_ENABLE_STATS, $enable_stats)
 if test x$enable_stats = xyes; then
 if test x$enable_stats = xyes; then
-        AC_DEFINE(STARPU_DATA_STATS, [1], [enable statistics])
+        AC_DEFINE(STARPU_ENABLE_STATS, [1], [enable statistics])
+fi
+
+AC_MSG_CHECKING(whether memory stats should be displayed)
+AC_ARG_ENABLE(memory-stats, [AS_HELP_STRING([--enable-memory-stats],
+			     [enable memory stats])],
+			     enable_memory_stats=$enableval, enable_memory_stats=no)
+AC_MSG_RESULT($enable_memory_stats)
+if test x$enable_memory_stats = xyes; then
+        AC_DEFINE(STARPU_MEMORY_STATS, [1], [enable memory stats])
 fi
 fi
 
 
 AC_CHECK_HEADERS([glpk.h])
 AC_CHECK_HEADERS([glpk.h])
 STARPU_HAVE_LIBRARY(GLPK, [glpk])
 STARPU_HAVE_LIBRARY(GLPK, [glpk])
 AM_CONDITIONAL([STARPU_HAVE_GLPK], [test "x$build_sched_ctx_hypervisor" = "xyes"])
 AM_CONDITIONAL([STARPU_HAVE_GLPK], [test "x$build_sched_ctx_hypervisor" = "xyes"])
+AC_CHECK_HEADERS([Ayudame.h])
+
 ###############################################################################
 ###############################################################################
 #                                                                             #
 #                                                                             #
 #                  Miscellaneous options for StarPU                           #
 #                  Miscellaneous options for StarPU                           #
@@ -1099,7 +1153,7 @@ AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
 # If the user specifically asks for it, or if we are in a developer checkout, we enable mpi check
 # If the user specifically asks for it, or if we are in a developer checkout, we enable mpi check
 AC_ARG_ENABLE(mpi-check, AC_HELP_STRING([--enable-mpi-check], [Enable execution of MPI testcases]))
 AC_ARG_ENABLE(mpi-check, AC_HELP_STRING([--enable-mpi-check], [Enable execution of MPI testcases]))
 running_mpi_check=no
 running_mpi_check=no
-if test -d "$srcdir/.svn" ; then
+if test -d "$srcdir/.svn" -o -d "$srcdir/.git" ; then
     running_mpi_check=yes
     running_mpi_check=yes
 fi
 fi
 if test x$enable_mpi_check = xyes ; then
 if test x$enable_mpi_check = xyes ; then
@@ -1721,16 +1775,9 @@ m4_ifdef([AM_SILENT_RULES],
 # Documentation                          #
 # Documentation                          #
 ##########################################
 ##########################################
 
 
-enable_build_doc=yes
-AC_CHECK_PROGS([CHECK_TEXI2DVI], [texi2dvi], "no")
-if test "$CHECK_TEXI2DVI" == "no" ; then
-    enable_build_doc=no
-else
-    AC_CHECK_PROGS([CHECK_TEX], [tex], "no")
-    if test "$CHECK_TEX" == "no" ; then
-	enable_build_doc=no
-    fi
-fi
+AC_ARG_ENABLE(build-doc, [AS_HELP_STRING([--disable-build-doc],
+			[disable building of documentation])],
+			enable_build_doc=$enableval, enable_build_doc=yes)
 AM_CONDITIONAL(BUILD_DOC, [test x$enable_build_doc != xno])
 AM_CONDITIONAL(BUILD_DOC, [test x$enable_build_doc != xno])
 
 
 ###############################################################################
 ###############################################################################
@@ -1745,7 +1792,7 @@ AC_SUBST([LIBSTARPU_LDFLAGS])
 
 
 LIBSTARPU_LINK=libstarpu-$STARPU_EFFECTIVE_VERSION.la
 LIBSTARPU_LINK=libstarpu-$STARPU_EFFECTIVE_VERSION.la
 if test x$enable_perf_debug = xyes; then
 if test x$enable_perf_debug = xyes; then
-	LIBSTARPU_LINK=".libs/libstarpu-$STARPU_EFFECTIVE_VERSION.a $LIBSTARPU_LDFLAGS $HWLOC_LIBS $STARPU_CUDA_LDFLAGS $STARPU_OPENCL_LDFLAGS"
+	LIBSTARPU_LINK=".libs/libstarpu-$STARPU_EFFECTIVE_VERSION.a $LIBSTARPU_LDFLAGS $HWLOC_LIBS $SIMGRID_LIBS $STARPU_CUDA_LDFLAGS $STARPU_OPENCL_LDFLAGS"
 fi
 fi
 AC_SUBST([LIBSTARPU_LINK])
 AC_SUBST([LIBSTARPU_LINK])
 
 
@@ -1845,6 +1892,8 @@ AC_MSG_NOTICE([
 	       SOCL enabled:  $build_socl
 	       SOCL enabled:  $build_socl
                Scheduler Hypervisor: $build_sched_ctx_hypervisor
                Scheduler Hypervisor: $build_sched_ctx_hypervisor
                SOCL test suite: $run_socl_check
                SOCL test suite: $run_socl_check
+               simgrid enabled:                             $enable_simgrid
+               ayudame enabled:                             $ac_cv_header_Ayudame_h
 ])
 ])
 
 
 if test "$build_socl" = "yes" -a "$run_socl_check" = "no" ; then
 if test "$build_socl" = "yes" -a "$run_socl_check" = "no" ; then
@@ -1855,7 +1904,7 @@ To run the tests, you need to install the OCL implementation of ICD
 and set the variable SOCL_OCL_LIB_OPENCL to the location of the libOpenCL.so.])
 and set the variable SOCL_OCL_LIB_OPENCL to the location of the libOpenCL.so.])
 fi
 fi
 
 
-if test x"$have_valid_hwloc" = xno
+if test x"$have_valid_hwloc" = xno -a "$enable_simgrid" = "no"
 then
 then
   AC_MSG_NOTICE([
   AC_MSG_NOTICE([
 WARNING: hwloc was not enabled.  If the target machine is hyperthreaded the
 WARNING: hwloc was not enabled.  If the target machine is hyperthreaded the

+ 2 - 1
doc/Makefile.am

@@ -38,11 +38,12 @@ starpu_TEXINFOS = chapters/advanced-api.texi \
 	chapters/version.texi \
 	chapters/version.texi \
 	chapters/sched_ctx_hypervisor.texi
 	chapters/sched_ctx_hypervisor.texi
 
 
-MAINTAINERCLEANFILES = starpu.pdf
+MAINTAINERCLEANFILES = starpu.pdf starpu.html
 
 
 EXTRA_DIST = starpu.css
 EXTRA_DIST = starpu.css
 
 
 dist_pdf_DATA = starpu.pdf
 dist_pdf_DATA = starpu.pdf
+dist_html_DATA = starpu.html
 
 
 AM_MAKEINFOHTMLFLAGS = --css-include=$(top_srcdir)/doc/starpu.css --no-headers --no-split
 AM_MAKEINFOHTMLFLAGS = --css-include=$(top_srcdir)/doc/starpu.css --no-headers --no-split
 
 

+ 7 - 8
doc/chapters/advanced-api.texi

@@ -7,15 +7,15 @@
 @c See the file starpu.texi for copying conditions.
 @c See the file starpu.texi for copying conditions.
 
 
 @menu
 @menu
-* Defining a new data interface::  
-* Multiformat Data Interface::  
-* Task Bundles::                
-* Task Lists::                  
-* Using Parallel Tasks::       
+* Defining a new data interface::
+* Multiformat Data Interface::
+* Task Bundles::
+* Task Lists::
+* Using Parallel Tasks::
 * Scheduling Contexts::
 * Scheduling Contexts::
-* Defining a new scheduling policy::  
+* Defining a new scheduling policy::
 * Running drivers::
 * Running drivers::
-* Expert mode::                 
+* Expert mode::
 @end menu
 @end menu
 
 
 @node Defining a new data interface
 @node Defining a new data interface
@@ -892,4 +892,3 @@ Register a progression hook, to be called when workers are idle.
 @deftypefun void starpu_progression_hook_deregister (int @var{hook_id})
 @deftypefun void starpu_progression_hook_deregister (int @var{hook_id})
 Unregister a given progression hook.
 Unregister a given progression hook.
 @end deftypefun
 @end deftypefun
-

+ 18 - 10
doc/chapters/advanced-examples.texi

@@ -9,13 +9,13 @@
 @menu
 @menu
 * Using multiple implementations of a codelet::
 * Using multiple implementations of a codelet::
 * Enabling implementation according to capabilities::
 * Enabling implementation according to capabilities::
-* Task and Worker Profiling::   
+* Task and Worker Profiling::
 * Partitioning Data::
 * Partitioning Data::
-* Performance model example::   
-* Theoretical lower bound on execution time::  
-* Insert Task Utility::          
-* Data reduction::  
-* Temporary buffers::  
+* Performance model example::
+* Theoretical lower bound on execution time::
+* Insert Task Utility::
+* Data reduction::
+* Temporary buffers::
 * Parallel Tasks::
 * Parallel Tasks::
 * Debugging::
 * Debugging::
 * The multiformat interface::
 * The multiformat interface::
@@ -45,7 +45,7 @@ void scal_sse_func(void *buffers[], void *cl_arg)
     __m128 factor __attribute__((aligned(16)));
     __m128 factor __attribute__((aligned(16)));
     factor = _mm_set1_ps(*(float *) cl_arg);
     factor = _mm_set1_ps(*(float *) cl_arg);
 
 
-    unsigned int i;    
+    unsigned int i;
     for (i = 0; i < n_iterations; i++)
     for (i = 0; i < n_iterations; i++)
         VECTOR[i] = _mm_mul_ps(factor, VECTOR[i]);
         VECTOR[i] = _mm_mul_ps(factor, VECTOR[i]);
 @}
 @}
@@ -232,7 +232,7 @@ starpu_vector_data_register(&handle, 0, (uintptr_t)vector,
                             NX, sizeof(vector[0]));
                             NX, sizeof(vector[0]));
 
 
 /* Partition the vector in PARTS sub-vectors */
 /* Partition the vector in PARTS sub-vectors */
-starpu_filter f =
+starpu_data_filter f =
 @{
 @{
     .filter_func = starpu_block_filter_func_vector,
     .filter_func = starpu_block_filter_func_vector,
     .nchildren = PARTS
     .nchildren = PARTS
@@ -456,8 +456,10 @@ solve it immediately and get the optimized minimum, in ms. Its @code{integer}
 parameter allows to decide whether integer resolution should be computed
 parameter allows to decide whether integer resolution should be computed
 and returned too.
 and returned too.
 
 
-The @code{deps} parameter tells StarPU whether to take tasks and implicit data
-dependencies into account. It must be understood that the linear programming
+The @code{deps} parameter tells StarPU whether to take tasks, implicit data, and tag
+dependencies into account. Tags released in a callback or similar
+are not taken into account, only tags associated with a task are.
+It must be understood that the linear programming
 problem size is quadratic with the number of tasks and thus the time to solve it
 problem size is quadratic with the number of tasks and thus the time to solve it
 will be very long, it could be minutes for just a few dozen tasks. You should
 will be very long, it could be minutes for just a few dozen tasks. You should
 probably use @code{lp_solve -timeout 1 test.pl -wmps test.mps} to convert the
 probably use @code{lp_solve -timeout 1 test.pl -wmps test.mps} to convert the
@@ -469,6 +471,10 @@ of @code{lp_solve}. For instance, we often just use
 @code{lp_solve -cc -B1 -Bb -Bg -Bp -Bf -Br -BG -Bd -Bs -BB -Bo -Bc -Bi} , and
 @code{lp_solve -cc -B1 -Bb -Bg -Bp -Bf -Br -BG -Bd -Bs -BB -Bo -Bc -Bi} , and
 the @code{-gr} option can also be quite useful.
 the @code{-gr} option can also be quite useful.
 
 
+Data transfer time can only be taken into account when @code{deps} is set. Only
+data transfers inferred from implicit data dependencies between tasks are taken
+into account.
+
 Setting @code{deps} to 0 will only take into account the actual computations
 Setting @code{deps} to 0 will only take into account the actual computations
 on processing units. It however still properly takes into account the varying
 on processing units. It however still properly takes into account the varying
 performances of kernels and processing units, which is quite more accurate than
 performances of kernels and processing units, which is quite more accurate than
@@ -965,6 +971,8 @@ gdb helpers are also provided to show the whole StarPU state:
 (gdb) help starpu
 (gdb) help starpu
 @end smallexample
 @end smallexample
 
 
+The Temanejo task debugger can also be used, see @ref{Task debugger}.
+
 @node The multiformat interface
 @node The multiformat interface
 @section The multiformat interface
 @section The multiformat interface
 It may be interesting to represent the same piece of data using two different
 It may be interesting to represent the same piece of data using two different

+ 42 - 14
doc/chapters/basic-api.texi

@@ -161,17 +161,17 @@ it is therefore necessary to disable asynchronous data transfers.
 This can also be specified at compilation time by giving to the
 This can also be specified at compilation time by giving to the
 configure script the option @code{--disable-asynchronous-copy}.
 configure script the option @code{--disable-asynchronous-copy}.
 
 
-@item @code{int disable_cuda_asynchronous_copy} (default = 0)
+@item @code{int disable_asynchronous_cuda_copy} (default = 0)
 This flag should be set to 1 to disable asynchronous copies between
 This flag should be set to 1 to disable asynchronous copies between
 CPUs and CUDA accelerators. This can also be specified with the
 CPUs and CUDA accelerators. This can also be specified with the
-@code{STARPU_DISABLE_CUDA_ASYNCHRONOUS_COPY} environment variable.
+@code{STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY} environment variable.
 This can also be specified at compilation time by giving to the
 This can also be specified at compilation time by giving to the
 configure script the option @code{--disable-asynchronous-cuda-copy}.
 configure script the option @code{--disable-asynchronous-cuda-copy}.
 
 
-@item @code{int disable_opencl_asynchronous_copy} (default = 0)
+@item @code{int disable_asynchronous_opencl_copy} (default = 0)
 This flag should be set to 1 to disable asynchronous copies between
 This flag should be set to 1 to disable asynchronous copies between
 CPUs and OpenCL accelerators. This can also be specified with the
 CPUs and OpenCL accelerators. This can also be specified with the
-@code{STARPU_DISABLE_OPENCL_ASYNCHRONOUS_COPY} environment variable.
+@code{STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY} environment variable.
 The AMD implementation of OpenCL is known to
 The AMD implementation of OpenCL is known to
 fail when copying data asynchronously. When using this implementation,
 fail when copying data asynchronously. When using this implementation,
 it is therefore necessary to disable asynchronous data transfers.
 it is therefore necessary to disable asynchronous data transfers.
@@ -592,6 +592,7 @@ available on the given memory node instead of main memory.
 @menu
 @menu
 * Registering Data::
 * Registering Data::
 * Accessing Data Interfaces::
 * Accessing Data Interfaces::
+* Defining Interface::
 @end menu
 @end menu
 
 
 @node Registering Data
 @node Registering Data
@@ -771,7 +772,8 @@ The function also sets @var{count} to the size of the data handle by calling
 Unpack in @var{handle} the data located at @var{ptr} of size
 Unpack in @var{handle} the data located at @var{ptr} of size
 @var{count} as described by the interface of the data. The interface
 @var{count} as described by the interface of the data. The interface
 registered at @var{handle} must define a unpacking operation
 registered at @var{handle} must define a unpacking operation
-(@pxref{struct starpu_data_interface_ops}).
+(@pxref{struct starpu_data_interface_ops}). The memory at the address @code{ptr}
+is freed after calling the data unpacking operation.
 @end deftypefun
 @end deftypefun
 
 
 @node Accessing Variable Data Interfaces
 @node Accessing Variable Data Interfaces
@@ -1027,7 +1029,7 @@ Return a pointer to the non-zero values of the matrix designated by @var{interfa
 
 
 @defmac STARPU_BCSR_GET_NZVAL_DEV_HANDLE ({void *}@var{interface})
 @defmac STARPU_BCSR_GET_NZVAL_DEV_HANDLE ({void *}@var{interface})
 Return a device handle for the array of non-zero values in the matrix designated
 Return a device handle for the array of non-zero values in the matrix designated
-by @var{interface}. The offset documented below has to be used in addition to 
+by @var{interface}. The offset documented below has to be used in addition to
 this.
 this.
 @end defmac
 @end defmac
 
 
@@ -1102,7 +1104,7 @@ Return a pointer to the non-zero values of the matrix designated by @var{interfa
 
 
 @defmac STARPU_CSR_GET_NZVAL_DEV_HANDLE ({void *}@var{interface})
 @defmac STARPU_CSR_GET_NZVAL_DEV_HANDLE ({void *}@var{interface})
 Return a device handle for the array of non-zero values in the matrix designated
 Return a device handle for the array of non-zero values in the matrix designated
-by @var{interface}. The offset documented below has to be used in addition to 
+by @var{interface}. The offset documented below has to be used in addition to
 this.
 this.
 @end defmac
 @end defmac
 
 
@@ -1189,6 +1191,21 @@ Return the size of the elements registered into the matrix designated by
 @var{interface}.
 @var{interface}.
 @end defmac
 @end defmac
 
 
+@node Defining Interface
+@subsection Defining Interface
+
+Applications can provide their own interface. An example is provided in
+@code{examples/interface}. A few helpers are provided.
+
+@deftypefun uintptr_t starpu_allocate_buffer_on_node (uint32_t @var{dst_node}, size_t @var{size})
+Allocate @var{size} bytes on node @var{dst_node}. This returns 0 if allocation
+failed, the allocation method should then return -ENOMEM as allocated size.
+@end deftypefun
+
+@deftypefun void starpu_free_buffer_on_node (uint32_t @var{dst_node}, uintptr_t @var{data}, size_t @var{size})
+Free @var{data} of @var{size} bytes on node @var{dst_node}.
+@end deftypefun
+
 @node Data Partition
 @node Data Partition
 @section Data Partition
 @section Data Partition
 
 
@@ -1327,7 +1344,7 @@ vector represented by @var{father_interface} once partitioned in
 @deftypefun void starpu_block_shadow_filter_func_vector (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
 @deftypefun void starpu_block_shadow_filter_func_vector (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
 Return in @code{*@var{child_interface}} the @var{id}th element of the
 Return in @code{*@var{child_interface}} the @var{id}th element of the
 vector represented by @var{father_interface} once partitioned in
 vector represented by @var{father_interface} once partitioned in
-@var{nparts} chunks of equal size with a shadow border @code{filter_arg_ptr}, thus getting a vector of size (n-2*shadow)/nparts+2*shadow 
+@var{nparts} chunks of equal size with a shadow border @code{filter_arg_ptr}, thus getting a vector of size (n-2*shadow)/nparts+2*shadow
 
 
 The @code{filter_arg_ptr} field must be the shadow size casted into @code{void*}.
 The @code{filter_arg_ptr} field must be the shadow size casted into @code{void*}.
 
 
@@ -1527,7 +1544,7 @@ e.g. static storage case.
 @item @code{uint32_t where} (optional)
 @item @code{uint32_t where} (optional)
 Indicates which types of processing units are able to execute the
 Indicates which types of processing units are able to execute the
 codelet. The different values
 codelet. The different values
-@code{STARPU_CPU}, @code{STARPU_CUDA}, 
+@code{STARPU_CPU}, @code{STARPU_CUDA},
 @code{STARPU_OPENCL} can be combined to specify
 @code{STARPU_OPENCL} can be combined to specify
 on which types of processing units the codelet can be executed.
 on which types of processing units the codelet can be executed.
 @code{STARPU_CPU|STARPU_CUDA} for instance indicates that the codelet is
 @code{STARPU_CPU|STARPU_CUDA} for instance indicates that the codelet is
@@ -2405,11 +2422,21 @@ Converts the given timespec @var{ts} into microseconds.
 @end deftypefun
 @end deftypefun
 
 
 @deftypefun void starpu_bus_profiling_helper_display_summary (void)
 @deftypefun void starpu_bus_profiling_helper_display_summary (void)
-Displays statistics about the bus on stderr.
+Displays statistics about the bus on stderr if the environment
+variable @code{STARPU_BUS_STATS} is defined. The function is called
+automatically by @code{starpu_shutdown()}.
 @end deftypefun
 @end deftypefun
 
 
 @deftypefun void starpu_worker_profiling_helper_display_summary (void)
 @deftypefun void starpu_worker_profiling_helper_display_summary (void)
-Displays statistics about the workers on stderr.
+Displays statistics about the workers on stderr if the environment
+variable @code{STARPU_WORKER_STATS} is defined. The function is called
+automatically by @code{starpu_shutdown()}.
+@end deftypefun
+
+@deftypefun void starpu_display_memory_stats ()
+Display statistics about the current data handles registered within
+StarPU. StarPU must have been configured with the option
+@code{--enable-memory-stats} (@pxref{Memory feedback}).
 @end deftypefun
 @end deftypefun
 
 
 @node CUDA extensions
 @node CUDA extensions
@@ -2568,7 +2595,8 @@ use (e.g. different programs on the different OpenCL devices, for
 relocation purpose for instance).
 relocation purpose for instance).
 
 
 @deftp {Data Type} {struct starpu_opencl_program}
 @deftp {Data Type} {struct starpu_opencl_program}
-Stores the OpenCL programs as compiled for the different OpenCL devices.
+Stores the OpenCL programs as compiled for the different OpenCL
+devices. The different fields are:
 @table @asis
 @table @asis
 @item @code{cl_program programs[STARPU_MAXOPENCLDEVS]}
 @item @code{cl_program programs[STARPU_MAXOPENCLDEVS]}
 Stores each program for each OpenCL device.
 Stores each program for each OpenCL device.
@@ -2589,10 +2617,11 @@ This function unloads an OpenCL compiled code.
 @end deftypefun
 @end deftypefun
 
 
 @deftypefun void starpu_opencl_load_program_source ({const char *}@var{source_file_name}, char *@var{located_file_name}, char *@var{located_dir_name}, char *@var{opencl_program_source})
 @deftypefun void starpu_opencl_load_program_source ({const char *}@var{source_file_name}, char *@var{located_file_name}, char *@var{located_dir_name}, char *@var{opencl_program_source})
+@anchor{starpu_opencl_load_program_source}
 Store the contents of the file @var{source_file_name} in the buffer
 Store the contents of the file @var{source_file_name} in the buffer
 @var{opencl_program_source}. The file @var{source_file_name} can be
 @var{opencl_program_source}. The file @var{source_file_name} can be
 located in the current directory, or in the directory specified by the
 located in the current directory, or in the directory specified by the
-environment variable @code{STARPU_OPENCL_PROGRAM_DIR}, or in the
+environment variable @code{STARPU_OPENCL_PROGRAM_DIR} (@pxref{STARPU_OPENCL_PROGRAM_DIR}), or in the
 directory @code{share/starpu/opencl} of the installation directory of
 directory @code{share/starpu/opencl} of the installation directory of
 StarPU, or in the source directory of StarPU.
 StarPU, or in the source directory of StarPU.
 When the file is found, @code{located_file_name} is the full name of
 When the file is found, @code{located_file_name} is the full name of
@@ -2743,4 +2772,3 @@ This function blocks until the function has been executed on every appropriate
 processing units, so that it may not be called from a callback function for
 processing units, so that it may not be called from a callback function for
 instance.
 instance.
 @end deftypefun
 @end deftypefun
-

+ 16 - 16
doc/chapters/basic-examples.texi

@@ -7,10 +7,10 @@
 @c See the file starpu.texi for copying conditions.
 @c See the file starpu.texi for copying conditions.
 
 
 @menu
 @menu
-* Compiling and linking options::  
+* Compiling and linking options::
 * Hello World::                 Submitting Tasks
 * Hello World::                 Submitting Tasks
-* Vector Scaling Using the C Extension::  
-* Vector Scaling Using StarPu's API::  
+* Vector Scaling Using the C Extension::
+* Vector Scaling Using StarPu's API::
 * Vector Scaling on an Hybrid CPU/GPU Machine::  Handling Heterogeneous Architectures
 * Vector Scaling on an Hybrid CPU/GPU Machine::  Handling Heterogeneous Architectures
 @end menu
 @end menu
 
 
@@ -52,8 +52,8 @@ to StarPU. You can either use the StarPU C extension (@pxref{C
 Extensions}) or directly use the StarPU's API.
 Extensions}) or directly use the StarPU's API.
 
 
 @menu
 @menu
-* Hello World using the C Extension::  
-* Hello World using StarPU's API::  
+* Hello World using the C Extension::
+* Hello World using StarPU's API::
 @end menu
 @end menu
 
 
 @node Hello World using the C Extension
 @node Hello World using the C Extension
@@ -116,10 +116,10 @@ The remainder of this section shows how to achieve the same result using
 StarPU's standard C API.
 StarPU's standard C API.
 
 
 @menu
 @menu
-* Required Headers::            
-* Defining a Codelet::          
-* Submitting a Task::           
-* Execution of Hello World::    
+* Required Headers::
+* Defining a Codelet::
+* Submitting a Task::
+* Execution of Hello World::
 @end menu
 @end menu
 
 
 @node Required Headers
 @node Required Headers
@@ -306,8 +306,8 @@ example using StarPU's API is given in the next sections.
 
 
 
 
 @menu
 @menu
-* Adding an OpenCL Task Implementation::  
-* Adding a CUDA Task Implementation::  
+* Adding an OpenCL Task Implementation::
+* Adding a CUDA Task Implementation::
 @end menu
 @end menu
 
 
 The simplest way to get started writing StarPU programs is using the C
 The simplest way to get started writing StarPU programs is using the C
@@ -576,7 +576,7 @@ this example is given in @ref{Full source code for the 'Scaling a
 Vector' example}.
 Vector' example}.
 
 
 @menu
 @menu
-* Source Code of Vector Scaling::  
+* Source Code of Vector Scaling::
 * Execution of Vector Scaling::  Running the program
 * Execution of Vector Scaling::  Running the program
 @end menu
 @end menu
 
 
@@ -701,10 +701,10 @@ Contrary to the previous examples, the task submitted in this example may not
 only be executed by the CPUs, but also by a CUDA device.
 only be executed by the CPUs, but also by a CUDA device.
 
 
 @menu
 @menu
-* Definition of the CUDA Kernel::  
-* Definition of the OpenCL Kernel::  
-* Definition of the Main Code::  
-* Execution of Hybrid Vector Scaling::  
+* Definition of the CUDA Kernel::
+* Definition of the OpenCL Kernel::
+* Definition of the Main Code::
+* Execution of Hybrid Vector Scaling::
 @end menu
 @end menu
 
 
 @node Definition of the CUDA Kernel
 @node Definition of the CUDA Kernel

+ 1 - 1
doc/chapters/benchmarks.texi

@@ -6,7 +6,7 @@
 
 
 @menu
 @menu
 * Task size overhead::           Overhead of tasks depending on their size
 * Task size overhead::           Overhead of tasks depending on their size
-* Data transfer latency::        Latency of data transfers 
+* Data transfer latency::        Latency of data transfers
 * Gemm::                         Matrix-matrix multiplication
 * Gemm::                         Matrix-matrix multiplication
 * Cholesky::                     Cholesky factorization
 * Cholesky::                     Cholesky factorization
 * LU::                           LU factorization
 * LU::                           LU factorization

+ 234 - 109
doc/chapters/configuration.texi

@@ -17,24 +17,28 @@
 The following arguments can be given to the @code{configure} script.
 The following arguments can be given to the @code{configure} script.
 
 
 @menu
 @menu
-* Common configuration::        
-* Configuring workers::         
-* Extension configuration::     
-* Advanced configuration::      
+* Common configuration::
+* Configuring workers::
+* Extension configuration::
+* Advanced configuration::
 @end menu
 @end menu
 
 
 @node Common configuration
 @node Common configuration
 @subsection Common configuration
 @subsection Common configuration
 
 
-@table @code
 
 
-@item --enable-debug
+@defvr {Configure option} --enable-debug
 Enable debugging messages.
 Enable debugging messages.
+@end defvr
 
 
-@item --enable-fast
+@defvr {Configure option} --enable-fast
 Disable assertion checks, which saves computation time.
 Disable assertion checks, which saves computation time.
+@end defvr
 
 
-@item --enable-verbose
+@defvr {Configure option} --enable-verbose
 Increase the verbosity of the debugging messages.  This can be disabled
 Increase the verbosity of the debugging messages.  This can be disabled
 at runtime by setting the environment variable @code{STARPU_SILENT} to
 at runtime by setting the environment variable @code{STARPU_SILENT} to
 any value.
 any value.
@@ -42,25 +46,35 @@ any value.
 @smallexample
 @smallexample
 % STARPU_SILENT=1 ./vector_scal
 % STARPU_SILENT=1 ./vector_scal
 @end smallexample
 @end smallexample
+@end defvr
 
 
-@item --enable-coverage
+@defvr {Configure option} --enable-coverage
 Enable flags for the @code{gcov} coverage tool.
 Enable flags for the @code{gcov} coverage tool.
+@end defvr
 
 
-@item --enable-quick-check
+@defvr {Configure option} --enable-quick-check
 Specify tests and examples should be run on a smaller data set, i.e
 Specify tests and examples should be run on a smaller data set, i.e
 allowing a faster execution time
 allowing a faster execution time
+@end defvr
 
 
-@item --with-hwloc
+@defvr {Configure option} --with-hwloc
 Specify hwloc should be used by StarPU. hwloc should be found by the
 Specify hwloc should be used by StarPU. hwloc should be found by the
 means of the tools @code{pkg-config}.
 means of the tools @code{pkg-config}.
+@end defvr
 
 
-@item --with-hwloc=@var{prefix}
+@defvr {Configure option} --with-hwloc=@var{prefix}
 Specify hwloc should be used by StarPU. hwloc should be found in the
 Specify hwloc should be used by StarPU. hwloc should be found in the
 directory specified by @var{prefix}.
 directory specified by @var{prefix}.
+@end defvr
 
 
-@item --without-hwloc
+@defvr {Configure option} --without-hwloc
 Specify hwloc should not be used by StarPU.
 Specify hwloc should not be used by StarPU.
-@end table
+@end defvr
+
+@defvr {Configure option} --disable-build-doc
+Disable the creation of the documentation. This should be done on a
+machine which does not have the tools @code{makeinfo} and @code{tex}.
+@end defvr
 
 
 Additionally, the @command{configure} script recognize many variables, which
 Additionally, the @command{configure} script recognize many variables, which
 can be listed by typing @code{./configure --help}. For example,
 can be listed by typing @code{./configure --help}. For example,
@@ -70,183 +84,227 @@ CUDA kernels.
 @node Configuring workers
 @node Configuring workers
 @subsection Configuring workers
 @subsection Configuring workers
 
 
-@table @code
-
-@item --enable-maxcpus=@var{count}
+@defvr {Configure option} --enable-maxcpus=@var{count}
 Use at most @var{count} CPU cores.  This information is then
 Use at most @var{count} CPU cores.  This information is then
 available as the @code{STARPU_MAXCPUS} macro.
 available as the @code{STARPU_MAXCPUS} macro.
+@end defvr
 
 
-@item --disable-cpu
+@defvr {Configure option} --disable-cpu
 Disable the use of CPUs of the machine. Only GPUs etc. will be used.
 Disable the use of CPUs of the machine. Only GPUs etc. will be used.
+@end defvr
 
 
-@item --enable-maxcudadev=@var{count}
+@defvr {Configure option} --enable-maxcudadev=@var{count}
 Use at most @var{count} CUDA devices.  This information is then
 Use at most @var{count} CUDA devices.  This information is then
 available as the @code{STARPU_MAXCUDADEVS} macro.
 available as the @code{STARPU_MAXCUDADEVS} macro.
+@end defvr
 
 
-@item --disable-cuda
+@defvr {Configure option} --disable-cuda
 Disable the use of CUDA, even if a valid CUDA installation was detected.
 Disable the use of CUDA, even if a valid CUDA installation was detected.
+@end defvr
 
 
-@item --with-cuda-dir=@var{prefix}
+@defvr {Configure option} --with-cuda-dir=@var{prefix}
 Search for CUDA under @var{prefix}, which should notably contain
 Search for CUDA under @var{prefix}, which should notably contain
 @file{include/cuda.h}.
 @file{include/cuda.h}.
+@end defvr
 
 
-@item --with-cuda-include-dir=@var{dir}
+@defvr {Configure option} --with-cuda-include-dir=@var{dir}
 Search for CUDA headers under @var{dir}, which should
 Search for CUDA headers under @var{dir}, which should
 notably contain @code{cuda.h}. This defaults to @code{/include} appended to the
 notably contain @code{cuda.h}. This defaults to @code{/include} appended to the
 value given to @code{--with-cuda-dir}.
 value given to @code{--with-cuda-dir}.
+@end defvr
 
 
-@item --with-cuda-lib-dir=@var{dir}
+@defvr {Configure option} --with-cuda-lib-dir=@var{dir}
 Search for CUDA libraries under @var{dir}, which should notably contain
 Search for CUDA libraries under @var{dir}, which should notably contain
 the CUDA shared libraries---e.g., @file{libcuda.so}.  This defaults to
 the CUDA shared libraries---e.g., @file{libcuda.so}.  This defaults to
 @code{/lib} appended to the value given to @code{--with-cuda-dir}.
 @code{/lib} appended to the value given to @code{--with-cuda-dir}.
+@end defvr
 
 
-@item --disable-cuda-memcpy-peer
+@defvr {Configure option} --disable-cuda-memcpy-peer
 Explicitly disable peer transfers when using CUDA 4.0.
 Explicitly disable peer transfers when using CUDA 4.0.
+@end defvr
 
 
-@item --enable-maxopencldev=@var{count}
+@defvr {Configure option} --enable-maxopencldev=@var{count}
 Use at most @var{count} OpenCL devices.  This information is then
 Use at most @var{count} OpenCL devices.  This information is then
 available as the @code{STARPU_MAXOPENCLDEVS} macro.
 available as the @code{STARPU_MAXOPENCLDEVS} macro.
+@end defvr
 
 
-@item --disable-opencl
+@defvr {Configure option} --disable-opencl
 Disable the use of OpenCL, even if the SDK is detected.
 Disable the use of OpenCL, even if the SDK is detected.
+@end defvr
 
 
-@item --with-opencl-dir=@var{prefix}
+@defvr {Configure option} --with-opencl-dir=@var{prefix}
 Search for an OpenCL implementation under @var{prefix}, which should
 Search for an OpenCL implementation under @var{prefix}, which should
 notably contain @file{include/CL/cl.h} (or @file{include/OpenCL/cl.h} on
 notably contain @file{include/CL/cl.h} (or @file{include/OpenCL/cl.h} on
 Mac OS).
 Mac OS).
+@end defvr
 
 
-@item --with-opencl-include-dir=@var{dir}
+@defvr {Configure option} --with-opencl-include-dir=@var{dir}
 Search for OpenCL headers under @var{dir}, which should notably contain
 Search for OpenCL headers under @var{dir}, which should notably contain
 @file{CL/cl.h} (or @file{OpenCL/cl.h} on Mac OS).  This defaults to
 @file{CL/cl.h} (or @file{OpenCL/cl.h} on Mac OS).  This defaults to
 @code{/include} appended to the value given to @code{--with-opencl-dir}.
 @code{/include} appended to the value given to @code{--with-opencl-dir}.
+@end defvr
 
 
-@item --with-opencl-lib-dir=@var{dir}
+@defvr {Configure option} --with-opencl-lib-dir=@var{dir}
 Search for an OpenCL library under @var{dir}, which should notably
 Search for an OpenCL library under @var{dir}, which should notably
 contain the OpenCL shared libraries---e.g. @file{libOpenCL.so}. This defaults to
 contain the OpenCL shared libraries---e.g. @file{libOpenCL.so}. This defaults to
 @code{/lib} appended to the value given to @code{--with-opencl-dir}.
 @code{/lib} appended to the value given to @code{--with-opencl-dir}.
+@end defvr
 
 
-@item --enable-maximplementations=@var{count}
+@defvr {Configure option} --enable-opencl-simulator
+Enable considering the provided OpenCL implementation as a simulator, i.e. use
+the kernel duration returned by OpenCL profiling information as wallclock time
+instead of the actual measured real time. This requires simgrid support.
+@end defvr
+
+@defvr {Configure option} --enable-maximplementations=@var{count}
 Allow for at most @var{count} codelet implementations for the same
 Allow for at most @var{count} codelet implementations for the same
 target device.  This information is then available as the
 target device.  This information is then available as the
 @code{STARPU_MAXIMPLEMENTATIONS} macro.
 @code{STARPU_MAXIMPLEMENTATIONS} macro.
+@end defvr
 
 
-@item ----enable-max-sched-ctxs=@var{count}
+@defvr {Configure option} --enable-max-sched-ctxs=@var{count}
 Allow for at most @var{count} scheduling contexts
 Allow for at most @var{count} scheduling contexts
 This information is then available as the
 This information is then available as the
 @code{STARPU_NMAX_SCHED_CTXS} macro.
 @code{STARPU_NMAX_SCHED_CTXS} macro.
+@end defvr
 
 
-@item --disable-asynchronous-copy
+@defvr {Configure option} --disable-asynchronous-copy
 Disable asynchronous copies between CPU and GPU devices.
 Disable asynchronous copies between CPU and GPU devices.
 The AMD implementation of OpenCL is known to
 The AMD implementation of OpenCL is known to
 fail when copying data asynchronously. When using this implementation,
 fail when copying data asynchronously. When using this implementation,
 it is therefore necessary to disable asynchronous data transfers.
 it is therefore necessary to disable asynchronous data transfers.
+@end defvr
 
 
-@item --disable-asynchronous-cuda-copy
+@defvr {Configure option} --disable-asynchronous-cuda-copy
 Disable asynchronous copies between CPU and CUDA devices.
 Disable asynchronous copies between CPU and CUDA devices.
+@end defvr
 
 
-@item --disable-asynchronous-opencl-copy
+@defvr {Configure option} --disable-asynchronous-opencl-copy
 Disable asynchronous copies between CPU and OpenCL devices.
 Disable asynchronous copies between CPU and OpenCL devices.
 The AMD implementation of OpenCL is known to
 The AMD implementation of OpenCL is known to
 fail when copying data asynchronously. When using this implementation,
 fail when copying data asynchronously. When using this implementation,
 it is therefore necessary to disable asynchronous data transfers.
 it is therefore necessary to disable asynchronous data transfers.
-@end table
+@end defvr
 
 
 @node Extension configuration
 @node Extension configuration
 @subsection Extension configuration
 @subsection Extension configuration
 
 
-@table @code
-
-@item --disable-socl
+@defvr {Configure option} --disable-socl
 Disable the SOCL extension (@pxref{SOCL OpenCL Extensions}).  By
 Disable the SOCL extension (@pxref{SOCL OpenCL Extensions}).  By
 default, it is enabled when an OpenCL implementation is found.
 default, it is enabled when an OpenCL implementation is found.
+@end defvr
 
 
-@item --disable-starpu-top
+@defvr {Configure option} --disable-starpu-top
 Disable the StarPU-Top interface (@pxref{StarPU-Top}).  By default, it
 Disable the StarPU-Top interface (@pxref{StarPU-Top}).  By default, it
 is enabled when the required dependencies are found.
 is enabled when the required dependencies are found.
+@end defvr
 
 
-@item --disable-gcc-extensions
+@defvr {Configure option} --disable-gcc-extensions
 Disable the GCC plug-in (@pxref{C Extensions}).  By default, it is
 Disable the GCC plug-in (@pxref{C Extensions}).  By default, it is
 enabled when the GCC compiler provides a plug-in support.
 enabled when the GCC compiler provides a plug-in support.
+@end defvr
 
 
-@item --with-mpicc=@var{path}
+@defvr {Configure option} --with-mpicc=@var{path}
 Use the @command{mpicc} compiler at @var{path}, for starpumpi
 Use the @command{mpicc} compiler at @var{path}, for starpumpi
 (@pxref{StarPU MPI support}).
 (@pxref{StarPU MPI support}).
-
-@end table
+@end defvr
 
 
 @node Advanced configuration
 @node Advanced configuration
 @subsection Advanced configuration
 @subsection Advanced configuration
 
 
-@table @code
-
-@item --enable-perf-debug
+@defvr {Configure option} --enable-perf-debug
 Enable performance debugging through gprof.
 Enable performance debugging through gprof.
+@end defvr
 
 
-@item --enable-model-debug
+@defvr {Configure option} --enable-model-debug
 Enable performance model debugging.
 Enable performance model debugging.
+@end defvr
 
 
-@item --enable-stats
+@defvr {Configure option} --enable-stats
 @c see ../../src/datawizard/datastats.c
 @c see ../../src/datawizard/datastats.c
-Enable gathering of memory transfer statistics.
+Enable gathering of various data statistics (@pxref{Data statistics}).
+@end defvr
 
 
-@item --enable-maxbuffers
+@defvr {Configure option} --enable-maxbuffers
 Define the maximum number of buffers that tasks will be able to take
 Define the maximum number of buffers that tasks will be able to take
 as parameters, then available as the @code{STARPU_NMAXBUFS} macro.
 as parameters, then available as the @code{STARPU_NMAXBUFS} macro.
+@end defvr
 
 
-@item --enable-allocation-cache
+@defvr {Configure option} --enable-allocation-cache
 Enable the use of a data allocation cache to avoid the cost of it with
 Enable the use of a data allocation cache to avoid the cost of it with
 CUDA. Still experimental.
 CUDA. Still experimental.
+@end defvr
 
 
-@item --enable-opengl-render
+@defvr {Configure option} --enable-opengl-render
 Enable the use of OpenGL for the rendering of some examples.
 Enable the use of OpenGL for the rendering of some examples.
 @c TODO: rather default to enabled when detected
 @c TODO: rather default to enabled when detected
+@end defvr
 
 
-@item --enable-blas-lib
+@defvr {Configure option} --enable-blas-lib
 Specify the blas library to be used by some of the examples. The
 Specify the blas library to be used by some of the examples. The
 library has to be 'atlas' or 'goto'.
 library has to be 'atlas' or 'goto'.
+@end defvr
 
 
-@item --disable-starpufft
+@defvr {Configure option} --disable-starpufft
 Disable the build of libstarpufft, even if fftw or cuFFT is available.
 Disable the build of libstarpufft, even if fftw or cuFFT is available.
+@end defvr
 
 
-@item --with-magma=@var{prefix}
+@defvr {Configure option} --with-magma=@var{prefix}
 Search for MAGMA under @var{prefix}.  @var{prefix} should notably
 Search for MAGMA under @var{prefix}.  @var{prefix} should notably
 contain @file{include/magmablas.h}.
 contain @file{include/magmablas.h}.
+@end defvr
 
 
-@item --with-fxt=@var{prefix}
+@defvr {Configure option} --with-fxt=@var{prefix}
 Search for FxT under @var{prefix}.
 Search for FxT under @var{prefix}.
 @url{http://savannah.nongnu.org/projects/fkt, FxT} is used to generate
 @url{http://savannah.nongnu.org/projects/fkt, FxT} is used to generate
 traces of scheduling events, which can then be rendered them using ViTE
 traces of scheduling events, which can then be rendered them using ViTE
 (@pxref{Off-line, off-line performance feedback}).  @var{prefix} should
 (@pxref{Off-line, off-line performance feedback}).  @var{prefix} should
 notably contain @code{include/fxt/fxt.h}.
 notably contain @code{include/fxt/fxt.h}.
+@end defvr
 
 
-@item --with-perf-model-dir=@var{dir}
+@defvr {Configure option} --with-perf-model-dir=@var{dir}
 Store performance models under @var{dir}, instead of the current user's
 Store performance models under @var{dir}, instead of the current user's
 home.
 home.
+@end defvr
 
 
-@item --with-goto-dir=@var{prefix}
+@defvr {Configure option} --with-goto-dir=@var{prefix}
 Search for GotoBLAS under @var{prefix}, which should notably contain @file{libgoto.so} or @file{libgoto2.so}.
 Search for GotoBLAS under @var{prefix}, which should notably contain @file{libgoto.so} or @file{libgoto2.so}.
+@end defvr
 
 
-@item --with-atlas-dir=@var{prefix}
+@defvr {Configure option} --with-atlas-dir=@var{prefix}
 Search for ATLAS under @var{prefix}, which should notably contain
 Search for ATLAS under @var{prefix}, which should notably contain
 @file{include/cblas.h}.
 @file{include/cblas.h}.
+@end defvr
 
 
-@item --with-mkl-cflags=@var{cflags}
+@defvr {Configure option} --with-mkl-cflags=@var{cflags}
 Use @var{cflags} to compile code that uses the MKL library.
 Use @var{cflags} to compile code that uses the MKL library.
+@end defvr
 
 
-@item --with-mkl-ldflags=@var{ldflags}
+@defvr {Configure option} --with-mkl-ldflags=@var{ldflags}
 Use @var{ldflags} when linking code that uses the MKL library.  Note
 Use @var{ldflags} when linking code that uses the MKL library.  Note
 that the
 that the
 @url{http://software.intel.com/en-us/articles/intel-mkl-link-line-advisor/,
 @url{http://software.intel.com/en-us/articles/intel-mkl-link-line-advisor/,
 MKL website} provides a script to determine the linking flags.
 MKL website} provides a script to determine the linking flags.
+@end defvr
 
 
-@item --disable-build-examples
+@defvr {Configure option} --disable-build-examples
 Disable the build of examples.
 Disable the build of examples.
+@end defvr
+
 
 
-@item --enable-sched-ctx-hypervisor
-Enables the Scheduling Context Hypervisor plugin(@pxref{Scheduling Context Hypervisor}). 
+@defvr {Configure option} --enable-sched-ctx-hypervisor
+Enables the Scheduling Context Hypervisor plugin (@pxref{Scheduling Context Hypervisor}).
 By default, it is disabled.
 By default, it is disabled.
+@end defvr
 
 
-@end table
+@defvr {Configure option} --enable-memory-stats
+Enable memory statistics (@pxref{Memory feedback}).
+@end defvr
+
+@defvr {Configure option} --enable-simgrid
+Enable simulation of execution in simgrid, to allow easy experimentation with
+various numbers of cores and GPUs, or amount of memory, etc. Experimental.
+@end defvr
 
 
 @node Execution configuration through environment variables
 @node Execution configuration through environment variables
 @section Execution configuration through environment variables
 @section Execution configuration through environment variables
@@ -261,28 +319,41 @@ By default, it is disabled.
 @node Workers
 @node Workers
 @subsection Configuring workers
 @subsection Configuring workers
 
 
-@table @code
-
-@item @code{STARPU_NCPU}
-Specify the number of CPU workers (thus not including workers dedicated to control acceleratores). Note that by default, StarPU will not allocate
+@defvr {Environment variable} STARPU_NCPU
+Specify the number of CPU workers (thus not including workers dedicated to control accelerators). Note that by default, StarPU will not allocate
 more CPU workers than there are physical CPUs, and that some CPUs are used to control
 more CPU workers than there are physical CPUs, and that some CPUs are used to control
 the accelerators.
 the accelerators.
+@end defvr
 
 
-@item @code{STARPU_NCUDA}
+@defvr {Environment variable} STARPU_NCPUS
+This variable is deprecated. You should use @code{STARPU_NCPU}.
+@end defvr
+
+@defvr {Environment variable} STARPU_NCUDA
 Specify the number of CUDA devices that StarPU can use. If
 Specify the number of CUDA devices that StarPU can use. If
 @code{STARPU_NCUDA} is lower than the number of physical devices, it is
 @code{STARPU_NCUDA} is lower than the number of physical devices, it is
 possible to select which CUDA devices should be used by the means of the
 possible to select which CUDA devices should be used by the means of the
 @code{STARPU_WORKERS_CUDAID} environment variable. By default, StarPU will
 @code{STARPU_WORKERS_CUDAID} environment variable. By default, StarPU will
 create as many CUDA workers as there are CUDA devices.
 create as many CUDA workers as there are CUDA devices.
+@end defvr
 
 
-@item @code{STARPU_NOPENCL}
+@defvr {Environment variable} STARPU_NOPENCL
 OpenCL equivalent of the @code{STARPU_NCUDA} environment variable.
 OpenCL equivalent of the @code{STARPU_NCUDA} environment variable.
+@end defvr
+
+@defvr {Environment variable} STARPU_OPENCL_ON_CPUS
+By default, the OpenCL driver only enables GPU and accelerator
+devices. By setting the environment variable
+@code{STARPU_OPENCL_ON_CPUS} to 1, the OpenCL driver will also enable
+CPU devices.
+@end defvr
 
 
-@item @code{STARPU_WORKERS_NOBIND}
+@defvr {Environment variable} STARPU_WORKERS_NOBIND
 Setting it to non-zero will prevent StarPU from binding its threads to
 Setting it to non-zero will prevent StarPU from binding its threads to
 CPUs. This is for instance useful when running the testsuite in parallel.
 CPUs. This is for instance useful when running the testsuite in parallel.
+@end defvr
 
 
-@item @code{STARPU_WORKERS_CPUID}
+@defvr {Environment variable} STARPU_WORKERS_CPUID
 Passing an array of integers (starting from 0) in @code{STARPU_WORKERS_CPUID}
 Passing an array of integers (starting from 0) in @code{STARPU_WORKERS_CPUID}
 specifies on which logical CPU the different workers should be
 specifies on which logical CPU the different workers should be
 bound. For instance, if @code{STARPU_WORKERS_CPUID = "0 1 4 5"}, the first
 bound. For instance, if @code{STARPU_WORKERS_CPUID = "0 1 4 5"}, the first
@@ -305,8 +376,9 @@ third (resp. second and fourth) workers will be put on CPU #0 (resp. CPU #1).
 
 
 This variable is ignored if the @code{use_explicit_workers_bindid} flag of the
 This variable is ignored if the @code{use_explicit_workers_bindid} flag of the
 @code{starpu_conf} structure passed to @code{starpu_init} is set.
 @code{starpu_conf} structure passed to @code{starpu_init} is set.
+@end defvr
 
 
-@item @code{STARPU_WORKERS_CUDAID}
+@defvr {Environment variable} STARPU_WORKERS_CUDAID
 Similarly to the @code{STARPU_WORKERS_CPUID} environment variable, it is
 Similarly to the @code{STARPU_WORKERS_CPUID} environment variable, it is
 possible to select which CUDA devices should be used by StarPU. On a machine
 possible to select which CUDA devices should be used by StarPU. On a machine
 equipped with 4 GPUs, setting @code{STARPU_WORKERS_CUDAID = "1 3"} and
 equipped with 4 GPUs, setting @code{STARPU_WORKERS_CUDAID = "1 3"} and
@@ -316,20 +388,22 @@ the one reported by CUDA).
 
 
 This variable is ignored if the @code{use_explicit_workers_cuda_gpuid} flag of
 This variable is ignored if the @code{use_explicit_workers_cuda_gpuid} flag of
 the @code{starpu_conf} structure passed to @code{starpu_init} is set.
 the @code{starpu_conf} structure passed to @code{starpu_init} is set.
+@end defvr
 
 
-@item @code{STARPU_WORKERS_OPENCLID}
+@defvr {Environment variable} STARPU_WORKERS_OPENCLID
 OpenCL equivalent of the @code{STARPU_WORKERS_CUDAID} environment variable.
 OpenCL equivalent of the @code{STARPU_WORKERS_CUDAID} environment variable.
 
 
 This variable is ignored if the @code{use_explicit_workers_opencl_gpuid} flag of
 This variable is ignored if the @code{use_explicit_workers_opencl_gpuid} flag of
 the @code{starpu_conf} structure passed to @code{starpu_init} is set.
 the @code{starpu_conf} structure passed to @code{starpu_init} is set.
+@end defvr
 
 
-@item @code{STARPU_SINGLE_COMBINED_WORKER}
+@defvr {Environment variable} STARPU_SINGLE_COMBINED_WORKER
 If set, StarPU will create several workers which won't be able to work
 If set, StarPU will create several workers which won't be able to work
 concurrently. It will create combined workers which size goes from 1 to the
 concurrently. It will create combined workers which size goes from 1 to the
 total number of CPU workers in the system.
 total number of CPU workers in the system.
+@end defvr
 
 
-@item @code{SYNTHESIZE_ARITY_COMBINED_WORKER}
-
+@defvr {Environment variable} STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER
 Let the user decide how many elements are allowed between combined workers
 Let the user decide how many elements are allowed between combined workers
 created from hwloc information. For instance, in the case of sockets with 6
 created from hwloc information. For instance, in the case of sockets with 6
 cores without shared L2 caches, if @code{STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER} is
 cores without shared L2 caches, if @code{STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER} is
@@ -344,51 +418,55 @@ is already a normal worker for it).
 
 
 The default, 2, thus makes StarPU tend to build binary trees of combined
 The default, 2, thus makes StarPU tend to build binary trees of combined
 workers.
 workers.
+@end defvr
 
 
-@item @code{STARPU_DISABLE_ASYNCHRONOUS_COPY}
+@defvr {Environment variable} STARPU_DISABLE_ASYNCHRONOUS_COPY
 Disable asynchronous copies between CPU and GPU devices.
 Disable asynchronous copies between CPU and GPU devices.
 The AMD implementation of OpenCL is known to
 The AMD implementation of OpenCL is known to
 fail when copying data asynchronously. When using this implementation,
 fail when copying data asynchronously. When using this implementation,
 it is therefore necessary to disable asynchronous data transfers.
 it is therefore necessary to disable asynchronous data transfers.
+@end defvr
 
 
-@item @code{STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY}
+@defvr {Environment variable} STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY
 Disable asynchronous copies between CPU and CUDA devices.
 Disable asynchronous copies between CPU and CUDA devices.
+@end defvr
 
 
-@item @code{STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY}
+@defvr {Environment variable} STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY
 Disable asynchronous copies between CPU and OpenCL devices.
 Disable asynchronous copies between CPU and OpenCL devices.
 The AMD implementation of OpenCL is known to
 The AMD implementation of OpenCL is known to
 fail when copying data asynchronously. When using this implementation,
 fail when copying data asynchronously. When using this implementation,
 it is therefore necessary to disable asynchronous data transfers.
 it is therefore necessary to disable asynchronous data transfers.
+@end defvr
 
 
-@item @code{STARPU_DISABLE_CUDA_GPU_GPU_DIRECT}
+@defvr {Environment variable} STARPU_DISABLE_CUDA_GPU_GPU_DIRECT
 Disable direct CUDA transfers from GPU to GPU, and let CUDA copy through RAM
 Disable direct CUDA transfers from GPU to GPU, and let CUDA copy through RAM
 instead. This permits to test the performance effect of GPU-Direct.
 instead. This permits to test the performance effect of GPU-Direct.
-
-@end table
+@end defvr
 
 
 @node Scheduling
 @node Scheduling
 @subsection Configuring the Scheduling engine
 @subsection Configuring the Scheduling engine
 
 
-@table @code
-
-@item @code{STARPU_SCHED}
+@defvr {Environment variable} STARPU_SCHED
 Choose between the different scheduling policies proposed by StarPU: work
 Choose between the different scheduling policies proposed by StarPU: work
 random, stealing, greedy, with performance models, etc.
 random, stealing, greedy, with performance models, etc.
 
 
 Use @code{STARPU_SCHED=help} to get the list of available schedulers.
 Use @code{STARPU_SCHED=help} to get the list of available schedulers.
+@end defvr
 
 
-@item @code{STARPU_CALIBRATE}
+@defvr {Environment variable} STARPU_CALIBRATE
 If this variable is set to 1, the performance models are calibrated during
 If this variable is set to 1, the performance models are calibrated during
 the execution. If it is set to 2, the previous values are dropped to restart
 the execution. If it is set to 2, the previous values are dropped to restart
 calibration from scratch. Setting this variable to 0 disables calibration; this
 calibration from scratch. Setting this variable to 0 disables calibration; this
 is the default behaviour.
 is the default behaviour.
 
 
 Note: this currently only applies to @code{dm}, @code{dmda} and @code{heft} scheduling policies.
 Note: this currently only applies to @code{dm}, @code{dmda} and @code{heft} scheduling policies.
+@end defvr
 
 
-@item @code{STARPU_BUS_CALIBRATE}
+@defvr {Environment variable} STARPU_BUS_CALIBRATE
 If this variable is set to 1, the bus is recalibrated during initialization.
 If this variable is set to 1, the bus is recalibrated during initialization.
+@end defvr
 
 
-@item @code{STARPU_PREFETCH}
+@defvr {Environment variable} STARPU_PREFETCH
 @anchor{STARPU_PREFETCH}
 @anchor{STARPU_PREFETCH}
 This variable indicates whether data prefetching should be enabled (0 means
 This variable indicates whether data prefetching should be enabled (0 means
 that it is disabled). If prefetching is enabled, when a task is scheduled to be
 that it is disabled). If prefetching is enabled, when a task is scheduled to be
@@ -396,68 +474,115 @@ executed e.g. on a GPU, StarPU will request an asynchronous transfer in
 advance, so that data is already present on the GPU when the task starts. As a
 advance, so that data is already present on the GPU when the task starts. As a
 result, computation and data transfers are overlapped.
 result, computation and data transfers are overlapped.
 Note that prefetching is enabled by default in StarPU.
 Note that prefetching is enabled by default in StarPU.
+@end defvr
 
 
-@item @code{STARPU_SCHED_ALPHA}
+@defvr {Environment variable} STARPU_SCHED_ALPHA
 To estimate the cost of a task StarPU takes into account the estimated
 To estimate the cost of a task StarPU takes into account the estimated
 computation time (obtained thanks to performance models). The alpha factor is
 computation time (obtained thanks to performance models). The alpha factor is
 the coefficient to be applied to it before adding it to the communication part.
 the coefficient to be applied to it before adding it to the communication part.
+@end defvr
 
 
-@item @code{STARPU_SCHED_BETA}
+@defvr {Environment variable} STARPU_SCHED_BETA
 To estimate the cost of a task StarPU takes into account the estimated
 To estimate the cost of a task StarPU takes into account the estimated
 data transfer time (obtained thanks to performance models). The beta factor is
 data transfer time (obtained thanks to performance models). The beta factor is
 the coefficient to be applied to it before adding it to the computation part.
 the coefficient to be applied to it before adding it to the computation part.
+@end defvr
 
 
-@end table
+@defvr {Environment variable} STARPU_SCHED_GAMMA
+Define the execution time penalty of a joule (@pxref{Power-based scheduling}).
+@end defvr
+
+@defvr {Environment variable} STARPU_IDLE_POWER
+Define the idle power of the machine (@pxref{Power-based scheduling}).
+@end defvr
+
+@defvr {Environment variable} STARPU_PROFILING
+Enable on-line performance monitoring (@pxref{Enabling on-line performance monitoring}).
+@end defvr
 
 
 @node Extensions
 @node Extensions
 @subsection Extensions
 @subsection Extensions
 
 
-@table @code
-
-@item @code{SOCL_OCL_LIB_OPENCL}
+@defvr {Environment variable} SOCL_OCL_LIB_OPENCL
 THE SOCL test suite is only run when the environment variable
 The SOCL test suite is only run when the environment variable
 The SOCL test suite is only run when the environment variable
 @code{SOCL_OCL_LIB_OPENCL} is defined. It should contain the location
 of the libOpenCL.so file of the OCL ICD implementation.
 of the libOpenCL.so file of the OCL ICD implementation.
+@end defvr
 
 
-@item @code{STARPU_COMM_STATS}
+@defvr {Environment variable} STARPU_COMM_STATS
 @anchor{STARPU_COMM_STATS}
 @anchor{STARPU_COMM_STATS}
 Communication statistics for starpumpi (@pxref{StarPU MPI support})
 Communication statistics for starpumpi (@pxref{StarPU MPI support})
 will be enabled when the environment variable @code{STARPU_COMM_STATS}
 will be enabled when the environment variable @code{STARPU_COMM_STATS}
 is defined to a value other than 0.
 is defined to a value other than 0.
+@end defvr
 
 
-@item @code{STARPU_MPI_CACHE}
+@defvr {Environment variable} STARPU_MPI_CACHE
 @anchor{STARPU_MPI_CACHE}
 @anchor{STARPU_MPI_CACHE}
 Communication cache for starpumpi (@pxref{StarPU MPI support}) will be
 Communication cache for starpumpi (@pxref{StarPU MPI support}) will be
 disabled when the environment variable @code{STARPU_MPI_CACHE} is set
 disabled when the environment variable @code{STARPU_MPI_CACHE} is set
 to 0. It is enabled by default or for any other values of the variable
 to 0. It is enabled by default or for any other values of the variable
 @code{STARPU_MPI_CACHE}.
 @code{STARPU_MPI_CACHE}.
-@end table
+@end defvr
 
 
 @node Misc
 @node Misc
 @subsection Miscellaneous and debug
 @subsection Miscellaneous and debug
 
 
-@table @code
-
-@item @code{STARPU_SILENT}
+@defvr {Environment variable} STARPU_OPENCL_PROGRAM_DIR
+@anchor{STARPU_OPENCL_PROGRAM_DIR}
+This specifies the directory where the OpenCL codelet source files are
+located. The function @ref{starpu_opencl_load_program_source} looks
+for the codelet in the current directory, in the directory specified
+by the environment variable @code{STARPU_OPENCL_PROGRAM_DIR}, in the
+directory @code{share/starpu/opencl} of the installation directory of
+StarPU, and finally in the source directory of StarPU.
+@end defvr
+
+@defvr {Environment variable} STARPU_SILENT
 This variable allows to disable verbose mode at runtime when StarPU
 This variable allows to disable verbose mode at runtime when StarPU
 has been configured with the option @code{--enable-verbose}. It also
 has been configured with the option @code{--enable-verbose}. It also
 disables the display of StarPU information and warning messages.
 disables the display of StarPU information and warning messages.
+@end defvr
 
 
-@item @code{STARPU_LOGFILENAME}
+@defvr {Environment variable} STARPU_LOGFILENAME
 This variable specifies in which file the debugging output should be saved to.
 This variable specifies in which file the debugging output should be saved to.
+@end defvr
 
 
-@item @code{STARPU_FXT_PREFIX}
+@defvr {Environment variable} STARPU_FXT_PREFIX
 This variable specifies in which directory to save the trace generated if FxT is enabled. It needs to have a trailing '/' character.
 This variable specifies in which directory to save the trace generated if FxT is enabled. It needs to have a trailing '/' character.
+@end defvr
 
 
-@item @code{STARPU_LIMIT_GPU_MEM}
+@defvr {Environment variable} STARPU_LIMIT_GPU_MEM
 This variable specifies the maximum number of megabytes that should be
 This variable specifies the maximum number of megabytes that should be
 available to the application on each GPUs. In case this value is smaller than
 available to the application on each GPUs. In case this value is smaller than
 the size of the memory of a GPU, StarPU pre-allocates a buffer to waste memory
 the size of the memory of a GPU, StarPU pre-allocates a buffer to waste memory
 on the device. This variable is intended to be used for experimental purposes
 on the device. This variable is intended to be used for experimental purposes
 as it emulates devices that have a limited amount of memory.
 as it emulates devices that have a limited amount of memory.
+@end defvr
 
 
-@item @code{STARPU_GENERATE_TRACE}
+@defvr {Environment variable} STARPU_GENERATE_TRACE
 When set to 1, this variable indicates that StarPU should automatically
 When set to 1, this variable indicates that StarPU should automatically
-generate a Paje trace when starpu_shutdown is called.
-
-@end table
+generate a Paje trace when @code{starpu_shutdown()} is called.
+@end defvr
+
+@defvr {Environment variable} STARPU_MEMORY_STATS
+When set to 0, disable the display of memory statistics on data which
+have not been unregistered at the end of the execution (@pxref{Memory
+feedback}).
+@end defvr
+
+@defvr {Environment variable} STARPU_BUS_STATS
+When defined, statistics about data transfers will be displayed when calling
+@code{starpu_shutdown()} (@pxref{Profiling}).
+@end defvr
+
+@defvr {Environment variable} STARPU_WORKER_STATS
+When defined, statistics about the workers will be displayed when calling
+@code{starpu_shutdown()} (@pxref{Profiling}). When combined with the
+environment variable @code{STARPU_PROFILING}, it displays the power
+consumption (@pxref{Power-based scheduling}).
+@end defvr
+
+@defvr {Environment variable} STARPU_STATS
+When set to 0, data statistics will not be displayed at the
+end of the execution of an application (@pxref{Data statistics}).
+@end defvr

+ 0 - 1
doc/chapters/fdl-1.3.texi

@@ -505,4 +505,3 @@ to permit their use in free software.
 @c Local Variables:
 @c Local Variables:
 @c ispell-local-pdict: "ispell-dict"
 @c ispell-local-pdict: "ispell-dict"
 @c End:
 @c End:
-

+ 10 - 10
doc/chapters/installing.texi

@@ -7,9 +7,9 @@
 @c See the file starpu.texi for copying conditions.
 @c See the file starpu.texi for copying conditions.
 
 
 @menu
 @menu
-* Downloading StarPU::          
-* Configuration of StarPU::     
-* Building and Installing StarPU::  
+* Downloading StarPU::
+* Configuration of StarPU::
+* Building and Installing StarPU::
 @end menu
 @end menu
 
 
 StarPU can be built and installed by the standard means of the GNU
 StarPU can be built and installed by the standard means of the GNU
@@ -20,8 +20,8 @@ can be used to install StarPU.
 @section Downloading StarPU
 @section Downloading StarPU
 
 
 @menu
 @menu
-* Getting Sources::             
-* Optional dependencies::       
+* Getting Sources::
+* Optional dependencies::
 @end menu
 @end menu
 
 
 @node Getting Sources
 @node Getting Sources
@@ -69,8 +69,8 @@ of hwloc.
 @section Configuration of StarPU
 @section Configuration of StarPU
 
 
 @menu
 @menu
-* Generating Makefiles and configuration scripts::  
-* Running the configuration::   
+* Generating Makefiles and configuration scripts::
+* Running the configuration::
 @end menu
 @end menu
 
 
 @node Generating Makefiles and configuration scripts
 @node Generating Makefiles and configuration scripts
@@ -99,9 +99,9 @@ Details about options that are useful to give to @code{./configure} are given in
 @section Building and Installing StarPU
 @section Building and Installing StarPU
 
 
 @menu
 @menu
-* Building::                    
-* Sanity Checks::               
-* Installing::                  
+* Building::
+* Sanity Checks::
+* Installing::
 @end menu
 @end menu
 
 
 @node Building
 @node Building

+ 2 - 2
doc/chapters/introduction.texi

@@ -70,8 +70,8 @@ policies in a portable fashion (@pxref{Scheduling Policy API}).
 The remainder of this section describes the main concepts used in StarPU.
 The remainder of this section describes the main concepts used in StarPU.
 
 
 @menu
 @menu
-* Codelet and Tasks::           
-* StarPU Data Management Library::  
+* Codelet and Tasks::
+* StarPU Data Management Library::
 * Glossary::
 * Glossary::
 * Research Papers::
 * Research Papers::
 @end menu
 @end menu

+ 39 - 10
doc/chapters/mpi-support.texi

@@ -20,16 +20,24 @@ distributed application, by automatically issuing all required data transfers
 according to the task graph and an application-provided distribution.
 according to the task graph and an application-provided distribution.
 
 
 @menu
 @menu
-* The API::                     
-* Simple Example::              
-* Exchanging User Defined Data Interface::  
-* MPI Insert Task Utility::     
-* MPI Collective Operations::   
+* The API::
+* Simple Example::
+* Exchanging User Defined Data Interface::
+* MPI Insert Task Utility::
+* MPI Collective Operations::
 @end menu
 @end menu
 
 
 @node The API
 @node The API
 @section The API
 @section The API
 
 
+@menu
+* Compilation::
+* Initialisation::
+* Communication::
+* Communication cache::
+@end menu
+
+@node Compilation
 @subsection Compilation
 @subsection Compilation
 
 
 The flags required to compile or link against the MPI layer are then
 The flags required to compile or link against the MPI layer are then
@@ -42,21 +50,27 @@ accessible with the following commands:
 
 
 Also pass the @code{--static} option if the application is to be linked statically.
 Also pass the @code{--static} option if the application is to be linked statically.
 
 
+@node Initialisation
 @subsection Initialisation
 @subsection Initialisation
 
 
-@deftypefun int starpu_mpi_init (int *@var{argc}, char ***@var{argv})
-Initializes the starpumpi library. If MPI is not already initialized,
-it will be by calling @code{MPI_Init_Thread(argc, argv, MPI_THREAD_SERIALIZED, ...)}.
+@deftypefun int starpu_mpi_init (int *@var{argc}, char ***@var{argv}, int initialize_mpi)
+Initializes the starpumpi library. @code{initialize_mpi} indicates if
+MPI should be initialized or not by StarPU. If the value is not @code{0},
+MPI will be initialized by calling @code{MPI_Init_Thread(argc, argv,
+MPI_THREAD_SERIALIZED, ...)}.
 @end deftypefun
 @end deftypefun
 
 
 @deftypefun int starpu_mpi_initialize (void)
 @deftypefun int starpu_mpi_initialize (void)
 This function has been made deprecated. One should use instead the
 This function has been made deprecated. One should use instead the
 function @code{starpu_mpi_init()} defined above.
 function @code{starpu_mpi_init()} defined above.
+This function does not call @code{MPI_Init}, it should be called beforehand.
 @end deftypefun
 @end deftypefun
 
 
 @deftypefun int starpu_mpi_initialize_extended (int *@var{rank}, int *@var{world_size})
 @deftypefun int starpu_mpi_initialize_extended (int *@var{rank}, int *@var{world_size})
 This function has been made deprecated. One should use instead the
 This function has been made deprecated. One should use instead the
 function @code{starpu_mpi_init()} defined above.
 function @code{starpu_mpi_init()} defined above.
+MPI will be initialized by starpumpi by calling @code{MPI_Init_Thread(argc, argv,
+MPI_THREAD_SERIALIZED, ...)}.
 @end deftypefun
 @end deftypefun
 
 
 @deftypefun int starpu_mpi_shutdown (void)
 @deftypefun int starpu_mpi_shutdown (void)
@@ -73,6 +87,7 @@ to the world size. Communications statistics must be enabled
 (@pxref{STARPU_COMM_STATS}).
 (@pxref{STARPU_COMM_STATS}).
 @end deftypefun
 @end deftypefun
 
 
+@node Communication
 @subsection Communication
 @subsection Communication
 
 
 The standard point to point communications of MPI have been
 The standard point to point communications of MPI have been
@@ -165,6 +180,22 @@ node of the array @var{source} using the n-th message tag of the array
 On completion of the all the requests, @var{tag} is unlocked.
 On completion of the all the requests, @var{tag} is unlocked.
 @end deftypefun
 @end deftypefun
 
 
+@node Communication cache
+@subsection Communication cache
+
+@deftypefun void starpu_mpi_cache_flush (MPI_Comm @var{comm}, starpu_data_handle_t @var{data_handle})
+Clear the send and receive communication cache for the data
+@var{data_handle}. The function has to be called synchronously by all
+the MPI nodes.
+The function does nothing if the cache mechanism is disabled (@pxref{STARPU_MPI_CACHE}).
+@end deftypefun
+
+@deftypefun void starpu_mpi_cache_flush_all_data (MPI_Comm @var{comm})
+Clear the send and receive communication cache for all data. The
+function has to be called synchronously by all the MPI nodes.
+The function does nothing if the cache mechanism is disabled (@pxref{STARPU_MPI_CACHE}).
+@end deftypefun
+
 @page
 @page
 @node Simple Example
 @node Simple Example
 @section Simple Example
 @section Simple Example
@@ -561,5 +592,3 @@ for(x = 0; x < nblocks ;  x++) @{
 starpu_mpi_gather_detached(data_handles, nblocks, 0, MPI_COMM_WORLD);
 starpu_mpi_gather_detached(data_handles, nblocks, 0, MPI_COMM_WORLD);
 @end smallexample
 @end smallexample
 @end cartouche
 @end cartouche
-
-

+ 131 - 5
doc/chapters/perf-feedback.texi

@@ -7,17 +7,38 @@
 @c See the file starpu.texi for copying conditions.
 @c See the file starpu.texi for copying conditions.
 
 
 @menu
 @menu
+* Task debugger::               Using the Temanejo task debugger
 * On-line::                     On-line performance feedback
 * On-line::                     On-line performance feedback
 * Off-line::                    Off-line performance feedback
 * Off-line::                    Off-line performance feedback
 * Codelet performance::         Performance of codelets
 * Codelet performance::         Performance of codelets
-* Theoretical lower bound on execution time API::  
+* Theoretical lower bound on execution time API::
+* Memory feedback::
+* Data statistics::
 @end menu
 @end menu
 
 
+@node Task debugger
+@section Using the Temanejo task debugger
+
+StarPU can connect to Temanejo (see
+@url{http://www.hlrs.de/temanejo}), to permit
+nice visual task debugging. To do so, build Temanejo's @code{libayudame.so},
+install @code{Ayudame} to e.g. @code{/usr/local/include}, apply the
+@code{tools/patch-ayudame} to it to fix C build, re-@code{./configure}, make
+sure that it found it, rebuild StarPU.  Run the Temanejo GUI, give it the path
+to your application, any options you want to pass it, the path to libayudame.so.
+
+Make sure to specify at least the same number of CPUs in the dialog box as your
+machine has, otherwise an error will happen during execution. Future versions
+of Temanejo should be able to tell StarPU the number of CPUs to use.
+
+Tag numbers have to be below @code{4000000000000000000ULL} to be usable for
+Temanejo (so as to distinguish them from tasks).
+
 @node On-line
 @node On-line
 @section On-line performance feedback
 @section On-line performance feedback
 
 
 @menu
 @menu
-* Enabling monitoring::         Enabling on-line performance monitoring
+* Enabling on-line performance monitoring::
 * Task feedback::               Per-task feedback
 * Task feedback::               Per-task feedback
 * Codelet feedback::            Per-codelet feedback
 * Codelet feedback::            Per-codelet feedback
 * Worker feedback::             Per-worker feedback
 * Worker feedback::             Per-worker feedback
@@ -25,7 +46,7 @@
 * StarPU-Top::                  StarPU-Top interface
 * StarPU-Top::                  StarPU-Top interface
 @end menu
 @end menu
 
 
-@node Enabling monitoring
+@node Enabling on-line performance monitoring
 @subsection Enabling on-line performance monitoring
 @subsection Enabling on-line performance monitoring
 
 
 In order to enable online performance monitoring, the application can call
 In order to enable online performance monitoring, the application can call
@@ -87,7 +108,7 @@ because there is no task to execute at all (@code{sleeping_time}), and the
 number of tasks that were executed while profiling was enabled.
 number of tasks that were executed while profiling was enabled.
 These values give an estimation of the proportion of time spent doing real work,
 These values give an estimation of the proportion of time spent doing real work,
 and the time spent either sleeping because there are not enough executable
 and the time spent either sleeping because there are not enough executable
-tasks or simply wasted in pure StarPU overhead. 
+tasks or simply wasted in pure StarPU overhead.
 
 
 Calling @code{starpu_worker_get_profiling_info} resets the profiling
 Calling @code{starpu_worker_get_profiling_info} resets the profiling
 information associated to a worker.
 information associated to a worker.
@@ -98,7 +119,7 @@ generate a graphic showing the evolution of these values during the time, for
 the different workers.
 the different workers.
 
 
 @node Bus feedback
 @node Bus feedback
-@subsection Bus-related feedback 
+@subsection Bus-related feedback
 
 
 TODO: ajouter STARPU_BUS_STATS
 TODO: ajouter STARPU_BUS_STATS
 
 
@@ -433,3 +454,108 @@ Emit statistics of actual execution vs theoretical upper bound. @var{integer}
 permits to choose between integer solving (which takes a long time but is
 permits to choose between integer solving (which takes a long time but is
 correct), and relaxed solving (which provides an approximate solution).
 correct), and relaxed solving (which provides an approximate solution).
 @end deftypefun
 @end deftypefun
+
+@node Memory feedback
+@section Memory feedback
+
+It is possible to enable memory statistics. To do so, you need to pass the option
+@code{--enable-memory-stats} when running configure. It is then
+possible to call the function @code{starpu_display_memory_stats()} to
+display statistics about the current data handles registered within StarPU.
+
+Moreover, statistics will be displayed at the end of the execution on
+data handles which have not been cleared out. This can be disabled by
+setting the environment variable @code{STARPU_MEMORY_STATS} to 0.
+
+For example, if you do not unregister data at the end of the complex
+example, you will get something similar to:
+
+@example
+$ STARPU_MEMORY_STATS=0 ./examples/interface/complex
+Complex[0] = 45.00 + 12.00 i
+Complex[0] = 78.00 + 78.00 i
+Complex[0] = 45.00 + 12.00 i
+Complex[0] = 45.00 + 12.00 i
+@end example
+
+@example
+$ STARPU_MEMORY_STATS=1 ./examples/interface/complex
+Complex[0] = 45.00 + 12.00 i
+Complex[0] = 78.00 + 78.00 i
+Complex[0] = 45.00 + 12.00 i
+Complex[0] = 45.00 + 12.00 i
+
+#---------------------
+Memory stats:
+#-------
+Data on Node #3
+#-----
+Data : 0x553ff40
+Size : 16
+
+#--
+Data access stats
+/!\ Work Underway
+Node #0
+	Direct access : 4
+	Loaded (Owner) : 0
+	Loaded (Shared) : 0
+	Invalidated (was Owner) : 0
+
+Node #3
+	Direct access : 0
+	Loaded (Owner) : 0
+	Loaded (Shared) : 1
+	Invalidated (was Owner) : 0
+
+#-----
+Data : 0x5544710
+Size : 16
+
+#--
+Data access stats
+/!\ Work Underway
+Node #0
+	Direct access : 2
+	Loaded (Owner) : 0
+	Loaded (Shared) : 1
+	Invalidated (was Owner) : 1
+
+Node #3
+	Direct access : 0
+	Loaded (Owner) : 1
+	Loaded (Shared) : 0
+	Invalidated (was Owner) : 0
+@end example
+
+@node Data statistics
+@section Data statistics
+
+Different data statistics can be displayed at the end of the execution
+of the application. To enable them, you need to pass the option
+@code{--enable-stats} when calling @code{configure}. When calling
+@code{starpu_shutdown()} various statistics will be displayed,
+execution, MSI cache statistics, allocation cache statistics, and data
+transfer statistics. The display can be disabled by setting the
+environment variable @code{STARPU_STATS} to 0.
+
+@example
+$ ./examples/cholesky/cholesky_tag
+Computation took (in ms)
+518.16
+Synthetic GFlops : 44.21
+#---------------------
+MSI cache stats :
+TOTAL MSI stats	hit 1622 (66.23 %)	miss 827 (33.77 %)
+...
+@end example
+
+@example
+$ STARPU_STATS=0 ./examples/cholesky/cholesky_tag
+Computation took (in ms)
+518.16
+Synthetic GFlops : 44.21
+@end example
+
+@c TODO: data transfer stats are similar to the ones displayed when
+@c setting STARPU_BUS_STATS

+ 83 - 5
doc/chapters/perf-optimization.texi

@@ -22,6 +22,7 @@ TODO: improve!
 * Profiling::
 * Profiling::
 * CUDA-specific optimizations::
 * CUDA-specific optimizations::
 * Performance debugging::
 * Performance debugging::
+* Simulated performance::
 @end menu
 @end menu
 
 
 Simply encapsulating application kernels into tasks already permits to
 Simply encapsulating application kernels into tasks already permits to
@@ -122,10 +123,14 @@ only when another task writes some value to the handle.
 Like any other runtime, StarPU has some overhead to manage tasks. Since
 Like any other runtime, StarPU has some overhead to manage tasks. Since
 it does smart scheduling and data management, that overhead is not always
 it does smart scheduling and data management, that overhead is not always
 neglectable. The order of magnitude of the overhead is typically a couple of
 neglectable. The order of magnitude of the overhead is typically a couple of
-microseconds. The amount of work that a task should do should thus be somewhat
+microseconds, which is actually quite smaller than the CUDA overhead itself. The
+amount of work that a task should do should thus be somewhat
 bigger, to make sure that the overhead becomes negligible. The offline
 bigger, to make sure that the overhead becomes negligible. The offline
 performance feedback can provide a measure of task length, which should thus be
 performance feedback can provide a measure of task length, which should thus be
-checked if bad performance are observed.
+checked if bad performance is observed. To get a grasp at the scalability
+possibility according to task size, one can run
+@code{tests/microbenchs/tasks_size_overhead.sh} which draws curves of the
+speedup of independent tasks of very small sizes.
 
 
 @node Task submission
 @node Task submission
 @section Task submission
 @section Task submission
@@ -265,7 +270,7 @@ A graph can be drawn by using the @code{starpu_perfmodel_plot}:
 
 
 @example
 @example
 $ starpu_perfmodel_plot -s starpu_dlu_lu_model_22
 $ starpu_perfmodel_plot -s starpu_dlu_lu_model_22
-98304 393216 1572864 
+98304 393216 1572864
 $ gnuplot starpu_starpu_dlu_lu_model_22.gp
 $ gnuplot starpu_starpu_dlu_lu_model_22.gp
 $ gv starpu_starpu_dlu_lu_model_22.eps
 $ gv starpu_starpu_dlu_lu_model_22.eps
 @end example
 @end example
@@ -394,12 +399,12 @@ with these manual measurements through @code{starpu_perfmodel_update_history}.
 @node Profiling
 @node Profiling
 @section Profiling
 @section Profiling
 
 
-A quick view of how many tasks each worker has executed can be obtained by setting 
+A quick view of how many tasks each worker has executed can be obtained by setting
 @code{export STARPU_WORKER_STATS=1}. This is a convenient way to check that
 @code{export STARPU_WORKER_STATS=1}. This is a convenient way to check that
 execution did happen on accelerators without penalizing performance with
 execution did happen on accelerators without penalizing performance with
 the profiling overhead.
 the profiling overhead.
 
 
-A quick view of how much data transfers have been issued can be obtained by setting 
+A quick view of how much data transfers have been issued can be obtained by setting
 @code{export STARPU_BUS_STATS=1} .
 @code{export STARPU_BUS_STATS=1} .
 
 
 More detailed profiling information can be enabled by using @code{export STARPU_PROFILING=1} or by
 More detailed profiling information can be enabled by using @code{export STARPU_PROFILING=1} or by
@@ -457,3 +462,76 @@ detailed in the next chapter. The various informations should be checked for.
   greedy algorithm which thus performs badly.
   greedy algorithm which thus performs badly.
 @end itemize
 @end itemize
 @end itemize
 @end itemize
+
+You can also use the Temanejo task debugger (see @ref{Task debugger}) to
+visualize the task graph more easily.
+
+@node Simulated performance
+@section Simulated performance
+
+StarPU can use Simgrid in order to simulate execution on an arbitrary
+platform. The idea is to first compile StarPU normally, and run the application,
+so as to automatically benchmark the bus and the codelets.
+
+@cartouche
+@smallexample
+$ ./configure && make
+$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
+[starpu][_starpu_load_history_based_model] Warning: model matvecmult is not calibrated, forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.
+$ ...
+$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
+TEST PASSED
+@end smallexample
+@end cartouche
+
+Note that we force the use of the dmda scheduler to generate performance
+models for the application. The application may need to be run several
+times before the model is calibrated.
+
+Then, recompile StarPU, passing @code{--enable-simgrid} to @code{./configure}, and re-run the
+application, specifying the requested number of devices:
+
+@cartouche
+@smallexample
+$ ./configure --enable-simgrid && make
+$ STARPU_SCHED=dmda STARPU_NCPU=12 STARPU_NCUDA=0 STARPU_NOPENCL=1 ./examples/matvecmult/matvecmult
+TEST FAILED !!!
+@end smallexample
+@end cartouche
+
+It is normal that the test fails: since the computations are not actually
+performed (that is the whole point of simgrid), the result is wrong, of course.
+
+If the performance model is not sufficiently calibrated, the following
+error message will be displayed:
+
+@cartouche
+@smallexample
+$ STARPU_SCHED=dmda STARPU_NCPU=12 STARPU_NCUDA=0 STARPU_NOPENCL=1 ./examples/matvecmult/matvecmult
+[0.000000] [xbt_cfg/INFO] type in variable = 2
+[0.000000] [surf_workstation/INFO] surf_workstation_model_init_ptask_L07
+[starpu][_starpu_load_history_based_model] Warning: model matvecmult is not calibrated, forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.
+[starpu][_starpu_simgrid_execute_job][assert failure] Codelet matvecmult does not have a perfmodel, or is not calibrated enough
+$
+@end smallexample
+@end cartouche
+
+For now, only the number of CPUs can be arbitrarily chosen. The number of CUDA
+and OpenCL devices has to be lower than the real number on the current machine.
+
+The Simgrid default stack size is small; to increase it, use the
+parameter @code{--cfg=contexts/stack_size}, for example:
+
+@cartouche
+@smallexample
+$ STARPU_NCPU=12 STARPU_NCUDA=2 STARPU_NOPENCL=0 ./example --cfg=contexts/stack_size:8192
+[0.000000] [xbt_cfg/INFO] type in variable = 2
+[0.000000] [surf_workstation/INFO] surf_workstation_model_init_ptask_L07
+TEST FAILED !!!
+@end smallexample
+@end cartouche
+
+Note: of course, if the application uses @code{gettimeofday} to make its
+performance measurements, the real time will be used, which will be bogus. To
+get the simulated time, it has to use @code{starpu_timing_now} which returns the
+virtual timestamp in microseconds.

+ 6 - 7
doc/chapters/scaling-vector-example.texi

@@ -7,10 +7,10 @@
 @c See the file starpu.texi for copying conditions.
 @c See the file starpu.texi for copying conditions.
 
 
 @menu
 @menu
-* Main application::            
-* CPU Kernel::                 
-* CUDA Kernel::                
-* OpenCL Kernel::              
+* Main application::
+* CPU Kernel::
+* CUDA Kernel::
+* OpenCL Kernel::
 @end menu
 @end menu
 
 
 @node Main application
 @node Main application
@@ -32,8 +32,8 @@
 @section OpenCL Kernel
 @section OpenCL Kernel
 
 
 @menu
 @menu
-* Invoking the kernel::         
-* Source of the kernel::        
+* Invoking the kernel::
+* Source of the kernel::
 @end menu
 @end menu
 
 
 @node Invoking the kernel
 @node Invoking the kernel
@@ -45,4 +45,3 @@
 @subsection Source of the kernel
 @subsection Source of the kernel
 
 
 @include chapters/vector_scal_opencl_codelet.texi
 @include chapters/vector_scal_opencl_codelet.texi
-

+ 2 - 3
doc/chapters/using.texi

@@ -7,8 +7,8 @@
 @c See the file starpu.texi for copying conditions.
 @c See the file starpu.texi for copying conditions.
 
 
 @menu
 @menu
-* Setting flags for compiling and linking applications::  
-* Running a basic StarPU application::  
+* Setting flags for compiling and linking applications::
+* Running a basic StarPU application::
 * Kernel threads started by StarPU::
 * Kernel threads started by StarPU::
 * Enabling OpenCL::
 * Enabling OpenCL::
 @end menu
 @end menu
@@ -111,4 +111,3 @@ so:
 @example
 @example
 % STARPU_NCUDA=2 ./application
 % STARPU_NCUDA=2 ./application
 @end example
 @end example
-

+ 1 - 1
doc/chapters/vector_scal_cpu.texi

@@ -51,7 +51,7 @@ void scal_sse_func(void *buffers[], void *cl_arg)
     float factor = *(float *) cl_arg;
     float factor = *(float *) cl_arg;
     FACTOR = _mm_set1_ps(factor);
     FACTOR = _mm_set1_ps(factor);
 
 
-    unsigned int i;	
+    unsigned int i;
     for (i = 0; i < n_iterations; i++)
     for (i = 0; i < n_iterations; i++)
         VECTOR[i] = _mm_mul_ps(FACTOR, VECTOR[i]);
         VECTOR[i] = _mm_mul_ps(FACTOR, VECTOR[i]);
 
 

+ 7 - 2
doc/starpu.texi

@@ -82,12 +82,13 @@ was last updated on @value{UPDATED}.
 * StarPU Basic API::            	The Basic API to use StarPU
 * StarPU Basic API::            	The Basic API to use StarPU
 * StarPU Advanced API::         	Advanced use of StarPU
 * StarPU Advanced API::         	Advanced use of StarPU
 * Configuring StarPU::          	How to configure StarPU
 * Configuring StarPU::          	How to configure StarPU
-* Full source code for the 'Scaling a Vector' example::  
+* Full source code for the 'Scaling a Vector' example::
 * GNU Free Documentation License::  How you can copy and share this manual.
 * GNU Free Documentation License::  How you can copy and share this manual.
 
 
 * Concept Index::               Index of programming concepts.
 * Concept Index::               Index of programming concepts.
 * Function Index::              Index of C functions.
 * Function Index::              Index of C functions.
-* Datatype Index::              Index of C datatypes
+* Datatype Index::              Index of C datatypes.
+* Configuration Index::         Index of configuration options.
 @end menu
 @end menu
 
 
 @c ---------------------------------------------------------------------
 @c ---------------------------------------------------------------------
@@ -264,4 +265,8 @@ was last updated on @value{UPDATED}.
 @unnumbered Datatype Index
 @unnumbered Datatype Index
 @printindex tp
 @printindex tp
 
 
+@node Configuration Index
+@unnumbered Configuration Index
+@printindex vr
+
 @bye
 @bye

+ 2 - 3
examples/basic_examples/vector_scal.c

@@ -23,7 +23,6 @@
  *  3- how a kernel can manipulate the data (buffers[0].vector.ptr)
  *  3- how a kernel can manipulate the data (buffers[0].vector.ptr)
  */
  */
 
 
-#include <config.h>
 #include <starpu.h>
 #include <starpu.h>
 #include <stdlib.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <stdio.h>
@@ -42,13 +41,13 @@ extern void scal_opencl_func(void *buffers[], void *_args);
 static struct starpu_perfmodel vector_scal_model =
 static struct starpu_perfmodel vector_scal_model =
 {
 {
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
-	.symbol = "vector_scale"
+	.symbol = "vector_scal"
 };
 };
 
 
 static struct starpu_perfmodel vector_scal_power_model =
 static struct starpu_perfmodel vector_scal_power_model =
 {
 {
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
-	.symbol = "vector_scale_power"
+	.symbol = "vector_scal_power"
 };
 };
 
 
 static struct starpu_codelet cl =
 static struct starpu_codelet cl =

+ 1 - 1
examples/basic_examples/vector_scal_c.c

@@ -35,7 +35,7 @@ extern void scal_cuda_func(void *buffers[], void *_args);
 static struct starpu_perfmodel vector_scal_model =
 static struct starpu_perfmodel vector_scal_model =
 {
 {
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
-	.symbol = "vector_scale_model"
+	.symbol = "vector_scal_model"
 };
 };
 
 
 static struct starpu_codelet cl =
 static struct starpu_codelet cl =

+ 1 - 1
examples/basic_examples/vector_scal_cpu.c

@@ -15,7 +15,7 @@
  */
  */
 
 
 /*
 /*
- * This example complements vector_scale.c: here we implement a CPU version.
+ * This example complements vector_scal.c: here we implement a CPU version.
  */
  */
 
 
 #include "vector_scal_cpu_template.h"
 #include "vector_scal_cpu_template.h"

+ 1 - 1
examples/basic_examples/vector_scal_cpu_icc.icc

@@ -15,7 +15,7 @@
  */
  */
 
 
 /*
 /*
- * This example complements vector_scale.c: here we implement a CPU version,
+ * This example complements vector_scal.c: here we implement a CPU version,
  * meant to be compiled by icc.
  * meant to be compiled by icc.
  */
  */
 
 

+ 1 - 1
examples/basic_examples/vector_scal_cpu_template.h

@@ -15,7 +15,7 @@
  */
  */
 
 
 /*
 /*
- * This example complements vector_scale.c: here we implement a CPU version.
+ * This example complements vector_scal.c: here we implement a CPU version.
  */
  */
 
 
 #ifndef __VECTOR_SCAL_CPU_TEMPLATE_H__
 #ifndef __VECTOR_SCAL_CPU_TEMPLATE_H__

+ 1 - 1
examples/basic_examples/vector_scal_cuda.cu

@@ -16,7 +16,7 @@
  */
  */
 
 
 /*
 /*
- * This example complements vector_scale.c: here we implement a CUDA version.
+ * This example complements vector_scal.c: here we implement a CUDA version.
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>

+ 1 - 1
examples/basic_examples/vector_scal_opencl.c

@@ -17,7 +17,7 @@
  */
  */
 
 
 /*
 /*
- * This example complements vector_scale.c: here we implement a OpenCL version.
+ * This example complements vector_scal.c: here we implement a OpenCL version.
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>

+ 16 - 4
examples/cholesky/cholesky.h

@@ -58,10 +58,12 @@
 static unsigned size = 4*1024;
 static unsigned size = 4*1024;
 static unsigned nblocks = 16;
 static unsigned nblocks = 16;
 static unsigned nbigblocks = 8;
 static unsigned nbigblocks = 8;
-static unsigned pinned = 0;
+static unsigned pinned = 1;
 static unsigned noprio = 0;
 static unsigned noprio = 0;
 static unsigned check = 0;
 static unsigned check = 0;
 static unsigned bound = 0;
 static unsigned bound = 0;
+static unsigned bound_deps = 0;
+static unsigned bound_lp = 0;
 static unsigned with_ctxs = 0;
 static unsigned with_ctxs = 0;
 static unsigned with_noctxs = 0;
 static unsigned with_noctxs = 0;
 static unsigned chole1 = 0;
 static unsigned chole1 = 0;
@@ -127,9 +129,9 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 			nbigblocks = strtol(argv[++i], &argptr, 10);
 			nbigblocks = strtol(argv[++i], &argptr, 10);
 		}
 		}
 
 
-		if (strcmp(argv[i], "-pin") == 0)
+		if (strcmp(argv[i], "-no-pin") == 0)
 		{
 		{
-			pinned = 1;
+			pinned = 0;
 		}
 		}
 
 
 		if (strcmp(argv[i], "-no-prio") == 0)
 		if (strcmp(argv[i], "-no-prio") == 0)
@@ -142,6 +144,16 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 			bound = 1;
 			bound = 1;
 		}
 		}
 
 
+		if (strcmp(argv[i], "-bound-lp") == 0)
+		{
+			bound_lp = 1;
+		}
+
+		if (strcmp(argv[i], "-bound-deps") == 0)
+		{
+			bound_deps = 1;
+		}
+
 		if (strcmp(argv[i], "-check") == 0)
 		if (strcmp(argv[i], "-check") == 0)
 		{
 		{
 			check = 1;
 			check = 1;
@@ -149,7 +161,7 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 
 
 		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i],"--help") == 0)
 		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i],"--help") == 0)
 		{
 		{
-			fprintf(stderr,"usage : %s [-pin] [-size size] [-nblocks nblocks] [-check]\n", argv[0]);
+			fprintf(stderr,"usage : %s [-size size] [-nblocks nblocks] [-no-pin] [-no-prio] [-bound] [-bound-deps] [-bound-lp] [-check]\n", argv[0]);
 			fprintf(stderr,"Currently selected: %ux%u and %ux%u blocks\n", size, size, nblocks, nblocks);
 			fprintf(stderr,"Currently selected: %ux%u and %ux%u blocks\n", size, size, nblocks, nblocks);
 		}
 		}
 	}
 	}

+ 10 - 6
examples/cholesky/cholesky_grain_tag.c

@@ -288,6 +288,7 @@ static void initialize_system(float **A, unsigned dim, unsigned pinned)
 
 
 	starpu_helper_cublas_init();
 	starpu_helper_cublas_init();
 
 
+#ifndef STARPU_SIMGRID
 	if (pinned)
 	if (pinned)
 	{
 	{
 		starpu_malloc((void **)A, dim*dim*sizeof(float));
 		starpu_malloc((void **)A, dim*dim*sizeof(float));
@@ -296,21 +297,22 @@ static void initialize_system(float **A, unsigned dim, unsigned pinned)
 	{
 	{
 		*A = malloc(dim*dim*sizeof(float));
 		*A = malloc(dim*dim*sizeof(float));
 	}
 	}
+#endif
 }
 }
 
 
 int cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks, unsigned pinned)
 int cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks, unsigned pinned)
 {
 {
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 	int ret;
 	int ret;
 
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 
 	ret = cholesky_grain_rec(matA, size, ld, nblocks, nbigblocks, 0);
 	ret = cholesky_grain_rec(matA, size, ld, nblocks, nbigblocks, 0);
 
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 
 
@@ -345,9 +347,10 @@ int main(int argc, char **argv)
 
 
 	parse_args(argc, argv);
 	parse_args(argc, argv);
 
 
-	float *mat;
+	float *mat = NULL;
 	initialize_system(&mat, size, pinned);
 	initialize_system(&mat, size, pinned);
 
 
+#ifndef STARPU_SIMGRID
 	unsigned i,j;
 	unsigned i,j;
 	for (i = 0; i < size; i++)
 	for (i = 0; i < size; i++)
 	{
 	{
@@ -357,6 +360,7 @@ int main(int argc, char **argv)
 			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
 			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
 		}
 		}
 	}
 	}
+#endif
 
 
 
 
 #ifdef CHECK_OUTPUT
 #ifdef CHECK_OUTPUT

+ 18 - 10
examples/cholesky/cholesky_implicit.c

@@ -75,17 +75,17 @@ static void callback_turn_spmd_on(void *arg __attribute__ ((unused)))
 static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 {
 {
 	int ret;
 	int ret;
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 
 	unsigned i,j,k;
 	unsigned i,j,k;
 
 
 	int prio_level = noprio?STARPU_DEFAULT_PRIO:STARPU_MAX_PRIO;
 	int prio_level = noprio?STARPU_DEFAULT_PRIO:STARPU_MAX_PRIO;
 
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 
 	if (bound)
 	if (bound)
-		starpu_bound_start(0, 0);
+		starpu_bound_start(bound_deps, 0);
 	/* create all the DAG nodes */
 	/* create all the DAG nodes */
 	for (k = 0; k < nblocks; k++)
 	for (k = 0; k < nblocks; k++)
 	{
 	{
@@ -135,10 +135,10 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 	if (bound)
 	if (bound)
 		starpu_bound_stop();
 		starpu_bound_stop();
 
 
-	gettimeofday(&end, NULL);
-
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	end = starpu_timing_now();
 
 
+	//double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	unsigned long n = starpu_matrix_get_nx(dataA);
 	unsigned long n = starpu_matrix_get_nx(dataA);
 
 
 	double flop = (1.0f*n*n*n)/3.0f;
 	double flop = (1.0f*n*n*n)/3.0f;
@@ -151,6 +151,11 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 		FPRINTF(stdout, "%2.2f\n", timing/1000);
 		FPRINTF(stdout, "%2.2f\n", timing/1000);
 	
 	
 		FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
 		FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+		if (bound_lp)
+		{
+			FILE *f = fopen("cholesky.lp", "w");
+			starpu_bound_print_lp(f);
+		}
 		if (bound)
 		if (bound)
 		{
 		{
 			double res;
 			double res;
@@ -194,10 +199,11 @@ static int cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 static void execute_cholesky(unsigned size, unsigned nblocks)
 static void execute_cholesky(unsigned size, unsigned nblocks)
 {
 {
 	int ret;
 	int ret;
-	float *mat;
-	starpu_malloc((void **)&mat, (size_t)size*size*sizeof(float));
-
+	float *mat = NULL;
 	unsigned i,j;
 	unsigned i,j;
+
+#ifndef STARPU_SIMGRID
+	starpu_malloc((void **)&mat, (size_t)size*size*sizeof(float));
 	for (i = 0; i < size; i++)
 	for (i = 0; i < size; i++)
 	{
 	{
 		for (j = 0; j < size; j++)
 		for (j = 0; j < size; j++)
@@ -206,6 +212,7 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
 			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
 		}
 		}
 	}
 	}
+#endif
 
 
 /* #define PRINT_OUTPUT */
 /* #define PRINT_OUTPUT */
 #ifdef PRINT_OUTPUT
 #ifdef PRINT_OUTPUT
@@ -345,6 +352,7 @@ int main(int argc, char **argv)
 		execute_cholesky(size, nblocks);
 		execute_cholesky(size, nblocks);
 
 
 	starpu_helper_cublas_shutdown();
 	starpu_helper_cublas_shutdown();
+	starpu_free(mat);
 	starpu_shutdown();
 	starpu_shutdown();
 
 
 	return ret;
 	return ret;

+ 10 - 6
examples/cholesky/cholesky_tag.c

@@ -175,15 +175,15 @@ static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, u
 
 
 static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 {
 {
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 
 	struct starpu_task *entry_task = NULL;
 	struct starpu_task *entry_task = NULL;
 
 
 	/* create all the DAG nodes */
 	/* create all the DAG nodes */
 	unsigned i,j,k;
 	unsigned i,j,k;
 
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 
 	for (k = 0; k < nblocks; k++)
 	for (k = 0; k < nblocks; k++)
 	{
 	{
@@ -230,10 +230,10 @@ static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
 
 	starpu_data_unpartition(dataA, 0);
 	starpu_data_unpartition(dataA, 0);
 
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
 
 
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 
 
@@ -254,6 +254,7 @@ static int initialize_system(float **A, unsigned dim, unsigned pinned)
 
 
 	starpu_helper_cublas_init();
 	starpu_helper_cublas_init();
 
 
+#ifndef STARPU_SIMGRID
 	if (pinned)
 	if (pinned)
 	{
 	{
 		starpu_malloc((void **)A, (size_t)dim*dim*sizeof(float));
 		starpu_malloc((void **)A, (size_t)dim*dim*sizeof(float));
@@ -262,6 +263,7 @@ static int initialize_system(float **A, unsigned dim, unsigned pinned)
 	{
 	{
 		*A = malloc(dim*dim*sizeof(float));
 		*A = malloc(dim*dim*sizeof(float));
 	}
 	}
+#endif
 	return 0;
 	return 0;
 }
 }
 
 
@@ -318,10 +320,11 @@ int main(int argc, char **argv)
 
 
 	parse_args(argc, argv);
 	parse_args(argc, argv);
 
 
-	float *mat;
+	float *mat = NULL;
 	int ret = initialize_system(&mat, size, pinned);
 	int ret = initialize_system(&mat, size, pinned);
 	if (ret) return ret;
 	if (ret) return ret;
 
 
+#ifndef STARPU_SIMGRID
 	unsigned i,j;
 	unsigned i,j;
 	for (i = 0; i < size; i++)
 	for (i = 0; i < size; i++)
 	{
 	{
@@ -331,6 +334,7 @@ int main(int argc, char **argv)
 			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
 			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
 		}
 		}
 	}
 	}
+#endif
 
 
 
 
 #ifdef CHECK_OUTPUT
 #ifdef CHECK_OUTPUT

+ 8 - 6
examples/cholesky/cholesky_tile_tag.c

@@ -195,8 +195,8 @@ static int cholesky_no_stride(void)
 {
 {
 	int ret;
 	int ret;
 
 
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 
 	struct starpu_task *entry_task = NULL;
 	struct starpu_task *entry_task = NULL;
 
 
@@ -234,7 +234,7 @@ static int cholesky_no_stride(void)
 	}
 	}
 
 
 	/* schedule the codelet */
 	/* schedule the codelet */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 	ret = starpu_task_submit(entry_task);
 	ret = starpu_task_submit(entry_task);
 	if (ret == -ENODEV) return 77;
 	if (ret == -ENODEV) return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
@@ -242,9 +242,9 @@ static int cholesky_no_stride(void)
 	/* stall the application until the end of computations */
 	/* stall the application until the end of computations */
 	starpu_tag_wait(TAG11(nblocks-1));
 	starpu_tag_wait(TAG11(nblocks-1));
 
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 
 
@@ -257,7 +257,6 @@ static int cholesky_no_stride(void)
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
 	unsigned x, y;
 	unsigned x, y;
-	unsigned i, j;
 	int ret;
 	int ret;
 
 
 	parse_args(argc, argv);
 	parse_args(argc, argv);
@@ -275,6 +274,7 @@ int main(int argc, char **argv)
 
 
 	starpu_helper_cublas_init();
 	starpu_helper_cublas_init();
 
 
+#ifndef STARPU_SIMGRID
 	for (y = 0; y < nblocks; y++)
 	for (y = 0; y < nblocks; y++)
 	for (x = 0; x < nblocks; x++)
 	for (x = 0; x < nblocks; x++)
 	{
 	{
@@ -297,6 +297,7 @@ int main(int argc, char **argv)
 	for (x = 0; x < nblocks; x++)
 	for (x = 0; x < nblocks; x++)
 	if (x <= y)
 	if (x <= y)
 	{
 	{
+		unsigned i, j;
 		for (i = 0; i < BLOCKSIZE; i++)
 		for (i = 0; i < BLOCKSIZE; i++)
 		for (j = 0; j < BLOCKSIZE; j++)
 		for (j = 0; j < BLOCKSIZE; j++)
 		{
 		{
@@ -308,6 +309,7 @@ int main(int argc, char **argv)
 				A[y][x][i*BLOCKSIZE + j] += (float)(2*size);
 				A[y][x][i*BLOCKSIZE + j] += (float)(2*size);
 		}
 		}
 	}
 	}
+#endif
 
 
 	for (y = 0; y < nblocks; y++)
 	for (y = 0; y < nblocks; y++)
 	for (x = 0; x < nblocks; x++)
 	for (x = 0; x < nblocks; x++)

+ 37 - 107
examples/filters/custom_mf/custom_interface.c

@@ -157,125 +157,55 @@ static ssize_t allocate_custom_buffer_on_node(void *data_interface, uint32_t nod
 	struct custom_data_interface *custom_interface;
 	struct custom_data_interface *custom_interface;
 	custom_interface = (struct custom_data_interface *) data_interface;
 	custom_interface = (struct custom_data_interface *) data_interface;
 
 
-	switch(starpu_node_get_kind(node))
-	{
-	case STARPU_CPU_RAM:
-		size = custom_interface->nx * custom_interface->ops->cpu_elemsize;
-		custom_interface->cpu_ptr = (void*) malloc(size);
-		if (!custom_interface->cpu_ptr)
-			return -ENOMEM;
+	size = custom_interface->nx * custom_interface->ops->cpu_elemsize;
+	custom_interface->cpu_ptr = (void*) starpu_allocate_buffer_on_node(node, size);
+	if (!custom_interface->cpu_ptr)
+		goto fail_cpu;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-		custom_interface->cuda_ptr = (void *) malloc(size);
-		if (!custom_interface->cuda_ptr)
-		{
-			free(custom_interface->cpu_ptr);
-			custom_interface->cpu_ptr = NULL;
-			return -ENOMEM;
-		}
-#endif /* !STARPU_USE_CUDA */
+	custom_interface->cuda_ptr = (void*) starpu_allocate_buffer_on_node(node, size);
+	if (!custom_interface->cuda_ptr)
+		goto fail_cuda;
+#endif
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-		custom_interface->opencl_ptr = malloc(size);
-		if (custom_interface->cuda_ptr == NULL)
-		{
-			free(custom_interface->cpu_ptr);
-#ifdef STARPU_USE_CUDA
-			free(custom_interface->cuda_ptr);
-#endif /* !STARPU_USE_CUDA */
-			return -ENOMEM;
-		}
-#endif /* !STARPU_USE_OPENCL */
-			
-		break;
+	custom_interface->opencl_ptr = (void*) starpu_allocate_buffer_on_node(node, size);
+	if (!custom_interface->opencl_ptr)
+		goto fail_opencl;
+#endif
+
+	return size
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-	case STARPU_CUDA_RAM:
-	{
-		cudaError_t err;
-		size = custom_interface->nx * custom_interface->ops->cpu_elemsize;
-		err = cudaMalloc(&custom_interface->cuda_ptr, size);
-		if (err != cudaSuccess)
-			return -ENOMEM;
-
-		err = cudaMalloc(&custom_interface->cpu_ptr, size);
-		if (err != cudaSuccess)
-		{
-			cudaFree(custom_interface->cuda_ptr);
-			return -ENOMEM;
-		}
-		break;
-	}
+		+size
 #endif
 #endif
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-	case STARPU_OPENCL_RAM:
-	{
-		cl_int err;
-		cl_mem memory;
-		ssize_t size = custom_interface->nx * custom_interface->ops->cpu_elemsize;
-		err = starpu_opencl_allocate_memory(&memory, size, CL_MEM_READ_WRITE);
-		if (err != CL_SUCCESS)
-			STARPU_OPENCL_REPORT_ERROR(err);
-
-		custom_interface->opencl_ptr = memory;
-
-		break;
-	}
-#endif /* !STARPU_USE_OPENCL */
-	default:
-		assert(0);
-	}
-
-	/* XXX We may want to return cpu_size + cuda_size + ... */
-	return size;
+		+size
+#endif
+		;
+#ifdef STARPU_USE_OPENCL
+fail_opencl:
+#ifdef STARPU_USE_CUDA
+	starpu_free_buffer_on_node(node, (uintptr_t) custom_interface->cuda_ptr, size);
+#endif
+#endif
+#ifdef STARPU_USE_CUDA
+fail_cuda:
+#endif
+	starpu_free_buffer_on_node(node, (uintptr_t) custom_interface->cpu_ptr, size);
+fail_cpu:
+	return -ENOMEM;
 }
 }
 
 
 static void free_custom_buffer_on_node(void *data_interface, uint32_t node)
 static void free_custom_buffer_on_node(void *data_interface, uint32_t node)
 {
 {
-	struct custom_data_interface *custom_interface;
-	custom_interface = (struct custom_data_interface *) data_interface;
+	struct custom_data_interface *custom_interface = (struct custom_data_interface *) data_interface;
+	size_t size = custom_interface->nx * custom_interface->ops->cpu_elemsize;
 
 
-	switch(starpu_node_get_kind(node))
-	{
-	case STARPU_CPU_RAM:
-		if (custom_interface->cpu_ptr != NULL)
-		{
-			free(custom_interface->cpu_ptr);
-			custom_interface->cpu_ptr = NULL;
-		}
+	starpu_free_buffer_on_node(node, (uintptr_t) custom_interface->cpu_ptr, size);
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-		if (custom_interface->cuda_ptr != NULL)
-		{
-			free(custom_interface->cuda_ptr);
-			custom_interface->cuda_ptr = NULL;
-		}
-#endif /* !STARPU_USE_CUDA */
+	starpu_free_buffer_on_node(node, (uintptr_t) custom_interface->cuda_ptr, size);
+#endif
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-		if (custom_interface->opencl_ptr != NULL)
-		{
-			free(custom_interface->opencl_ptr);
-			custom_interface->opencl_ptr = NULL;
-		}
-#endif /* !STARPU_USE_OPENCL */
-		break;
-#ifdef STARPU_USE_CUDA
-	case STARPU_CUDA_RAM:
-		if (custom_interface->cpu_ptr != NULL)
-		{
-			cudaError_t err;
-			err = cudaFree(custom_interface->cpu_ptr);
-			if (err != cudaSuccess)
-				fprintf(stderr, "cudaFree failed...\n");
-		}
-		if (custom_interface->cuda_ptr != NULL)
-		{
-			cudaError_t err;
-			err = cudaFree(custom_interface->cuda_ptr);
-			if (err != cudaSuccess)
-				fprintf(stderr, "cudaFree failed...\n");
-		}
-		break;
-#endif /* !STARPU_USE_CUDA */
-	default:
-		assert(0);
-	}
+	starpu_free_buffer_on_node(node, (uintptr_t) custom_interface->opencl_ptr, size);
+#endif
 }
 }
 
 
 static void*
 static void*

+ 34 - 20
examples/interface/complex.c

@@ -18,6 +18,8 @@
 #include "complex_interface.h"
 #include "complex_interface.h"
 #include "complex_codelet.h"
 #include "complex_codelet.h"
 
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
 static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
 {
 {
        if (starpu_worker_get_type(workerid) == STARPU_OPENCL_WORKER)
        if (starpu_worker_get_type(workerid) == STARPU_OPENCL_WORKER)
@@ -37,6 +39,8 @@ static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nim
                /* Old card does not support doubles */
                /* Old card does not support doubles */
                return 0;
                return 0;
        }
        }
+#else
+       return 1;
 #endif
 #endif
 }
 }
 
 
@@ -57,7 +61,8 @@ struct starpu_codelet cl_copy =
 #endif
 #endif
 	.nbuffers = 2,
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_W},
 	.modes = {STARPU_R, STARPU_W},
-	.can_execute = can_execute
+	.can_execute = can_execute,
+	.name = "cl_copy"
 };
 };
 
 
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
@@ -75,6 +80,9 @@ int main(int argc, char **argv)
 	double copy_real = 78.0;
 	double copy_real = 78.0;
 	double copy_imaginary = 78.0;
 	double copy_imaginary = 78.0;
 
 
+	int compare;
+	int *compare_ptr = &compare;
+
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV) return 77;
 	if (ret == -ENODEV) return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
@@ -88,60 +96,66 @@ int main(int argc, char **argv)
 	starpu_complex_data_register(&handle2, 0, &copy_real, &copy_imaginary, 1);
 	starpu_complex_data_register(&handle2, 0, &copy_real, &copy_imaginary, 1);
 
 
 	ret = starpu_insert_task(&cl_display, STARPU_R, handle1, 0);
 	ret = starpu_insert_task(&cl_display, STARPU_R, handle1, 0);
-	if (ret == -ENODEV) goto enodev;
+	if (ret == -ENODEV) goto end;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 
 
 	ret = starpu_insert_task(&cl_display, STARPU_R, handle2, 0);
 	ret = starpu_insert_task(&cl_display, STARPU_R, handle2, 0);
-	if (ret == -ENODEV) goto enodev;
+	if (ret == -ENODEV) goto end;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 
 
 	ret = starpu_insert_task(&cl_compare,
 	ret = starpu_insert_task(&cl_compare,
 				 STARPU_R, handle1,
 				 STARPU_R, handle1,
 				 STARPU_R, handle2,
 				 STARPU_R, handle2,
+				 STARPU_VALUE, &compare_ptr, sizeof(compare_ptr),
 				 0);
 				 0);
-	if (ret == -ENODEV) goto enodev;
+	if (ret == -ENODEV) goto end;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+	starpu_task_wait_for_all();
+	if (compare != 0)
+	{
+	     FPRINTF(stderr, "Complex numbers should NOT be similar\n");
+	     goto end;
+	}
 
 
 	ret = starpu_insert_task(&cl_copy,
 	ret = starpu_insert_task(&cl_copy,
 				 STARPU_R, handle1,
 				 STARPU_R, handle1,
 				 STARPU_W, handle2,
 				 STARPU_W, handle2,
 				 0);
 				 0);
-	if (ret == -ENODEV) goto enodev;
+	if (ret == -ENODEV) goto end;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 
 
 	ret = starpu_insert_task(&cl_display, STARPU_R, handle1, 0);
 	ret = starpu_insert_task(&cl_display, STARPU_R, handle1, 0);
-	if (ret == -ENODEV) goto enodev;
+	if (ret == -ENODEV) goto end;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 
 
 	ret = starpu_insert_task(&cl_display, STARPU_R, handle2, 0);
 	ret = starpu_insert_task(&cl_display, STARPU_R, handle2, 0);
-	if (ret == -ENODEV) goto enodev;
+	if (ret == -ENODEV) goto end;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 
 
 	ret = starpu_insert_task(&cl_compare,
 	ret = starpu_insert_task(&cl_compare,
 				 STARPU_R, handle1,
 				 STARPU_R, handle1,
 				 STARPU_R, handle2,
 				 STARPU_R, handle2,
+				 STARPU_VALUE, &compare_ptr, sizeof(compare_ptr),
 				 0);
 				 0);
-	if (ret == -ENODEV) goto enodev;
+	if (ret == -ENODEV) goto end;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 
 
-#warning get the comparison result and return it as the application return code
-
 	starpu_task_wait_for_all();
 	starpu_task_wait_for_all();
 
 
-#ifdef STARPU_USE_OPENCL
-        ret = starpu_opencl_unload_opencl(&opencl_program);
-        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
-#endif
-	starpu_shutdown();
-	return 0;
+	if (compare != 1)
+	{
+	     FPRINTF(stderr, "Complex numbers should be similar\n");
+	}
 
 
-enodev:
+end:
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-        ret = starpu_opencl_unload_opencl(&opencl_program);
-        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
+	{
+	     int ret2 = starpu_opencl_unload_opencl(&opencl_program);
+	     STARPU_CHECK_RETURN_VALUE(ret2, "starpu_opencl_unload_opencl");
+	}
 #endif
 #endif
 	starpu_data_unregister(handle1);
 	starpu_data_unregister(handle1);
 	starpu_data_unregister(handle2);
 	starpu_data_unregister(handle2);
 	starpu_shutdown();
 	starpu_shutdown();
-	return 77;
+	if (ret == -ENODEV) return 77; else return !compare;
 }
 }

+ 10 - 6
examples/interface/complex_codelet.h

@@ -20,7 +20,7 @@
 #ifndef __COMPLEX_CODELET_H
 #ifndef __COMPLEX_CODELET_H
 #define __COMPLEX_CODELET_H
 #define __COMPLEX_CODELET_H
 
 
-void compare_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+void compare_complex_codelet(void *descr[], void *_args)
 {
 {
 	int nx1 = STARPU_COMPLEX_GET_NX(descr[0]);
 	int nx1 = STARPU_COMPLEX_GET_NX(descr[0]);
 	double *real1 = STARPU_COMPLEX_GET_REAL(descr[0]);
 	double *real1 = STARPU_COMPLEX_GET_REAL(descr[0]);
@@ -30,7 +30,10 @@ void compare_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args
 	double *real2 = STARPU_COMPLEX_GET_REAL(descr[1]);
 	double *real2 = STARPU_COMPLEX_GET_REAL(descr[1]);
 	double *imaginary2 = STARPU_COMPLEX_GET_IMAGINARY(descr[1]);
 	double *imaginary2 = STARPU_COMPLEX_GET_IMAGINARY(descr[1]);
 
 
-	int compare = (nx1 == nx2);
+	int *compare;
+
+	starpu_codelet_unpack_args(_args, &compare);
+	*compare = (nx1 == nx2);
 	if (nx1 == nx2)
 	if (nx1 == nx2)
 	{
 	{
 		int i;
 		int i;
@@ -38,19 +41,19 @@ void compare_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args
 		{
 		{
 			if (real1[i] != real2[i] || imaginary1[i] != imaginary2[i])
 			if (real1[i] != real2[i] || imaginary1[i] != imaginary2[i])
 			{
 			{
-				compare = 0;
+				*compare = 0;
 				break;
 				break;
 			}
 			}
 		}
 		}
 	}
 	}
-	fprintf(stderr, "Complex numbers are%s similar\n", compare==0 ? " NOT" : "");
 }
 }
 
 
 struct starpu_codelet cl_compare =
 struct starpu_codelet cl_compare =
 {
 {
 	.cpu_funcs = {compare_complex_codelet, NULL},
 	.cpu_funcs = {compare_complex_codelet, NULL},
 	.nbuffers = 2,
 	.nbuffers = 2,
-	.modes = {STARPU_R, STARPU_R}
+	.modes = {STARPU_R, STARPU_R},
+	.name = "cl_compare"
 };
 };
 
 
 void display_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 void display_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args)
@@ -70,7 +73,8 @@ struct starpu_codelet cl_display =
 {
 {
 	.cpu_funcs = {display_complex_codelet, NULL},
 	.cpu_funcs = {display_complex_codelet, NULL},
 	.nbuffers = 1,
 	.nbuffers = 1,
-	.modes = {STARPU_R}
+	.modes = {STARPU_R},
+	.name = "cl_display"
 };
 };
 
 
 #endif /* __COMPLEX_CODELET_H */
 #endif /* __COMPLEX_CODELET_H */

+ 21 - 73
examples/interface/complex_interface.c

@@ -62,89 +62,36 @@ static starpu_ssize_t complex_allocate_data_on_node(void *data_interface, uint32
 {
 {
 	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) data_interface;
 	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) data_interface;
 
 
-	unsigned fail = 0;
 	double *addr_real = 0;
 	double *addr_real = 0;
 	double *addr_imaginary = 0;
 	double *addr_imaginary = 0;
 	ssize_t requested_memory = complex_interface->nx * sizeof(complex_interface->real[0]);
 	ssize_t requested_memory = complex_interface->nx * sizeof(complex_interface->real[0]);
 
 
-	enum starpu_node_kind kind = starpu_node_get_kind(node);
-
-	switch(kind)
-	{
-		case STARPU_CPU_RAM:
-			addr_real = malloc(requested_memory);
-			addr_imaginary = malloc(requested_memory);
-			if (!addr_real || !addr_imaginary)
-				fail = 1;
-			break;
-#ifdef STARPU_USE_CUDA
-		case STARPU_CUDA_RAM:
-		{
-			cudaError_t status;
-			status = cudaMalloc((void **)&addr_real, requested_memory);
-			if (!addr_real || (status != cudaSuccess))
-			{
-				if (STARPU_UNLIKELY(status != cudaErrorMemoryAllocation))
-					STARPU_CUDA_REPORT_ERROR(status);
-
-				fail = 1;
-			}
-			else
-			{
-				status = cudaMalloc((void **)&addr_imaginary, requested_memory);
-				if (!addr_imaginary || (status != cudaSuccess))
-				{
-					if (STARPU_UNLIKELY(status != cudaErrorMemoryAllocation))
-						STARPU_CUDA_REPORT_ERROR(status);
-
-					fail = 1;
-				}
-			}
-
-			break;
-		}
-#endif
-#ifdef STARPU_USE_OPENCL
-	        case STARPU_OPENCL_RAM:
-		{
-			int ret;
-			cl_mem real, imaginary;
-			ret = starpu_opencl_allocate_memory(&real, requested_memory, CL_MEM_READ_WRITE);
-			if (ret != CL_SUCCESS)
-			{
-				fail = 1;
-				break;
-			}
-			else
-			{
-				addr_real = (double *) real;
-			}
-
-			ret = starpu_opencl_allocate_memory(&imaginary, requested_memory, CL_MEM_READ_WRITE);
-			if (ret != CL_SUCCESS)
-			{
-				fail = 1;
-				break;
-			}
-			else
-			{
-				addr_imaginary = (double *) imaginary;
-			}
-			break;
-		}
-#endif
-		default:
-			STARPU_ABORT();
-	}
-
-	if (fail)
-		return -ENOMEM;
+	addr_real = (double*) starpu_allocate_buffer_on_node(node, requested_memory);
+	if (!addr_real)
+		goto fail_real;
+	addr_imaginary = (double*) starpu_allocate_buffer_on_node(node, requested_memory);
+	if (!addr_imaginary)
+		goto fail_imaginary;
 
 
 	/* update the data properly in consequence */
 	/* update the data properly in consequence */
 	complex_interface->real = addr_real;
 	complex_interface->real = addr_real;
 	complex_interface->imaginary = addr_imaginary;
 	complex_interface->imaginary = addr_imaginary;
 
 
 	return 2*requested_memory;
 	return 2*requested_memory;
+
+fail_imaginary:
+	starpu_free_buffer_on_node(node, (uintptr_t) addr_real, requested_memory);
+fail_real:
+	return -ENOMEM;
+}
+
+static void complex_free_data_on_node(void *data_interface, uint32_t node)
+{
+	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) data_interface;
+	ssize_t requested_memory = complex_interface->nx * sizeof(complex_interface->real[0]);
+
+	starpu_free_buffer_on_node(node, (uintptr_t) complex_interface->real, requested_memory);
+	starpu_free_buffer_on_node(node, (uintptr_t) complex_interface->imaginary, requested_memory);
 }
 }
 
 
 static size_t complex_get_size(starpu_data_handle_t handle)
 static size_t complex_get_size(starpu_data_handle_t handle)
@@ -338,6 +285,7 @@ static struct starpu_data_interface_ops interface_complex_ops =
 {
 {
 	.register_data_handle = complex_register_data_handle,
 	.register_data_handle = complex_register_data_handle,
 	.allocate_data_on_node = complex_allocate_data_on_node,
 	.allocate_data_on_node = complex_allocate_data_on_node,
+	.free_data_on_node = complex_free_data_on_node,
 	.copy_methods = &complex_copy_methods,
 	.copy_methods = &complex_copy_methods,
 	.get_size = complex_get_size,
 	.get_size = complex_get_size,
 	.footprint = complex_footprint,
 	.footprint = complex_footprint,

+ 19 - 11
examples/matvecmult/matvecmult.c

@@ -121,9 +121,27 @@ int compareL2fe(const float* reference, const float* data, const unsigned int le
     return error < epsilon ? 0 : 1;
     return error < epsilon ? 0 : 1;
 }
 }
 
 
+static struct starpu_perfmodel starpu_matvecmult_model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "matvecmult"
+};
+
+static struct starpu_codelet cl =
+{
+	.where = STARPU_OPENCL,
+#ifdef STARPU_USE_OPENCL
+        .opencl_funcs[0] = opencl_codelet,
+#endif
+        .nbuffers = 3,
+	.modes[0] = STARPU_R,
+	.modes[1] = STARPU_R,
+	.modes[2] = STARPU_RW,
+	.model = &starpu_matvecmult_model
+};
+
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-	struct starpu_codelet cl = {};
 
 
 	struct starpu_conf conf;
 	struct starpu_conf conf;
 	
 	
@@ -179,16 +197,6 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
 #endif
 #endif
 
 
-	cl.where = STARPU_OPENCL;
-#ifdef STARPU_USE_OPENCL
-        cl.opencl_funcs[0] = opencl_codelet;
-#endif
-        cl.nbuffers = 3;
-	cl.modes[0] = STARPU_R;
-	cl.modes[1] = STARPU_R;
-	cl.modes[2] = STARPU_RW;
-        cl.model = NULL;
-
         struct starpu_task *task = starpu_task_create();
         struct starpu_task *task = starpu_task_create();
         task->cl = &cl;
         task->cl = &cl;
         task->callback_func = NULL;
         task->callback_func = NULL;

+ 7 - 5
examples/mult/xgemm.c

@@ -76,6 +76,7 @@ static void init_problem_data(void)
 {
 {
 	unsigned i,j;
 	unsigned i,j;
 
 
+#ifndef STARPU_SIMGRID
 	starpu_malloc((void **)&A, zdim*ydim*sizeof(TYPE));
 	starpu_malloc((void **)&A, zdim*ydim*sizeof(TYPE));
 	starpu_malloc((void **)&B, xdim*zdim*sizeof(TYPE));
 	starpu_malloc((void **)&B, xdim*zdim*sizeof(TYPE));
 	starpu_malloc((void **)&C, xdim*ydim*sizeof(TYPE));
 	starpu_malloc((void **)&C, xdim*ydim*sizeof(TYPE));
@@ -104,6 +105,7 @@ static void init_problem_data(void)
 			C[j+i*ydim] = (TYPE)(0);
 			C[j+i*ydim] = (TYPE)(0);
 		}
 		}
 	}
 	}
+#endif
 }
 }
 
 
 static void partition_mult_data(void)
 static void partition_mult_data(void)
@@ -281,8 +283,7 @@ static void parse_args(int argc, char **argv)
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-	struct timeval start;
-	struct timeval end;
+	double start, end;
 	int ret;
 	int ret;
 
 
 	parse_args(argc, argv);
 	parse_args(argc, argv);
@@ -301,7 +302,7 @@ int main(int argc, char **argv)
 	init_problem_data();
 	init_problem_data();
 	partition_mult_data();
 	partition_mult_data();
 
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 
 	unsigned x, y, iter;
 	unsigned x, y, iter;
 	for (iter = 0; iter < niter; iter++)
 	for (iter = 0; iter < niter; iter++)
@@ -330,8 +331,9 @@ int main(int argc, char **argv)
 	}
 	}
 
 
 
 
-	gettimeofday(&end, NULL);
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	end = starpu_timing_now();
+
+	double timing = end - start;
 
 
 	FPRINTF(stderr, "Time: %2.2f ms\n", timing/1000.0);
 	FPRINTF(stderr, "Time: %2.2f ms\n", timing/1000.0);
 
 

+ 1 - 1
examples/openmp/vector_scal.c

@@ -51,7 +51,7 @@ void scal_cpu_func(void *buffers[], void *_args)
 static struct starpu_perfmodel vector_scal_model =
 static struct starpu_perfmodel vector_scal_model =
 {
 {
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
-	.symbol = "vector_scale_parallel"
+	.symbol = "vector_scal_parallel"
 };
 };
 
 
 static struct starpu_codelet cl =
 static struct starpu_codelet cl =

+ 1 - 1
examples/spmd/vector_scal_spmd.c

@@ -75,7 +75,7 @@ void scal_cpu_func(void *buffers[], void *_args)
 static struct starpu_perfmodel vector_scal_model =
 static struct starpu_perfmodel vector_scal_model =
 {
 {
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
-	.symbol = "vector_scale_parallel"
+	.symbol = "vector_scal_parallel"
 };
 };
 
 
 static struct starpu_codelet cl =
 static struct starpu_codelet cl =

+ 5 - 3
examples/stencil/stencil.c

@@ -152,7 +152,7 @@ static void init_problem(int argc, char **argv, int rank, int world_size)
  */
  */
 
 
 struct timeval start;
 struct timeval start;
-struct timeval end;
+double begin, end;
 double timing; 
 double timing; 
 
 
 void f(unsigned task_per_worker[STARPU_NMAXWORKERS])
 void f(unsigned task_per_worker[STARPU_NMAXWORKERS])
@@ -242,11 +242,13 @@ int main(int argc, char **argv)
 
 
 	gettimeofday(&start, NULL);
 	gettimeofday(&start, NULL);
 
 
+	begin = starpu_timing_now();
+
 	starpu_tag_notify_from_apps(TAG_INIT_TASK);
 	starpu_tag_notify_from_apps(TAG_INIT_TASK);
 
 
 	wait_end_tasks(rank);
 	wait_end_tasks(rank);
 
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
 
 #ifdef STARPU_USE_MPI
 #ifdef STARPU_USE_MPI
 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
@@ -264,7 +266,7 @@ int main(int argc, char **argv)
 #endif
 #endif
 
 
 	/* timing in us */
 	/* timing in us */
-	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	timing = end - begin;
 
 
 	double min_timing = timing;
 	double min_timing = timing;
 	double max_timing = timing;
 	double max_timing = timing;

+ 6 - 2
include/starpu.h

@@ -66,6 +66,10 @@ extern "C"
 {
 {
 #endif
 #endif
 
 
+#ifdef STARPU_SIMGRID
+#define main starpu_main
+#endif
+
 enum starpu_archtype
 enum starpu_archtype
 {
 {
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
@@ -140,10 +144,10 @@ struct starpu_conf
 	int disable_asynchronous_copy;
 	int disable_asynchronous_copy;
 
 
         /* indicate if asynchronous copies to CUDA devices should be disabled */
         /* indicate if asynchronous copies to CUDA devices should be disabled */
-	int disable_cuda_asynchronous_copy;
+	int disable_asynchronous_cuda_copy;
 
 
         /* indicate if asynchronous copies to OpenCL devices should be disabled */
         /* indicate if asynchronous copies to OpenCL devices should be disabled */
-	int disable_opencl_asynchronous_copy;
+	int disable_asynchronous_opencl_copy;
 
 
 	/* Enable CUDA/OpenGL interoperation on these CUDA devices */
 	/* Enable CUDA/OpenGL interoperation on these CUDA devices */
 	unsigned *cuda_opengl_interoperability;
 	unsigned *cuda_opengl_interoperability;

+ 4 - 0
include/starpu_config.h.in

@@ -26,6 +26,8 @@
 #undef STARPU_USE_OPENCL
 #undef STARPU_USE_OPENCL
 #undef STARPU_USE_GORDON
 #undef STARPU_USE_GORDON
 
 
+#undef STARPU_SIMGRID
+
 #undef STARPU_HAVE_ICC
 #undef STARPU_HAVE_ICC
 
 
 #undef STARPU_USE_MPI
 #undef STARPU_USE_MPI
@@ -93,5 +95,7 @@ typedef ssize_t starpu_ssize_t;
 #undef STARPU_QUICK_CHECK
 #undef STARPU_QUICK_CHECK
 #undef STARPU_USE_DRAND48
 #undef STARPU_USE_DRAND48
 #undef STARPU_USE_ERAND48_R
 #undef STARPU_USE_ERAND48_R
+#undef STARPU_HAVE_NEARBYINTF
+#undef STARPU_HAVE_RINTF
 
 
 #endif
 #endif

+ 1 - 0
include/starpu_data.h

@@ -87,6 +87,7 @@ void starpu_data_release_on_node(starpu_data_handle_t handle, unsigned node);
 
 
 int starpu_malloc(void **A, size_t dim);
 int starpu_malloc(void **A, size_t dim);
 int starpu_free(void *A);
 int starpu_free(void *A);
+void starpu_memory_display_stats();
 
 
 /* XXX These macros are provided to avoid breaking old codes. But consider
 /* XXX These macros are provided to avoid breaking old codes. But consider
  * these function names as deprecated. */
  * these function names as deprecated. */

+ 5 - 0
include/starpu_data_interfaces.h

@@ -144,6 +144,11 @@ int starpu_data_interface_get_next_id(void);
 void starpu_data_register(starpu_data_handle_t *handleptr, uint32_t home_node, void *data_interface, struct starpu_data_interface_ops *ops);
 void starpu_data_register(starpu_data_handle_t *handleptr, uint32_t home_node, void *data_interface, struct starpu_data_interface_ops *ops);
 void starpu_data_register_same(starpu_data_handle_t *handledst, starpu_data_handle_t handlesrc);
 void starpu_data_register_same(starpu_data_handle_t *handledst, starpu_data_handle_t handlesrc);
 
 
+/* Allocate SIZE bytes on node NODE */
+uintptr_t starpu_allocate_buffer_on_node(uint32_t dst_node, size_t size);
+/* Free ADDR on node NODE */
+void starpu_free_buffer_on_node(uint32_t dst_node, uintptr_t addr, size_t size);
+
 /* Return the pointer associated with HANDLE on node NODE or NULL if HANDLE's
 /* Return the pointer associated with HANDLE on node NODE or NULL if HANDLE's
  * interface does not support this operation or data for this handle is not
  * interface does not support this operation or data for this handle is not
  * allocated on that node. */
  * allocated on that node. */

+ 1 - 1
include/starpu_perfmodel.h

@@ -188,7 +188,7 @@ struct starpu_perfmodel
 	unsigned is_loaded;
 	unsigned is_loaded;
 	unsigned benchmarking;
 	unsigned benchmarking;
 
 
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) || defined(STARPU_SIMGRID)
 	void *model_rwlock;
 	void *model_rwlock;
 #else
 #else
 	pthread_rwlock_t model_rwlock;
 	pthread_rwlock_t model_rwlock;

+ 5 - 5
include/starpu_scheduler.h

@@ -272,15 +272,15 @@ int starpu_prefetch_task_input_on_node(struct starpu_task *task, uint32_t node);
  *	Performance predictions
  *	Performance predictions
  */
  */
 
 
-/* Return the current date */
+/* Return the current date in µs */
 double starpu_timing_now(void);
 double starpu_timing_now(void);
-/* Returns expected task duration in µs */
+/* Returns expected task duration in µs */
 double starpu_task_expected_length(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
 double starpu_task_expected_length(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
 /* Returns an estimated speedup factor relative to CPU speed */
 /* Returns an estimated speedup factor relative to CPU speed */
 double starpu_worker_get_relative_speedup(enum starpu_perf_archtype perf_archtype);
 double starpu_worker_get_relative_speedup(enum starpu_perf_archtype perf_archtype);
-/* Returns expected data transfer time in µs */
+/* Returns expected data transfer time in µs */
 double starpu_task_expected_data_transfer_time(uint32_t memory_node, struct starpu_task *task);
 double starpu_task_expected_data_transfer_time(uint32_t memory_node, struct starpu_task *task);
-/* Predict the transfer time (in µs) to move a handle to a memory node */
+/* Predict the transfer time (in µs) to move a handle to a memory node */
 double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned memory_node, enum starpu_access_mode mode);
 double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned memory_node, enum starpu_access_mode mode);
 /* Returns expected power consumption in J */
 /* Returns expected power consumption in J */
 double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
 double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
@@ -288,7 +288,7 @@ double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_arc
 double starpu_task_expected_conversion_time(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
 double starpu_task_expected_conversion_time(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
 /* Return the expected duration of the entire task bundle in µs. */
 /* Return the expected duration of the entire task bundle in µs. */
 double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum starpu_perf_archtype arch, unsigned nimpl);
 double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum starpu_perf_archtype arch, unsigned nimpl);
-/* Return the time (in µs) expected to transfer all data used within the bundle */
+/* Return the time (in µs) expected to transfer all data used within the bundle */
 double starpu_task_bundle_expected_data_transfer_time(starpu_task_bundle_t bundle, unsigned memory_node);
 double starpu_task_bundle_expected_data_transfer_time(starpu_task_bundle_t bundle, unsigned memory_node);
 /* Return the expected power consumption of the entire task bundle in J. */
 /* Return the expected power consumption of the entire task bundle in J. */
 double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, enum starpu_perf_archtype arch, unsigned nimpl);
 double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, enum starpu_perf_archtype arch, unsigned nimpl);

+ 3 - 3
include/starpu_util.h

@@ -70,14 +70,14 @@ extern "C"
 
 
 #ifdef STARPU_NO_ASSERT
 #ifdef STARPU_NO_ASSERT
 #define STARPU_ASSERT(x)		do { (void) (x);} while(0)
 #define STARPU_ASSERT(x)		do { (void) (x);} while(0)
-#define STARPU_ASSERT_MSG(x, msg)	do { (void) (x);} while(0)
+#define STARPU_ASSERT_MSG(x, msg, ...)	do { (void) (x);} while(0)
 #else
 #else
 #  if defined(__CUDACC__) && defined(STARPU_HAVE_WINDOWS)
 #  if defined(__CUDACC__) && defined(STARPU_HAVE_WINDOWS)
 #    define STARPU_ASSERT(x)		do { if (STARPU_UNLIKELY(!(x))) *(int*)NULL = 0; } while(0)
 #    define STARPU_ASSERT(x)		do { if (STARPU_UNLIKELY(!(x))) *(int*)NULL = 0; } while(0)
-#    define STARPU_ASSERT_MSG(x, msg)	do { if (STARPU_UNLIKELY(!(x))) { fprintf(stderr, "[starpu][%s][assert failure] %s\n", __func__, msg); *(int*)NULL = 0; }} while(0)
+#    define STARPU_ASSERT_MSG(x, msg, ...)	do { if (STARPU_UNLIKELY(!(x))) { fprintf(stderr, "[starpu][%s][assert failure] " msg "\n", __func__, ## __VA_ARGS__); *(int*)NULL = 0; }} while(0)
 #  else
 #  else
 #    define STARPU_ASSERT(x)		assert(x)
 #    define STARPU_ASSERT(x)		assert(x)
-#    define STARPU_ASSERT_MSG(x, msg)	do { if (STARPU_UNLIKELY(!(x))) { fprintf(stderr, "[starpu][%s][assert failure] %s\n", __func__, msg); } ; assert(x); } while(0)
+#    define STARPU_ASSERT_MSG(x, msg, ...)	do { if (STARPU_UNLIKELY(!(x))) { fprintf(stderr, "[starpu][%s][assert failure] " msg "\n", __func__, ## __VA_ARGS__); } ; assert(x); } while(0)
 
 
 #  endif
 #  endif
 #endif
 #endif

+ 21 - 9
mpi/examples/Makefile.am

@@ -17,16 +17,28 @@
 CC=$(MPICC)
 CC=$(MPICC)
 CCLD=$(MPICC)
 CCLD=$(MPICC)
 
 
-if STARPU_MPI_CHECK
+if STARPU_HAVE_WINDOWS
+LOADER_BIN		=
+else
+loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+LOADER			=	loader
+LOADER_BIN		=	$(abs_top_builddir)/mpi/tests/$(LOADER)
+loader_SOURCES		=	../../tests/loader.c
+endif
+
 if STARPU_HAVE_AM111
 if STARPU_HAVE_AM111
-LOG_COMPILER	 	=	$(MPIEXEC) -np 2
+TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
+LOG_COMPILER	 	=	$(MPIEXEC) -np 2 $(LOADER_BIN)
 else
 else
-TESTS_ENVIRONMENT 	=	$(MPIEXEC) -np 2
+TESTS_ENVIRONMENT 	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPIEXEC) -np 4
 endif
 endif
-TESTS			=	$(check_PROGRAMS)
+
+if STARPU_MPI_CHECK
+TESTS			=	$(starpu_mpi_EXAMPLES)
 endif
 endif
 
 
-check_PROGRAMS =
+check_PROGRAMS = $(LOADER) $(starpu_mpi_EXAMPLES)
+starpu_mpi_EXAMPLES =
 
 
 BUILT_SOURCES =
 BUILT_SOURCES =
 
 
@@ -76,7 +88,7 @@ examplebin_PROGRAMS +=				\
 stencil_stencil5_LDADD =		\
 stencil_stencil5_LDADD =		\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 
 
-check_PROGRAMS	+=	\
+starpu_mpi_EXAMPLES	+=	\
 	stencil/stencil5
 	stencil/stencil5
 
 
 ##################
 ##################
@@ -145,7 +157,7 @@ cholesky_mpi_cholesky_distributed_LDADD =	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	$(STARPU_BLAS_LDFLAGS)
 	$(STARPU_BLAS_LDFLAGS)
 
 
-check_PROGRAMS +=					\
+starpu_mpi_EXAMPLES +=				\
 	cholesky/mpi_cholesky			\
 	cholesky/mpi_cholesky			\
 	cholesky/mpi_cholesky_distributed
 	cholesky/mpi_cholesky_distributed
 endif
 endif
@@ -154,7 +166,7 @@ endif
 # complex example #
 # complex example #
 ###################
 ###################
 
 
-examplebin_PROGRAMS +=				\
+examplebin_PROGRAMS +=			\
 	complex/mpi_complex
 	complex/mpi_complex
 
 
 complex_mpi_complex_SOURCES =		\
 complex_mpi_complex_SOURCES =		\
@@ -164,7 +176,7 @@ complex_mpi_complex_SOURCES =		\
 complex_mpi_complex_LDADD =		\
 complex_mpi_complex_LDADD =		\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 
 
-check_PROGRAMS	+=	\
+starpu_mpi_EXAMPLES	+=			\
 	complex/mpi_complex
 	complex/mpi_complex
 endif
 endif
 
 

+ 14 - 14
mpi/examples/cholesky/mpi_cholesky.c

@@ -43,7 +43,7 @@ int main(int argc, char **argv)
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
-	starpu_mpi_init(&argc, &argv);
+	starpu_mpi_init(&argc, &argv, 1);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
 	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
 
 
@@ -51,18 +51,18 @@ int main(int argc, char **argv)
 
 
 	if (dblockx == -1 || dblocky == -1)
 	if (dblockx == -1 || dblocky == -1)
 	{
 	{
-	     int factor;
-	     dblockx = nodes;
-	     dblocky = 1;
-	     for(factor=sqrt(nodes) ; factor>1 ; factor--)
-	     {
-		  if (nodes % factor == 0)
-		  {
-		       dblockx = nodes/factor;
-		       dblocky = factor;
-		       break;
-		  }
-	     }
+		int factor;
+		dblockx = nodes;
+		dblocky = 1;
+		for(factor=sqrt(nodes) ; factor>1 ; factor--)
+		{
+			if (nodes % factor == 0)
+			{
+				dblockx = nodes/factor;
+				dblocky = factor;
+				break;
+			}
+		}
 	}
 	}
 
 
 	unsigned i,j,x,y;
 	unsigned i,j,x,y;
@@ -198,7 +198,7 @@ int main(int argc, char **argv)
 	}
 	}
 
 
 	int correctness = 1;
 	int correctness = 1;
-	for(x = 0; x < nblocks ;  x++)
+	for(x = 0; x < nblocks ; x++)
 	{
 	{
 		for (y = 0; y < nblocks; y++)
 		for (y = 0; y < nblocks; y++)
 		{
 		{

+ 7 - 7
mpi/examples/cholesky/mpi_cholesky.h

@@ -57,31 +57,31 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 	{
 	{
 		if (strcmp(argv[i], "-size") == 0)
 		if (strcmp(argv[i], "-size") == 0)
 		{
 		{
-		        char *argptr;
+			char *argptr;
 			size = strtol(argv[++i], &argptr, 10);
 			size = strtol(argv[++i], &argptr, 10);
 		}
 		}
 
 
 		if (strcmp(argv[i], "-dblockx") == 0)
 		if (strcmp(argv[i], "-dblockx") == 0)
 		{
 		{
-		        char *argptr;
+			char *argptr;
 			dblockx = strtol(argv[++i], &argptr, 10);
 			dblockx = strtol(argv[++i], &argptr, 10);
 		}
 		}
-		
+
 		if (strcmp(argv[i], "-dblocky") == 0)
 		if (strcmp(argv[i], "-dblocky") == 0)
 		{
 		{
-		        char *argptr;
+			char *argptr;
 			dblocky = strtol(argv[++i], &argptr, 10);
 			dblocky = strtol(argv[++i], &argptr, 10);
 		}
 		}
-	
+
 		if (strcmp(argv[i], "-nblocks") == 0)
 		if (strcmp(argv[i], "-nblocks") == 0)
 		{
 		{
-		        char *argptr;
+			char *argptr;
 			nblocks = strtol(argv[++i], &argptr, 10);
 			nblocks = strtol(argv[++i], &argptr, 10);
 		}
 		}
 
 
 		if (strcmp(argv[i], "-nbigblocks") == 0)
 		if (strcmp(argv[i], "-nbigblocks") == 0)
 		{
 		{
-		        char *argptr;
+			char *argptr;
 			nbigblocks = strtol(argv[++i], &argptr, 10);
 			nbigblocks = strtol(argv[++i], &argptr, 10);
 		}
 		}
 
 

+ 2 - 2
mpi/examples/cholesky/mpi_cholesky_codelets.c

@@ -79,7 +79,7 @@ void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblocks, in
 	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
 	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
 	for(x=0 ; x<nblocks ; x++) data_handles[x] = malloc(nblocks*sizeof(starpu_data_handle_t));
 	for(x=0 ; x<nblocks ; x++) data_handles[x] = malloc(nblocks*sizeof(starpu_data_handle_t));
 
 
-	for(x = 0; x < nblocks ;  x++)
+	for(x = 0; x < nblocks ; x++)
 	{
 	{
 		for (y = 0; y < nblocks; y++)
 		for (y = 0; y < nblocks; y++)
 		{
 		{
@@ -148,7 +148,7 @@ void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblocks, in
 
 
 	starpu_task_wait_for_all();
 	starpu_task_wait_for_all();
 
 
-	for(x = 0; x < nblocks ;  x++)
+	for(x = 0; x < nblocks ; x++)
 	{
 	{
 		for (y = 0; y < nblocks; y++)
 		for (y = 0; y < nblocks; y++)
 		{
 		{

+ 13 - 13
mpi/examples/cholesky/mpi_cholesky_distributed.c

@@ -42,7 +42,7 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(&argc, &argv);
+	ret = starpu_mpi_init(&argc, &argv, 1);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
 	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
@@ -50,18 +50,18 @@ int main(int argc, char **argv)
 
 
 	if (dblockx == -1 || dblocky == -1)
 	if (dblockx == -1 || dblocky == -1)
 	{
 	{
-	     int factor;
-	     dblockx = nodes;
-	     dblocky = 1;
-	     for(factor=sqrt(nodes) ; factor>1 ; factor--)
-	     {
-		  if (nodes % factor == 0)
-		  {
-		       dblockx = nodes/factor;
-		       dblocky = factor;
-		       break;
-		  }
-	     }
+		int factor;
+		dblockx = nodes;
+		dblocky = 1;
+		for(factor=sqrt(nodes) ; factor>1 ; factor--)
+		{
+			if (nodes % factor == 0)
+			{
+				dblockx = nodes/factor;
+				dblocky = factor;
+				break;
+			}
+		}
 	}
 	}
 
 
 	unsigned i,j,x,y;
 	unsigned i,j,x,y;

+ 1 - 1
mpi/examples/cholesky/mpi_cholesky_kernels.c

@@ -29,7 +29,7 @@
 #endif
 #endif
 
 
 /*
 /*
- *   U22
+ * U22
  */
  */
 
 
 static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __attribute__((unused)) void *_args)
 static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __attribute__((unused)) void *_args)

+ 44 - 10
mpi/examples/complex/mpi_complex.c

@@ -18,14 +18,28 @@
 #include <interface/complex_interface.h>
 #include <interface/complex_interface.h>
 #include <interface/complex_codelet.h>
 #include <interface/complex_codelet.h>
 
 
+void display_foo_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	int *foo = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	fprintf(stderr, "foo = %d\n", *foo);
+}
+
+struct starpu_codelet foo_display =
+{
+	.cpu_funcs = {display_foo_codelet, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_R}
+};
+
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
 	int rank, nodes;
 	int rank, nodes;
 	int ret;
 	int ret;
+	int compare;
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(&argc, &argv);
+	ret = starpu_mpi_init(&argc, &argv, 1);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
 	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
@@ -46,33 +60,53 @@ int main(int argc, char **argv)
 			double real2[2] = {14.0, 12.0};
 			double real2[2] = {14.0, 12.0};
 			double imaginary2[2] = {17.0, 19.0};
 			double imaginary2[2] = {17.0, 19.0};
 			starpu_data_handle_t handle2;
 			starpu_data_handle_t handle2;
-			MPI_Status status;
+
+			int *compare_ptr = &compare;
 
 
 			starpu_complex_data_register(&handle, 0, real, imaginary, 2);
 			starpu_complex_data_register(&handle, 0, real, imaginary, 2);
+			starpu_complex_data_register(&handle2, -1, real2, imaginary2, 2);
+
 			starpu_insert_task(&cl_display, STARPU_R, handle, 0);
 			starpu_insert_task(&cl_display, STARPU_R, handle, 0);
-			starpu_mpi_send(handle, 1, 10, MPI_COMM_WORLD);
+			starpu_mpi_isend_detached(handle, 1, 10, MPI_COMM_WORLD, NULL, NULL);
+			starpu_mpi_irecv_detached(handle2, 1, 20, MPI_COMM_WORLD, NULL, NULL);
 
 
-			starpu_complex_data_register(&handle2, -1, real2, imaginary2, 2);
-			starpu_mpi_recv(handle2, 1, 11, MPI_COMM_WORLD, &status);
 			starpu_insert_task(&cl_display, STARPU_R, handle2, 0);
 			starpu_insert_task(&cl_display, STARPU_R, handle2, 0);
-			starpu_insert_task(&cl_compare, STARPU_R, handle, STARPU_R, handle2, 0);
+			starpu_insert_task(&cl_compare, STARPU_R, handle, STARPU_R, handle2, STARPU_VALUE, &compare_ptr, sizeof(compare_ptr), 0);
+
+			{
+				// We send a dummy variable only to check communication with predefined datatypes
+				int foo=12;
+				starpu_data_handle_t foo_handle;
+				starpu_variable_data_register(&foo_handle, 0, (uintptr_t)&foo, sizeof(foo));
+				starpu_mpi_isend_detached(foo_handle, 1, 40, MPI_COMM_WORLD, NULL, NULL);
+				starpu_insert_task(&foo_display, STARPU_R, foo_handle, 0);
+			}
 		}
 		}
 		else if (rank == 1)
 		else if (rank == 1)
 		{
 		{
 			double real[2] = {0.0, 0.0};
 			double real[2] = {0.0, 0.0};
 			double imaginary[2] = {0.0, 0.0};
 			double imaginary[2] = {0.0, 0.0};
 			starpu_data_handle_t handle;
 			starpu_data_handle_t handle;
-			MPI_Status status;
 
 
 			starpu_complex_data_register(&handle, 0, real, imaginary, 2);
 			starpu_complex_data_register(&handle, 0, real, imaginary, 2);
-			starpu_mpi_recv(handle, 0, 10, MPI_COMM_WORLD, &status);
+			starpu_mpi_irecv_detached(handle, 0, 10, MPI_COMM_WORLD, NULL, NULL);
 			starpu_insert_task(&cl_display, STARPU_R, handle, 0);
 			starpu_insert_task(&cl_display, STARPU_R, handle, 0);
-			starpu_mpi_send(handle, 0, 11, MPI_COMM_WORLD);
+			starpu_mpi_isend_detached(handle, 0, 20, MPI_COMM_WORLD, NULL, NULL);
+
+			{
+				// We send a dummy variable only to check communication with predefined datatypes
+				int foo=12;
+				starpu_data_handle_t foo_handle;
+				starpu_variable_data_register(&foo_handle, -1, (uintptr_t)NULL, sizeof(foo));
+				starpu_mpi_irecv_detached(foo_handle, 0, 40, MPI_COMM_WORLD, NULL, NULL);
+				starpu_insert_task(&foo_display, STARPU_R, foo_handle, 0);
+			}
+
 		}
 		}
 	}
 	}
 	starpu_task_wait_for_all();
 	starpu_task_wait_for_all();
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
 
 
-	return ret;
+	if (rank == 0) return !compare; else return ret;
 }
 }

+ 13 - 13
mpi/examples/mpi_lu/plu_example.c

@@ -301,7 +301,7 @@ static void init_matrix(int rank)
 		allocated_memory_extra += 2*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
 		allocated_memory_extra += 2*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
 	}
 	}
 #endif
 #endif
-	
+
 	for (k = 0; k < nblocks; k++)
 	for (k = 0; k < nblocks; k++)
 	{
 	{
 #ifdef SINGLE_TMP1221
 #ifdef SINGLE_TMP1221
@@ -333,7 +333,7 @@ static void init_matrix(int rank)
 			starpu_malloc((void **)&tmp_12_block[i][k], blocksize);
 			starpu_malloc((void **)&tmp_12_block[i][k], blocksize);
 			allocated_memory_extra += blocksize;
 			allocated_memory_extra += blocksize;
 			STARPU_ASSERT(tmp_12_block[i][k]);
 			STARPU_ASSERT(tmp_12_block[i][k]);
-	
+
 			starpu_matrix_data_register(&tmp_12_block_handles[i][k], 0,
 			starpu_matrix_data_register(&tmp_12_block_handles[i][k], 0,
 				(uintptr_t)tmp_12_block[i][k],
 				(uintptr_t)tmp_12_block[i][k],
 				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
 				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
@@ -344,7 +344,7 @@ static void init_matrix(int rank)
 			starpu_malloc((void **)&tmp_21_block[i][k], blocksize);
 			starpu_malloc((void **)&tmp_21_block[i][k], blocksize);
 			allocated_memory_extra += blocksize;
 			allocated_memory_extra += blocksize;
 			STARPU_ASSERT(tmp_21_block[i][k]);
 			STARPU_ASSERT(tmp_21_block[i][k]);
-	
+
 			starpu_matrix_data_register(&tmp_21_block_handles[i][k], 0,
 			starpu_matrix_data_register(&tmp_21_block_handles[i][k], 0,
 				(uintptr_t)tmp_21_block[i][k],
 				(uintptr_t)tmp_21_block[i][k],
 				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
 				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
@@ -381,7 +381,7 @@ static void display_grid(int rank, unsigned nblocks)
 	//if (rank == 0)
 	//if (rank == 0)
 	{
 	{
 		fprintf(stderr, "2D grid layout (Rank %d): \n", rank);
 		fprintf(stderr, "2D grid layout (Rank %d): \n", rank);
-		
+
 		unsigned i, j;
 		unsigned i, j;
 		for (j = 0; j < nblocks; j++)
 		for (j = 0; j < nblocks; j++)
 		{
 		{
@@ -428,7 +428,7 @@ int main(int argc, char **argv)
 	/* We disable sequential consistency in this example */
 	/* We disable sequential consistency in this example */
 	starpu_data_set_default_sequential_consistency_flag(0);
 	starpu_data_set_default_sequential_consistency_flag(0);
 
 
-	starpu_mpi_init(NULL, NULL);
+	starpu_mpi_init(NULL, NULL, 0);
 
 
 	STARPU_ASSERT(p*q == world_size);
 	STARPU_ASSERT(p*q == world_size);
 
 
@@ -534,7 +534,7 @@ int main(int argc, char **argv)
 
 
 		y2 = calloc(size, sizeof(TYPE));
 		y2 = calloc(size, sizeof(TYPE));
 		STARPU_ASSERT(y);
 		STARPU_ASSERT(y);
-		
+
 		if (rank == 0)
 		if (rank == 0)
 		{
 		{
 			for (ind = 0; ind < size; ind++)
 			for (ind = 0; ind < size; ind++)
@@ -546,13 +546,13 @@ int main(int argc, char **argv)
 		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);
 		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);
 
 
 		/* Compute y2 = y2 - y */
 		/* Compute y2 = y2 - y */
-	        CPU_AXPY(size, -1.0, y, 1, y2, 1);
-	
-	        TYPE err = CPU_ASUM(size, y2, 1);
-	        int max = CPU_IAMAX(size, y2, 1);
-	
-	        fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size));
-	        fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]);
+		CPU_AXPY(size, -1.0, y, 1, y2, 1);
+
+		TYPE err = CPU_ASUM(size, y2, 1);
+		int max = CPU_IAMAX(size, y2, 1);
+
+		fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size));
+		fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]);
 #endif
 #endif
 	}
 	}
 
 

+ 54 - 55
mpi/examples/mpi_lu/plu_solve.c

@@ -25,19 +25,19 @@
 
 
 static double frobenius_norm(TYPE *v, unsigned n)
 static double frobenius_norm(TYPE *v, unsigned n)
 {
 {
-        double sum2 = 0.0;
+	double sum2 = 0.0;
 
 
-        /* compute sqrt(Sum(|x|^2)) */
+	/* compute sqrt(Sum(|x|^2)) */
 
 
-        unsigned i,j;
-        for (j = 0; j < n; j++)
-        for (i = 0; i < n; i++)
-        {
-                double a = fabsl((double)v[i+n*j]);
-                sum2 += a*a;
-        }
+	unsigned i,j;
+	for (j = 0; j < n; j++)
+		for (i = 0; i < n; i++)
+		{
+			double a = fabsl((double)v[i+n*j]);
+			sum2 += a*a;
+		}
 
 
-        return sqrt(sum2);
+	return sqrt(sum2);
 }
 }
 
 
 void STARPU_PLU(display_data_content)(TYPE *data, unsigned blocksize)
 void STARPU_PLU(display_data_content)(TYPE *data, unsigned blocksize)
@@ -105,9 +105,9 @@ static void STARPU_PLU(compute_ax_block_upper)(unsigned size, unsigned nblocks,
 	/* Take a copy of the upper part of the diagonal block */
 	/* Take a copy of the upper part of the diagonal block */
 	TYPE *upper_block_copy = calloc((block_size)*(block_size), sizeof(TYPE));
 	TYPE *upper_block_copy = calloc((block_size)*(block_size), sizeof(TYPE));
 	STARPU_PLU(extract_upper)(block_size, block_data, upper_block_copy);
 	STARPU_PLU(extract_upper)(block_size, block_data, upper_block_copy);
-		
+
 	STARPU_PLU(compute_ax_block)(block_size, upper_block_copy, sub_x, sub_y);
 	STARPU_PLU(compute_ax_block)(block_size, upper_block_copy, sub_x, sub_y);
-	
+
 	free(upper_block_copy);
 	free(upper_block_copy);
 }
 }
 
 
@@ -121,7 +121,7 @@ static void STARPU_PLU(compute_ax_block_lower)(unsigned size, unsigned nblocks,
 	STARPU_PLU(extract_lower)(block_size, block_data, lower_block_copy);
 	STARPU_PLU(extract_lower)(block_size, block_data, lower_block_copy);
 
 
 	STARPU_PLU(compute_ax_block)(size/nblocks, lower_block_copy, sub_x, sub_y);
 	STARPU_PLU(compute_ax_block)(size/nblocks, lower_block_copy, sub_x, sub_y);
-	
+
 	free(lower_block_copy);
 	free(lower_block_copy);
 }
 }
 
 
@@ -242,7 +242,7 @@ TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks)
 		TYPE *block;
 		TYPE *block;
 
 
 		int block_rank = get_block_rank(bi, bj);
 		int block_rank = get_block_rank(bi, bj);
-		
+
 		if (block_rank == 0)
 		if (block_rank == 0)
 		{
 		{
 			block = STARPU_PLU(get_block)(bi, bj);
 			block = STARPU_PLU(get_block)(bi, bj);
@@ -335,60 +335,59 @@ void STARPU_PLU(compute_lu_matrix)(unsigned size, unsigned nblocks, TYPE *Asaved
 
 
 	if (rank == 0)
 	if (rank == 0)
 	{
 	{
-	        TYPE *L = malloc((size_t)size*size*sizeof(TYPE));
-	        TYPE *U = malloc((size_t)size*size*sizeof(TYPE));
-	
-	        memset(L, 0, size*size*sizeof(TYPE));
-	        memset(U, 0, size*size*sizeof(TYPE));
-	
-	        /* only keep the lower part */
+		TYPE *L = malloc((size_t)size*size*sizeof(TYPE));
+		TYPE *U = malloc((size_t)size*size*sizeof(TYPE));
+
+		memset(L, 0, size*size*sizeof(TYPE));
+		memset(U, 0, size*size*sizeof(TYPE));
+
+		/* only keep the lower part */
 		unsigned i, j;
 		unsigned i, j;
-	        for (j = 0; j < size; j++)
-	        {
-	                for (i = 0; i < j; i++)
-	                {
-	                        L[j+i*size] = all_r[j+i*size];
-	                }
-	
-	                /* diag i = j */
-	                L[j+j*size] = all_r[j+j*size];
-	                U[j+j*size] = 1.0;
-	
-	                for (i = j+1; i < size; i++)
-	                {
-	                        U[j+i*size] = all_r[j+i*size];
-	                }
-	        }
-	
+		for (j = 0; j < size; j++)
+		{
+			for (i = 0; i < j; i++)
+			{
+				L[j+i*size] = all_r[j+i*size];
+			}
+
+			/* diag i = j */
+			L[j+j*size] = all_r[j+j*size];
+			U[j+j*size] = 1.0;
+
+			for (i = j+1; i < size; i++)
+			{
+				U[j+i*size] = all_r[j+i*size];
+			}
+		}
+
 		STARPU_PLU(display_data_content)(L, size);
 		STARPU_PLU(display_data_content)(L, size);
 		STARPU_PLU(display_data_content)(U, size);
 		STARPU_PLU(display_data_content)(U, size);
-	
-	        /* now A_err = L, compute L*U */
-	        CPU_TRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
-	
+
+		/* now A_err = L, compute L*U */
+		CPU_TRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
+
 		if (display)
 		if (display)
 			fprintf(stderr, "\nLU\n");
 			fprintf(stderr, "\nLU\n");
 
 
 		STARPU_PLU(display_data_content)(L, size);
 		STARPU_PLU(display_data_content)(L, size);
-	
-	        /* compute "LU - A" in L*/
-	        CPU_AXPY(size*size, -1.0, Asaved, 1, L, 1);
-	
-	        TYPE err = CPU_ASUM(size*size, L, 1);
-	        int max = CPU_IAMAX(size*size, L, 1);
-	
+
+		/* compute "LU - A" in L*/
+		CPU_AXPY(size*size, -1.0, Asaved, 1, L, 1);
+
+		TYPE err = CPU_ASUM(size*size, L, 1);
+		int max = CPU_IAMAX(size*size, L, 1);
+
 		if (display)
 		if (display)
 			fprintf(stderr, "DISPLAY ERROR\n");
 			fprintf(stderr, "DISPLAY ERROR\n");
 
 
 		STARPU_PLU(display_data_content)(L, size);
 		STARPU_PLU(display_data_content)(L, size);
-	
-	        fprintf(stderr, "(A - LU) Avg error : %e\n", err/(size*size));
-	        fprintf(stderr, "(A - LU) Max error : %e\n", L[max]);
-	
+
+		fprintf(stderr, "(A - LU) Avg error : %e\n", err/(size*size));
+		fprintf(stderr, "(A - LU) Max error : %e\n", L[max]);
+
 		double residual = frobenius_norm(L, size);
 		double residual = frobenius_norm(L, size);
 		double matnorm = frobenius_norm(Asaved, size);
 		double matnorm = frobenius_norm(Asaved, size);
-	
+
 		fprintf(stderr, "||A-LU|| / (||A||*N) : %e\n", residual/(matnorm*size));
 		fprintf(stderr, "||A-LU|| / (||A||*N) : %e\n", residual/(matnorm*size));
 	}
 	}
 }
 }
-

+ 1 - 1
mpi/examples/mpi_lu/pxlu.c

@@ -736,7 +736,7 @@ static void create_task_22_real(unsigned k, unsigned i, unsigned j)
 	STARPU_ASSERT(task->handles[1] != STARPU_POISON_PTR);
 	STARPU_ASSERT(task->handles[1] != STARPU_POISON_PTR);
 	STARPU_ASSERT(task->handles[2] != STARPU_POISON_PTR);
 	STARPU_ASSERT(task->handles[2] != STARPU_POISON_PTR);
 
 
-	if (!no_prio &&  (i == k + 1) && (j == k +1) ) {
+	if (!no_prio && (i == k + 1) && (j == k +1) ) {
 		task->priority = STARPU_MAX_PRIO;
 		task->priority = STARPU_MAX_PRIO;
 	}
 	}
 
 

+ 12 - 14
mpi/examples/mpi_lu/pxlu_kernels.c

@@ -22,7 +22,7 @@
 ///#define VERBOSE_KERNELS	1
 ///#define VERBOSE_KERNELS	1
 
 
 /*
 /*
- *   U22 
+ * U22
  */
  */
 
 
 static inline void STARPU_PLU(common_u22)(void *descr[],
 static inline void STARPU_PLU(common_u22)(void *descr[],
@@ -55,7 +55,7 @@ static inline void STARPU_PLU(common_u22)(void *descr[],
 
 
 	switch (s) {
 	switch (s) {
 		case 0:
 		case 0:
-			CPU_GEMM("N", "N", dy, dx, dz, 
+			CPU_GEMM("N", "N", dy, dx, dz,
 				(TYPE)-1.0, right, ld21, left, ld12,
 				(TYPE)-1.0, right, ld21, left, ld12,
 				(TYPE)1.0, center, ld22);
 				(TYPE)1.0, center, ld22);
 			break;
 			break;
@@ -129,7 +129,7 @@ static inline void STARPU_PLU(common_u12)(void *descr[],
 	TYPE *sub11;
 	TYPE *sub11;
 	TYPE *sub12;
 	TYPE *sub12;
 
 
-	sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);	
+	sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
 	sub12 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
 	sub12 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
 
 
 	unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
 	unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
@@ -227,7 +227,7 @@ struct starpu_codelet STARPU_PLU(cl12) = {
 };
 };
 
 
 
 
-/* 
+/*
  * U21
  * U21
  */
  */
 
 
@@ -245,7 +245,7 @@ static inline void STARPU_PLU(common_u21)(void *descr[],
 
 
 	unsigned nx21 = STARPU_MATRIX_GET_NX(descr[1]);
 	unsigned nx21 = STARPU_MATRIX_GET_NX(descr[1]);
 	unsigned ny21 = STARPU_MATRIX_GET_NY(descr[1]);
 	unsigned ny21 = STARPU_MATRIX_GET_NY(descr[1]);
-	
+
 #ifdef VERBOSE_KERNELS
 #ifdef VERBOSE_KERNELS
 	struct debug_info *info = _args;
 	struct debug_info *info = _args;
 
 
@@ -311,7 +311,7 @@ static void STARPU_PLU(cublas_u21)(void *descr[], void *_args)
 {
 {
 	STARPU_PLU(common_u21)(descr, 1, _args);
 	STARPU_PLU(common_u21)(descr, 1, _args);
 }
 }
-#endif 
+#endif
 
 
 static struct starpu_perfmodel STARPU_PLU(model_21) = {
 static struct starpu_perfmodel STARPU_PLU(model_21) = {
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
@@ -345,7 +345,7 @@ static inline void STARPU_PLU(common_u11)(void *descr[],
 {
 {
 	TYPE *sub11;
 	TYPE *sub11;
 
 
-	sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]); 
+	sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
 
 
 	unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]);
 	unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]);
 	unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]);
 	unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]);
@@ -367,9 +367,9 @@ static inline void STARPU_PLU(common_u11)(void *descr[],
 				TYPE pivot;
 				TYPE pivot;
 				pivot = sub11[z+z*ld];
 				pivot = sub11[z+z*ld];
 				STARPU_ASSERT(pivot != 0.0);
 				STARPU_ASSERT(pivot != 0.0);
-		
+
 				CPU_SCAL(nx - z - 1, (1.0/pivot), &sub11[z+(z+1)*ld], ld);
 				CPU_SCAL(nx - z - 1, (1.0/pivot), &sub11[z+(z+1)*ld], ld);
-		
+
 				CPU_GER(nx - z - 1, nx - z - 1, -1.0,
 				CPU_GER(nx - z - 1, nx - z - 1, -1.0,
 						&sub11[(z+1)+z*ld], 1,
 						&sub11[(z+1)+z*ld], 1,
 						&sub11[z+(z+1)*ld], ld,
 						&sub11[z+(z+1)*ld], ld,
@@ -385,15 +385,15 @@ static inline void STARPU_PLU(common_u11)(void *descr[],
 				cudaStreamSynchronize(starpu_cuda_get_local_stream());
 				cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 
 				STARPU_ASSERT(pivot != 0.0);
 				STARPU_ASSERT(pivot != 0.0);
-				
+
 				CUBLAS_SCAL(nx - z - 1, 1.0/pivot, &sub11[z+(z+1)*ld], ld);
 				CUBLAS_SCAL(nx - z - 1, 1.0/pivot, &sub11[z+(z+1)*ld], ld);
-				
+
 				CUBLAS_GER(nx - z - 1, nx - z - 1, -1.0,
 				CUBLAS_GER(nx - z - 1, nx - z - 1, -1.0,
 						&sub11[(z+1)+z*ld], 1,
 						&sub11[(z+1)+z*ld], 1,
 						&sub11[z+(z+1)*ld], ld,
 						&sub11[z+(z+1)*ld], ld,
 						&sub11[(z+1) + (z+1)*ld],ld);
 						&sub11[(z+1) + (z+1)*ld],ld);
 			}
 			}
-			
+
 			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 
 			break;
 			break;
@@ -440,5 +440,3 @@ struct starpu_codelet STARPU_PLU(cl11) = {
 	.modes = {STARPU_RW},
 	.modes = {STARPU_RW},
 	.model = &STARPU_PLU(model_11)
 	.model = &STARPU_PLU(model_11)
 };
 };
-
-

+ 61 - 61
mpi/examples/stencil/stencil5.c

@@ -25,15 +25,15 @@ void stencil5_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 	unsigned *xym1 = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[3]);
 	unsigned *xym1 = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[3]);
 	unsigned *xyp1 = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[4]);
 	unsigned *xyp1 = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[4]);
 
 
-        //        fprintf(stdout, "VALUES: %d %d %d %d %d\n", *xy, *xm1y, *xp1y, *xym1, *xyp1);
-        *xy = (*xy + *xm1y + *xp1y + *xym1 + *xyp1) / 5;
+	//fprintf(stdout, "VALUES: %d %d %d %d %d\n", *xy, *xm1y, *xp1y, *xym1, *xyp1);
+	*xy = (*xy + *xm1y + *xp1y + *xym1 + *xyp1) / 5;
 }
 }
 
 
 struct starpu_codelet stencil5_cl =
 struct starpu_codelet stencil5_cl =
 {
 {
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
 	.cpu_funcs = {stencil5_cpu, NULL},
 	.cpu_funcs = {stencil5_cpu, NULL},
-        .nbuffers = 5,
+	.nbuffers = 5,
 	.modes = {STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
 	.modes = {STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
 };
 };
 
 
@@ -75,92 +75,92 @@ static void parse_args(int argc, char **argv)
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-        int my_rank, size, x, y, loop;
-        int value=0, mean=0;
-        unsigned matrix[X][Y];
-        starpu_data_handle_t data_handles[X][Y];
+	int my_rank, size, x, y, loop;
+	int value=0, mean=0;
+	unsigned matrix[X][Y];
+	starpu_data_handle_t data_handles[X][Y];
 
 
 	int ret = starpu_init(NULL);
 	int ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	starpu_mpi_init(&argc, &argv);
+	starpu_mpi_init(&argc, &argv, 1);
 	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
-        parse_args(argc, argv);
+	parse_args(argc, argv);
 
 
-        for(x = 0; x < X; x++)
+	for(x = 0; x < X; x++)
 	{
 	{
-                for (y = 0; y < Y; y++)
+		for (y = 0; y < Y; y++)
 		{
 		{
-                        matrix[x][y] = (my_rank+1)*10 + value;
-                        value++;
-                        mean += matrix[x][y];
-                }
-        }
-        mean /= value;
-
-        for(x = 0; x < X; x++)
+			matrix[x][y] = (my_rank+1)*10 + value;
+			value++;
+			mean += matrix[x][y];
+		}
+	}
+	mean /= value;
+
+	for(x = 0; x < X; x++)
 	{
 	{
-                for (y = 0; y < Y; y++)
+		for (y = 0; y < Y; y++)
 		{
 		{
-                        int mpi_rank = my_distrib(x, y, size);
-                        if (mpi_rank == my_rank)
+			int mpi_rank = my_distrib(x, y, size);
+			if (mpi_rank == my_rank)
 			{
 			{
-                                //fprintf(stderr, "[%d] Owning data[%d][%d]\n", my_rank, x, y);
-                                starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
-                        }
+				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", my_rank, x, y);
+				starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
+			}
 			else if (my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
 			else if (my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
-			      || my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size))
+				 || my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size))
 			{
 			{
-                                /* I don't own that index, but will need it for my computations */
-                                //fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", my_rank, x, y);
-                                starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
-                        }
-                        else
+				/* I don't own that index, but will need it for my computations */
+				//fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", my_rank, x, y);
+				starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
+			}
+			else
 			{
 			{
-                                /* I know it's useless to allocate anything for this */
-                                data_handles[x][y] = NULL;
-                        }
-                        if (data_handles[x][y])
+				/* I know it's useless to allocate anything for this */
+				data_handles[x][y] = NULL;
+			}
+			if (data_handles[x][y])
 			{
 			{
-                                starpu_data_set_rank(data_handles[x][y], mpi_rank);
-                                starpu_data_set_tag(data_handles[x][y], (y*X)+x);
+				starpu_data_set_rank(data_handles[x][y], mpi_rank);
+				starpu_data_set_tag(data_handles[x][y], (y*X)+x);
 			}
 			}
-                }
-        }
+		}
+	}
 
 
-        for(loop=0 ; loop<niter; loop++)
+	for(loop=0 ; loop<niter; loop++)
 	{
 	{
-                for (x = 1; x < X-1; x++)
+		for (x = 1; x < X-1; x++)
 		{
 		{
-                        for (y = 1; y < Y-1; y++)
+			for (y = 1; y < Y-1; y++)
 			{
 			{
-                                starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
-                                                       STARPU_R, data_handles[x-1][y], STARPU_R, data_handles[x+1][y],
-                                                       STARPU_R, data_handles[x][y-1], STARPU_R, data_handles[x][y+1],
-                                                       0);
-                        }
-                }
-        }
-        fprintf(stderr, "Waiting ...\n");
-        starpu_task_wait_for_all();
+				starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
+						       STARPU_R, data_handles[x-1][y], STARPU_R, data_handles[x+1][y],
+						       STARPU_R, data_handles[x][y-1], STARPU_R, data_handles[x][y+1],
+						       0);
+			}
+		}
+	}
+	fprintf(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
 
 
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
 
 
-        if (display)
+	if (display)
 	{
 	{
-                fprintf(stdout, "[%d] mean=%d\n", my_rank, mean);
-                for(x = 0; x < X; x++)
+		fprintf(stdout, "[%d] mean=%d\n", my_rank, mean);
+		for(x = 0; x < X; x++)
 		{
 		{
-                        fprintf(stdout, "[%d] ", my_rank);
-                        for (y = 0; y < Y; y++)
+			fprintf(stdout, "[%d] ", my_rank);
+			for (y = 0; y < Y; y++)
 			{
 			{
-                                fprintf(stdout, "%3u ", matrix[x][y]);
-                        }
-                        fprintf(stdout, "\n");
-                }
-        }
+				fprintf(stdout, "%3u ", matrix[x][y]);
+			}
+			fprintf(stdout, "\n");
+		}
+	}
 
 
 	return 0;
 	return 0;
 }
 }

+ 5 - 1
mpi/include/starpu_mpi.h

@@ -39,8 +39,8 @@ int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int
 int starpu_mpi_wait(starpu_mpi_req *req, MPI_Status *status);
 int starpu_mpi_wait(starpu_mpi_req *req, MPI_Status *status);
 int starpu_mpi_test(starpu_mpi_req *req, int *flag, MPI_Status *status);
 int starpu_mpi_test(starpu_mpi_req *req, int *flag, MPI_Status *status);
 int starpu_mpi_barrier(MPI_Comm comm);
 int starpu_mpi_barrier(MPI_Comm comm);
-int starpu_mpi_init(int *argc, char ***argv);
 
 
+int starpu_mpi_init(int *argc, char ***argv, int initialize_mpi);
 int starpu_mpi_initialize(void) STARPU_DEPRECATED;
 int starpu_mpi_initialize(void) STARPU_DEPRECATED;
 int starpu_mpi_initialize_extended(int *rank, int *world_size) STARPU_DEPRECATED;
 int starpu_mpi_initialize_extended(int *rank, int *world_size) STARPU_DEPRECATED;
 int starpu_mpi_shutdown(void);
 int starpu_mpi_shutdown(void);
@@ -66,6 +66,10 @@ int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_
 
 
 /* retrieve the current amount of communications from the current node */
 /* retrieve the current amount of communications from the current node */
 void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts);
 void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts);
+
+void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle);
+void starpu_mpi_cache_flush_all_data(MPI_Comm comm);
+
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }
 #endif
 #endif

+ 181 - 152
mpi/src/starpu_mpi.c

@@ -38,24 +38,25 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t dat
 							int dest, int mpi_tag, MPI_Comm comm,
 							int dest, int mpi_tag, MPI_Comm comm,
 							unsigned detached, void (*callback)(void *), void *arg);
 							unsigned detached, void (*callback)(void *), void *arg);
 static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, unsigned detached, void (*callback)(void *), void *arg);
 static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, unsigned detached, void (*callback)(void *), void *arg);
+static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req);
 
 
 /* The list of requests that have been newly submitted by the application */
 /* The list of requests that have been newly submitted by the application */
 static struct _starpu_mpi_req_list *new_requests;
 static struct _starpu_mpi_req_list *new_requests;
 
 
 /* The list of detached requests that have already been submitted to MPI */
 /* The list of detached requests that have already been submitted to MPI */
 static struct _starpu_mpi_req_list *detached_requests;
 static struct _starpu_mpi_req_list *detached_requests;
-static pthread_mutex_t detached_requests_mutex;
+static _starpu_pthread_mutex_t detached_requests_mutex;
 
 
 /* Condition to wake up progression thread */
 /* Condition to wake up progression thread */
-static pthread_cond_t cond_progression;
+static _starpu_pthread_cond_t cond_progression;
 /* Condition to wake up waiting for all current MPI requests to finish */
 /* Condition to wake up waiting for all current MPI requests to finish */
-static pthread_cond_t cond_finished;
-static pthread_mutex_t mutex;
+static _starpu_pthread_cond_t cond_finished;
+static _starpu_pthread_mutex_t mutex;
 static pthread_t progress_thread;
 static pthread_t progress_thread;
 static int running = 0;
 static int running = 0;
 
 
 /* Count requests posted by the application and not yet submitted to MPI, i.e pushed into the new_requests list */
 /* Count requests posted by the application and not yet submitted to MPI, i.e pushed into the new_requests list */
-static pthread_mutex_t mutex_posted_requests;
+static _starpu_pthread_mutex_t mutex_posted_requests;
 static int posted_requests = 0, newer_requests, barrier_running = 0;
 static int posted_requests = 0, newer_requests, barrier_running = 0;
 
 
 #define _STARPU_MPI_INC_POSTED_REQUESTS(value) { _STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; _STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); }
 #define _STARPU_MPI_INC_POSTED_REQUESTS(value) { _STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; _STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); }
@@ -74,11 +75,11 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_irecv_common(starpu_data_handle
 							      enum starpu_access_mode mode)
 							      enum starpu_access_mode mode)
 {
 {
 
 
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	struct _starpu_mpi_req *req = calloc(1, sizeof(struct _starpu_mpi_req));
 	struct _starpu_mpi_req *req = calloc(1, sizeof(struct _starpu_mpi_req));
 	STARPU_ASSERT(req);
 	STARPU_ASSERT(req);
 
 
-        _STARPU_MPI_INC_POSTED_REQUESTS(1);
+	_STARPU_MPI_INC_POSTED_REQUESTS(1);
 
 
 	/* Initialize the request structure */
 	/* Initialize the request structure */
 	req->submitted = 0;
 	req->submitted = 0;
@@ -101,10 +102,10 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_irecv_common(starpu_data_handle
 
 
 	/* Asynchronously request StarPU to fetch the data in main memory: when
 	/* Asynchronously request StarPU to fetch the data in main memory: when
 	 * it is available in main memory, _starpu_mpi_submit_new_mpi_request(req) is called and
 	 * it is available in main memory, _starpu_mpi_submit_new_mpi_request(req) is called and
-	 * the request is actually submitted  */
+	 * the request is actually submitted */
 	starpu_data_acquire_cb(data_handle, mode, _starpu_mpi_submit_new_mpi_request, (void *)req);
 	starpu_data_acquire_cb(data_handle, mode, _starpu_mpi_submit_new_mpi_request, (void *)req);
 
 
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 	return req;
 	return req;
 }
 }
 
 
@@ -116,16 +117,16 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_irecv_common(starpu_data_handle
 
 
 static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
 static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 
 
 	STARPU_ASSERT(req->ptr);
 	STARPU_ASSERT(req->ptr);
 
 
-        _STARPU_MPI_DEBUG("post MPI isend tag %d dst %d ptr %p datatype %p count %d req %p\n", req->mpi_tag, req->srcdst, req->ptr, req->datatype, (int)req->count, &req->request);
+	_STARPU_MPI_DEBUG("post MPI isend tag %d dst %d ptr %p datatype %p count %d req %p\n", req->mpi_tag, req->srcdst, req->ptr, req->datatype, (int)req->count, &req->request);
 
 
 	_starpu_mpi_comm_amounts_inc(req->comm, req->srcdst, req->datatype, req->count);
 	_starpu_mpi_comm_amounts_inc(req->comm, req->srcdst, req->datatype, req->count);
 
 
-        req->ret = MPI_Isend(req->ptr, req->count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
-        STARPU_ASSERT(req->ret == MPI_SUCCESS);
+	req->ret = MPI_Isend(req->ptr, req->count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
+	STARPU_ASSERT(req->ret == MPI_SUCCESS);
 
 
 	TRACE_MPI_ISEND(req->srcdst, req->mpi_tag, 0);
 	TRACE_MPI_ISEND(req->srcdst, req->mpi_tag, 0);
 
 
@@ -134,7 +135,10 @@ static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
 	req->submitted = 1;
 	req->submitted = 1;
 	_STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
 	_STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
-        _STARPU_MPI_LOG_OUT();
+
+	_starpu_mpi_handle_detached_request(req);
+
+	_STARPU_MPI_LOG_OUT();
 }
 }
 
 
 static void _starpu_mpi_isend_size_callback(void *arg)
 static void _starpu_mpi_isend_size_callback(void *arg)
@@ -145,8 +149,8 @@ static void _starpu_mpi_isend_size_callback(void *arg)
 
 
 static void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
 static void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
 {
 {
-	req->needs_unpacking = starpu_mpi_handle_to_datatype(req->data_handle, &req->datatype);
-	if (!req->needs_unpacking)
+	_starpu_mpi_handle_allocate_datatype(req->data_handle, &req->datatype, &req->user_datatype);
+	if (req->user_datatype == 0)
 	{
 	{
 		req->count = 1;
 		req->count = 1;
 		req->ptr = starpu_handle_get_local_ptr(req->data_handle);
 		req->ptr = starpu_handle_get_local_ptr(req->data_handle);
@@ -172,7 +176,7 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t dat
 
 
 int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, int mpi_tag, MPI_Comm comm)
 int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, int mpi_tag, MPI_Comm comm)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	STARPU_ASSERT(public_req);
 	STARPU_ASSERT(public_req);
 
 
 	struct _starpu_mpi_req *req;
 	struct _starpu_mpi_req *req;
@@ -181,17 +185,17 @@ int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *public_re
 	STARPU_ASSERT(req);
 	STARPU_ASSERT(req);
 	*public_req = req;
 	*public_req = req;
 
 
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 	return 0;
 	return 0;
 }
 }
 
 
 int starpu_mpi_isend_detached(starpu_data_handle_t data_handle,
 int starpu_mpi_isend_detached(starpu_data_handle_t data_handle,
 			      int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 			      int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	_starpu_mpi_isend_common(data_handle, dest, mpi_tag, comm, 1, callback, arg);
 	_starpu_mpi_isend_common(data_handle, dest, mpi_tag, comm, 1, callback, arg);
 
 
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 	return 0;
 	return 0;
 }
 }
 
 
@@ -200,13 +204,13 @@ int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI
 	starpu_mpi_req req;
 	starpu_mpi_req req;
 	MPI_Status status;
 	MPI_Status status;
 
 
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	memset(&status, 0, sizeof(MPI_Status));
 	memset(&status, 0, sizeof(MPI_Status));
 
 
 	starpu_mpi_isend(data_handle, &req, dest, mpi_tag, comm);
 	starpu_mpi_isend(data_handle, &req, dest, mpi_tag, comm);
 	starpu_mpi_wait(&req, &status);
 	starpu_mpi_wait(&req, &status);
 
 
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 	return 0;
 	return 0;
 }
 }
 
 
@@ -218,39 +222,52 @@ int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI
 
 
 static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 
 
 	STARPU_ASSERT(req->ptr);
 	STARPU_ASSERT(req->ptr);
 
 
 	_STARPU_MPI_DEBUG("post MPI irecv tag %d src %d data %p ptr %p datatype %p count %d req %p \n", req->mpi_tag, req->srcdst, req->data_handle, req->ptr, req->datatype, (int)req->count, &req->request);
 	_STARPU_MPI_DEBUG("post MPI irecv tag %d src %d data %p ptr %p datatype %p count %d req %p \n", req->mpi_tag, req->srcdst, req->data_handle, req->ptr, req->datatype, (int)req->count, &req->request);
 
 
-        req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
-        STARPU_ASSERT(req->ret == MPI_SUCCESS);
+	req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
+	STARPU_ASSERT(req->ret == MPI_SUCCESS);
 
 
 	/* somebody is perhaps waiting for the MPI request to be posted */
 	/* somebody is perhaps waiting for the MPI request to be posted */
 	_STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
 	_STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
 	req->submitted = 1;
 	req->submitted = 1;
 	_STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
 	_STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
-        _STARPU_MPI_LOG_OUT();
+
+	_starpu_mpi_handle_detached_request(req);
+
+	_STARPU_MPI_LOG_OUT();
 }
 }
 
 
+struct _starpu_mpi_irecv_size_callback
+{
+	starpu_data_handle_t handle;
+	struct _starpu_mpi_req *req;
+};
+
 static void _starpu_mpi_irecv_size_callback(void *arg)
 static void _starpu_mpi_irecv_size_callback(void *arg)
 {
 {
-	struct _starpu_mpi_req *req = (struct _starpu_mpi_req *) arg;
+	struct _starpu_mpi_irecv_size_callback *callback = (struct _starpu_mpi_irecv_size_callback *)arg;
+
+	starpu_data_unregister(callback->handle);
+	callback->req->ptr = malloc(callback->req->count);
 #ifdef STARPU_DEVEL
 #ifdef STARPU_DEVEL
-#  warning TODO: are we sure that req->count can be used as we have not released count_handle?
+#warning TODO: in some cases, callback->req->count is incorrect, we need to fix that
 #endif
 #endif
-	req->ptr = malloc(req->count);
-	_starpu_mpi_irecv_data_func(req);
+	STARPU_ASSERT_MSG(callback->req->ptr, "cannot allocate message of size %ld\n", callback->req->count);
+	_starpu_mpi_irecv_data_func(callback->req);
+	free(callback);
 }
 }
 
 
 static void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req)
 static void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 
 
-	req->needs_unpacking = starpu_mpi_handle_to_datatype(req->data_handle, &req->datatype);
-	if (!req->needs_unpacking)
+	_starpu_mpi_handle_allocate_datatype(req->data_handle, &req->datatype, &req->user_datatype);
+	if (req->user_datatype == 0)
 	{
 	{
 		req->count = 1;
 		req->count = 1;
 		req->ptr = starpu_handle_get_local_ptr(req->data_handle);
 		req->ptr = starpu_handle_get_local_ptr(req->data_handle);
@@ -260,9 +277,11 @@ static void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req)
 	{
 	{
 		starpu_data_handle_t count_handle;
 		starpu_data_handle_t count_handle;
 
 
+		struct _starpu_mpi_irecv_size_callback *callback = malloc(sizeof(struct _starpu_mpi_irecv_size_callback));
 		starpu_variable_data_register(&count_handle, 0, (uintptr_t)&req->count, sizeof(req->count));
 		starpu_variable_data_register(&count_handle, 0, (uintptr_t)&req->count, sizeof(req->count));
-		_starpu_mpi_irecv_common(count_handle, req->srcdst, req->mpi_tag, req->comm, 1, _starpu_mpi_irecv_size_callback, req);
-		starpu_data_unregister_submit(count_handle);
+		callback->handle = count_handle;
+		callback->req = req;
+		_starpu_mpi_irecv_common(count_handle, req->srcdst, req->mpi_tag, req->comm, 1, _starpu_mpi_irecv_size_callback, callback);
 	}
 	}
 }
 }
 
 
@@ -273,7 +292,7 @@ static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t dat
 
 
 int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int source, int mpi_tag, MPI_Comm comm)
 int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int source, int mpi_tag, MPI_Comm comm)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	STARPU_ASSERT(public_req);
 	STARPU_ASSERT(public_req);
 
 
 	struct _starpu_mpi_req *req;
 	struct _starpu_mpi_req *req;
@@ -282,15 +301,15 @@ int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_re
 	STARPU_ASSERT(req);
 	STARPU_ASSERT(req);
 	*public_req = req;
 	*public_req = req;
 
 
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 	return 0;
 	return 0;
 }
 }
 
 
 int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	_starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 1, callback, arg);
 	_starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 1, callback, arg);
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 	return 0;
 	return 0;
 }
 }
 
 
@@ -298,11 +317,11 @@ int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, M
 {
 {
 	starpu_mpi_req req;
 	starpu_mpi_req req;
 
 
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	starpu_mpi_irecv(data_handle, &req, source, mpi_tag, comm);
 	starpu_mpi_irecv(data_handle, &req, source, mpi_tag, comm);
 	starpu_mpi_wait(&req, status);
 	starpu_mpi_wait(&req, status);
 
 
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 	return 0;
 	return 0;
 }
 }
 
 
@@ -314,26 +333,26 @@ int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, M
 
 
 static void _starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
 static void _starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	/* Which is the mpi request we are waiting for ? */
 	/* Which is the mpi request we are waiting for ? */
 	struct _starpu_mpi_req *req = waiting_req->other_request;
 	struct _starpu_mpi_req *req = waiting_req->other_request;
 
 
 	req->ret = MPI_Wait(&req->request, waiting_req->status);
 	req->ret = MPI_Wait(&req->request, waiting_req->status);
-        STARPU_ASSERT(req->ret == MPI_SUCCESS);
+	STARPU_ASSERT(req->ret == MPI_SUCCESS);
 
 
 	_starpu_mpi_handle_request_termination(req);
 	_starpu_mpi_handle_request_termination(req);
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 }
 }
 
 
 int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	int ret;
 	int ret;
 	struct _starpu_mpi_req *waiting_req = calloc(1, sizeof(struct _starpu_mpi_req));
 	struct _starpu_mpi_req *waiting_req = calloc(1, sizeof(struct _starpu_mpi_req));
 	STARPU_ASSERT(waiting_req);
 	STARPU_ASSERT(waiting_req);
 	struct _starpu_mpi_req *req = *public_req;
 	struct _starpu_mpi_req *req = *public_req;
 
 
-        _STARPU_MPI_INC_POSTED_REQUESTS(1);
+	_STARPU_MPI_INC_POSTED_REQUESTS(1);
 
 
 	/* We cannot try to complete a MPI request that was not actually posted
 	/* We cannot try to complete a MPI request that was not actually posted
 	 * to MPI yet. */
 	 * to MPI yet. */
@@ -364,8 +383,8 @@ int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 	*public_req = NULL;
 	*public_req = NULL;
 	free(req);
 	free(req);
 
 
-        //free(waiting_req);
-        _STARPU_MPI_LOG_OUT();
+	free(waiting_req);
+	_STARPU_MPI_LOG_OUT();
 	return ret;
 	return ret;
 }
 }
 
 
@@ -377,13 +396,13 @@ int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 
 
 static void _starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 static void _starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	/* Which is the mpi request we are testing for ? */
 	/* Which is the mpi request we are testing for ? */
 	struct _starpu_mpi_req *req = testing_req->other_request;
 	struct _starpu_mpi_req *req = testing_req->other_request;
 
 
-        _STARPU_MPI_DEBUG("Test request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, _starpu_mpi_request_type(req->request_type), req->srcdst);
+	_STARPU_MPI_DEBUG("Test request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, _starpu_mpi_request_type(req->request_type), req->srcdst);
 	req->ret = MPI_Test(&req->request, testing_req->flag, testing_req->status);
 	req->ret = MPI_Test(&req->request, testing_req->flag, testing_req->status);
-        STARPU_ASSERT(req->ret == MPI_SUCCESS);
+	STARPU_ASSERT(req->ret == MPI_SUCCESS);
 
 
 	if (*testing_req->flag)
 	if (*testing_req->flag)
 	{
 	{
@@ -395,12 +414,12 @@ static void _starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 	testing_req->completed = 1;
 	testing_req->completed = 1;
 	_STARPU_PTHREAD_COND_SIGNAL(&testing_req->req_cond);
 	_STARPU_PTHREAD_COND_SIGNAL(&testing_req->req_cond);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&testing_req->req_mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&testing_req->req_mutex);
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 }
 }
 
 
 int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	int ret = 0;
 	int ret = 0;
 
 
 	STARPU_ASSERT(public_req);
 	STARPU_ASSERT(public_req);
@@ -416,8 +435,8 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 	if (submitted)
 	if (submitted)
 	{
 	{
 		struct _starpu_mpi_req *testing_req = calloc(1, sizeof(struct _starpu_mpi_req));
 		struct _starpu_mpi_req *testing_req = calloc(1, sizeof(struct _starpu_mpi_req));
-                STARPU_ASSERT(testing_req);
-                //		memset(testing_req, 0, sizeof(struct _starpu_mpi_req));
+		STARPU_ASSERT(testing_req);
+		//		memset(testing_req, 0, sizeof(struct _starpu_mpi_req));
 
 
 		/* Initialize the request structure */
 		/* Initialize the request structure */
 		_STARPU_PTHREAD_MUTEX_INIT(&(testing_req->req_mutex), NULL);
 		_STARPU_PTHREAD_MUTEX_INIT(&(testing_req->req_mutex), NULL);
@@ -427,15 +446,15 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 		testing_req->other_request = req;
 		testing_req->other_request = req;
 		testing_req->func = _starpu_mpi_test_func;
 		testing_req->func = _starpu_mpi_test_func;
 		testing_req->completed = 0;
 		testing_req->completed = 0;
-                testing_req->request_type = TEST_REQ;
+		testing_req->request_type = TEST_REQ;
 
 
-                _STARPU_MPI_INC_POSTED_REQUESTS(1);
-                _starpu_mpi_submit_new_mpi_request(testing_req);
+		_STARPU_MPI_INC_POSTED_REQUESTS(1);
+		_starpu_mpi_submit_new_mpi_request(testing_req);
 
 
 		/* We wait for the test request to finish */
 		/* We wait for the test request to finish */
 		_STARPU_PTHREAD_MUTEX_LOCK(&(testing_req->req_mutex));
 		_STARPU_PTHREAD_MUTEX_LOCK(&(testing_req->req_mutex));
 		while (!(testing_req->completed))
 		while (!(testing_req->completed))
-                        _STARPU_PTHREAD_COND_WAIT(&(testing_req->req_cond), &(testing_req->req_mutex));
+			_STARPU_PTHREAD_COND_WAIT(&(testing_req->req_cond), &(testing_req->req_mutex));
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&(testing_req->req_mutex));
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&(testing_req->req_mutex));
 
 
 		ret = testing_req->ret;
 		ret = testing_req->ret;
@@ -448,13 +467,15 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 			*public_req = NULL;
 			*public_req = NULL;
 			free(req);
 			free(req);
 		}
 		}
+
+		free(testing_req);
 	}
 	}
 	else
 	else
 	{
 	{
 		*flag = 0;
 		*flag = 0;
 	}
 	}
 
 
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 	return ret;
 	return ret;
 }
 }
 
 
@@ -466,18 +487,18 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 
 
 static void _starpu_mpi_barrier_func(struct _starpu_mpi_req *barrier_req)
 static void _starpu_mpi_barrier_func(struct _starpu_mpi_req *barrier_req)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 
 
 	barrier_req->ret = MPI_Barrier(barrier_req->comm);
 	barrier_req->ret = MPI_Barrier(barrier_req->comm);
-        STARPU_ASSERT(barrier_req->ret == MPI_SUCCESS);
+	STARPU_ASSERT(barrier_req->ret == MPI_SUCCESS);
 
 
 	_starpu_mpi_handle_request_termination(barrier_req);
 	_starpu_mpi_handle_request_termination(barrier_req);
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 }
 }
 
 
 int starpu_mpi_barrier(MPI_Comm comm)
 int starpu_mpi_barrier(MPI_Comm comm)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	int ret;
 	int ret;
 	struct _starpu_mpi_req *barrier_req = calloc(1, sizeof(struct _starpu_mpi_req));
 	struct _starpu_mpi_req *barrier_req = calloc(1, sizeof(struct _starpu_mpi_req));
 	STARPU_ASSERT(barrier_req);
 	STARPU_ASSERT(barrier_req);
@@ -513,7 +534,7 @@ int starpu_mpi_barrier(MPI_Comm comm)
 	barrier_req->request_type = BARRIER_REQ;
 	barrier_req->request_type = BARRIER_REQ;
 	barrier_req->comm = comm;
 	barrier_req->comm = comm;
 
 
-        _STARPU_MPI_INC_POSTED_REQUESTS(1);
+	_STARPU_MPI_INC_POSTED_REQUESTS(1);
 	_starpu_mpi_submit_new_mpi_request(barrier_req);
 	_starpu_mpi_submit_new_mpi_request(barrier_req);
 
 
 	/* We wait for the MPI request to finish */
 	/* We wait for the MPI request to finish */
@@ -524,8 +545,8 @@ int starpu_mpi_barrier(MPI_Comm comm)
 
 
 	ret = barrier_req->ret;
 	ret = barrier_req->ret;
 
 
-        //free(waiting_req);
-        _STARPU_MPI_LOG_OUT();
+	//free(waiting_req);
+	_STARPU_MPI_LOG_OUT();
 	return ret;
 	return ret;
 }
 }
 
 
@@ -538,31 +559,39 @@ int starpu_mpi_barrier(MPI_Comm comm)
 #ifdef STARPU_MPI_VERBOSE
 #ifdef STARPU_MPI_VERBOSE
 static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type)
 static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type)
 {
 {
-        switch (request_type)
-                {
-                case SEND_REQ: return "SEND_REQ";
-                case RECV_REQ: return "RECV_REQ";
-                case WAIT_REQ: return "WAIT_REQ";
-                case TEST_REQ: return "TEST_REQ";
-                case BARRIER_REQ: return "BARRIER_REQ";
-                default: return "unknown request type";
-                }
+	switch (request_type)
+		{
+		case SEND_REQ: return "SEND_REQ";
+		case RECV_REQ: return "RECV_REQ";
+		case WAIT_REQ: return "WAIT_REQ";
+		case TEST_REQ: return "TEST_REQ";
+		case BARRIER_REQ: return "BARRIER_REQ";
+		default: return "unknown request type";
+		}
 }
 }
 #endif
 #endif
 
 
 static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 
 
 	_STARPU_MPI_DEBUG("complete MPI (%s %d) data %p req %p - tag %d\n", _starpu_mpi_request_type(req->request_type), req->srcdst, req->data_handle, &req->request, req->mpi_tag);
 	_STARPU_MPI_DEBUG("complete MPI (%s %d) data %p req %p - tag %d\n", _starpu_mpi_request_type(req->request_type), req->srcdst, req->data_handle, &req->request, req->mpi_tag);
-        if (req->request_type != BARRIER_REQ)
+	if (req->request_type == RECV_REQ || req->request_type == SEND_REQ)
 	{
 	{
-		if (req->needs_unpacking)
-			starpu_handle_unpack_data(req->data_handle, req->ptr, req->count);
+		if (req->user_datatype == 1)
+		{
+			if (req->request_type == RECV_REQ)
+				// req->ptr is freed by starpu_handle_unpack_data
+				starpu_handle_unpack_data(req->data_handle, req->ptr, req->count);
+			else
+				free(req->ptr);
+		}
 		else
 		else
-			MPI_Type_free(&req->datatype);
-                starpu_data_release(req->data_handle);
-        }
+		{
+			_starpu_mpi_handle_free_datatype(req->data_handle, &req->datatype);
+		}
+		starpu_data_release(req->data_handle);
+	}
 
 
 	if (req->request_type == RECV_REQ)
 	if (req->request_type == RECV_REQ)
 	{
 	{
@@ -579,23 +608,23 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 	req->completed = 1;
 	req->completed = 1;
 	_STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
 	_STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 }
 }
 
 
 static void _starpu_mpi_submit_new_mpi_request(void *arg)
 static void _starpu_mpi_submit_new_mpi_request(void *arg)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	struct _starpu_mpi_req *req = arg;
 	struct _starpu_mpi_req *req = arg;
 
 
-        _STARPU_MPI_INC_POSTED_REQUESTS(-1);
+	_STARPU_MPI_INC_POSTED_REQUESTS(-1);
 
 
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	_starpu_mpi_req_list_push_front(new_requests, req);
 	_starpu_mpi_req_list_push_front(new_requests, req);
 	newer_requests = 1;
 	newer_requests = 1;
-        _STARPU_MPI_DEBUG("Pushing new request type %s\n", _starpu_mpi_request_type(req->request_type));
+	_STARPU_MPI_DEBUG("Pushing new request type %s\n", _starpu_mpi_request_type(req->request_type));
 	_STARPU_PTHREAD_COND_BROADCAST(&cond_progression);
 	_STARPU_PTHREAD_COND_BROADCAST(&cond_progression);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 }
 }
 
 
 #ifdef USE_STARPU_ACTIVITY
 #ifdef USE_STARPU_ACTIVITY
@@ -617,7 +646,7 @@ static unsigned _starpu_mpi_progression_hook_func(void *arg __attribute__((unuse
 
 
 static void _starpu_mpi_test_detached_requests(void)
 static void _starpu_mpi_test_detached_requests(void)
 {
 {
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 	int flag;
 	int flag;
 	MPI_Status status;
 	MPI_Status status;
 	struct _starpu_mpi_req *req, *next_req;
 	struct _starpu_mpi_req *req, *next_req;
@@ -632,7 +661,7 @@ static void _starpu_mpi_test_detached_requests(void)
 
 
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 
 
-                //_STARPU_MPI_DEBUG("Test detached request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, _starpu_mpi_request_type(req->request_type), req->srcdst);
+		//_STARPU_MPI_DEBUG("Test detached request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, _starpu_mpi_request_type(req->request_type), req->srcdst);
 		req->ret = MPI_Test(&req->request, &flag, &status);
 		req->ret = MPI_Test(&req->request, &flag, &status);
 		STARPU_ASSERT(req->ret == MPI_SUCCESS);
 		STARPU_ASSERT(req->ret == MPI_SUCCESS);
 
 
@@ -644,29 +673,19 @@ static void _starpu_mpi_test_detached_requests(void)
 		_STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
 		_STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
 
 
 		if (flag)
 		if (flag)
+		{
 			_starpu_mpi_req_list_erase(detached_requests, req);
 			_starpu_mpi_req_list_erase(detached_requests, req);
+			free(req);
+		}
 
 
-#ifdef STARPU_DEVEL
-#warning TODO fix memleak
-#endif
-		/* Detached requests are automatically allocated by the lib */
-		//if (req->detached)
-		//	free(req);
 	}
 	}
 
 
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
-        _STARPU_MPI_LOG_OUT();
+	_STARPU_MPI_LOG_OUT();
 }
 }
 
 
-static void _starpu_mpi_handle_new_request(struct _starpu_mpi_req *req)
+static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req)
 {
 {
-        _STARPU_MPI_LOG_IN();
-	STARPU_ASSERT(req);
-
-	/* submit the request to MPI */
-        _STARPU_MPI_DEBUG("Handling new request type %s\n", _starpu_mpi_request_type(req->request_type));
-	req->func(req);
-
 	if (req->detached)
 	if (req->detached)
 	{
 	{
 		_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 		_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
@@ -681,59 +700,68 @@ static void _starpu_mpi_handle_new_request(struct _starpu_mpi_req *req)
 		_STARPU_PTHREAD_COND_SIGNAL(&cond_progression);
 		_STARPU_PTHREAD_COND_SIGNAL(&cond_progression);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	}
 	}
-        _STARPU_MPI_LOG_OUT();
+}
+
+static void _starpu_mpi_handle_new_request(struct _starpu_mpi_req *req)
+{
+	_STARPU_MPI_LOG_IN();
+	STARPU_ASSERT(req);
+
+	/* submit the request to MPI */
+	_STARPU_MPI_DEBUG("Handling new request type %s\n", _starpu_mpi_request_type(req->request_type));
+	req->func(req);
+
+	_STARPU_MPI_LOG_OUT();
 }
 }
 
 
 struct _starpu_mpi_argc_argv
 struct _starpu_mpi_argc_argv
 {
 {
+	int initialize_mpi;
 	int *argc;
 	int *argc;
 	char ***argv;
 	char ***argv;
 };
 };
 
 
 static void _starpu_mpi_print_thread_level_support(int thread_level, char *msg)
 static void _starpu_mpi_print_thread_level_support(int thread_level, char *msg)
 {
 {
-     switch (thread_level)
-     {
-     case MPI_THREAD_SERIALIZED:
-     {
-	  _STARPU_DISP("MPI%s MPI_THREAD_SERIALIZED; Multiple threads may make MPI calls, but only one at a time.\n", msg);
-	  break;
-     }
-     case MPI_THREAD_FUNNELED:
-     {
-	  _STARPU_DISP("MPI%s MPI_THREAD_FUNNELED; The application can safely make calls to StarPU-MPI functions, but should not call directly MPI communication functions.\n", msg);
-	  break;
-     }
-     case MPI_THREAD_SINGLE:
-     {
-	  _STARPU_DISP("MPI%s MPI_THREAD_SINGLE; MPI does not have multi-thread support, this might cause problems. The application can make calls to StarPU-MPI functions, but not call directly MPI Communication functions.\n", msg);
-	  break;
-     }
-     }
+	switch (thread_level)
+	{
+	case MPI_THREAD_SERIALIZED:
+	{
+		_STARPU_DISP("MPI%s MPI_THREAD_SERIALIZED; Multiple threads may make MPI calls, but only one at a time.\n", msg);
+		break;
+	}
+	case MPI_THREAD_FUNNELED:
+	{
+		_STARPU_DISP("MPI%s MPI_THREAD_FUNNELED; The application can safely make calls to StarPU-MPI functions, but should not call directly MPI communication functions.\n", msg);
+		break;
+	}
+	case MPI_THREAD_SINGLE:
+	{
+		_STARPU_DISP("MPI%s MPI_THREAD_SINGLE; MPI does not have multi-thread support, this might cause problems. The application can make calls to StarPU-MPI functions, but not call directly MPI Communication functions.\n", msg);
+		break;
+	}
+	}
 }
 }
 
 
 static void *_starpu_mpi_progress_thread_func(void *arg)
 static void *_starpu_mpi_progress_thread_func(void *arg)
 {
 {
 	struct _starpu_mpi_argc_argv *argc_argv = (struct _starpu_mpi_argc_argv *) arg;
 	struct _starpu_mpi_argc_argv *argc_argv = (struct _starpu_mpi_argc_argv *) arg;
-	int flag;
 
 
-	MPI_Initialized(&flag);
-	_STARPU_DEBUG("MPI_Initialized %d\n", flag);
-	if (flag == 0)
+	if (argc_argv->initialize_mpi)
 	{
 	{
 		int thread_support;
 		int thread_support;
-                _STARPU_DEBUG("Calling MPI_Init_thread\n");
+		_STARPU_DEBUG("Calling MPI_Init_thread\n");
 		if (MPI_Init_thread(argc_argv->argc, argc_argv->argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS)
 		if (MPI_Init_thread(argc_argv->argc, argc_argv->argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS)
 		{
 		{
 			_STARPU_ERROR("MPI_Init_thread failed\n");
 			_STARPU_ERROR("MPI_Init_thread failed\n");
-                }
+		}
 		_starpu_mpi_print_thread_level_support(thread_support, "_Init_thread level =");
 		_starpu_mpi_print_thread_level_support(thread_support, "_Init_thread level =");
-        }
+	}
 	else
 	else
 	{
 	{
-	     int provided;
-	     MPI_Query_thread(&provided);
-	     _starpu_mpi_print_thread_level_support(provided, " has been initialized with");
+		int provided;
+		MPI_Query_thread(&provided);
+		_starpu_mpi_print_thread_level_support(provided, " has been initialized with");
 	}
 	}
 
 
 	/* notify the main thread that the progression thread is ready */
 	/* notify the main thread that the progression thread is ready */
@@ -754,7 +782,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 
 		if (block)
 		if (block)
 		{
 		{
-                        _STARPU_MPI_DEBUG("NO MORE REQUESTS TO HANDLE\n");
+			_STARPU_MPI_DEBUG("NO MORE REQUESTS TO HANDLE\n");
 			if (barrier_running)
 			if (barrier_running)
 				/* Tell mpi_barrier */
 				/* Tell mpi_barrier */
 				_STARPU_PTHREAD_COND_SIGNAL(&cond_finished);
 				_STARPU_PTHREAD_COND_SIGNAL(&cond_finished);
@@ -775,7 +803,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 			/* handling a request is likely to block for a while
 			/* handling a request is likely to block for a while
 			 * (on a sync_data_with_mem call), we want to let the
 			 * (on a sync_data_with_mem call), we want to let the
 			 * application submit requests in the meantime, so we
 			 * application submit requests in the meantime, so we
-			 * release the lock.  */
+			 * release the lock. */
 			_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 			_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 			_starpu_mpi_handle_new_request(req);
 			_starpu_mpi_handle_new_request(req);
 			_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 			_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
@@ -784,13 +812,13 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 
 	STARPU_ASSERT(_starpu_mpi_req_list_empty(detached_requests));
 	STARPU_ASSERT(_starpu_mpi_req_list_empty(detached_requests));
 	STARPU_ASSERT(_starpu_mpi_req_list_empty(new_requests));
 	STARPU_ASSERT(_starpu_mpi_req_list_empty(new_requests));
-        STARPU_ASSERT(posted_requests == 0);
+	STARPU_ASSERT(posted_requests == 0);
 
 
-        if (flag == 0)
+	if (argc_argv->initialize_mpi)
 	{
 	{
-                _STARPU_MPI_DEBUG("Calling MPI_Finalize()\n");
-                MPI_Finalize();
-        }
+		_STARPU_MPI_DEBUG("Calling MPI_Finalize()\n");
+		MPI_Finalize();
+	}
 
 
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
 
@@ -835,12 +863,12 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 
 
 	TRACE_MPI_BARRIER(rank, worldsize, random_number);
 	TRACE_MPI_BARRIER(rank, worldsize, random_number);
 
 
-        _STARPU_MPI_DEBUG("unique key %x\n", random_number);
+	_STARPU_MPI_DEBUG("unique key %x\n", random_number);
 #endif
 #endif
 }
 }
 
 
 static
 static
-int _starpu_mpi_initialize(int *argc, char ***argv)
+int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi)
 {
 {
 	_STARPU_PTHREAD_MUTEX_INIT(&mutex, NULL);
 	_STARPU_PTHREAD_MUTEX_INIT(&mutex, NULL);
 	_STARPU_PTHREAD_COND_INIT(&cond_progression, NULL);
 	_STARPU_PTHREAD_COND_INIT(&cond_progression, NULL);
@@ -850,12 +878,13 @@ int _starpu_mpi_initialize(int *argc, char ***argv)
 	_STARPU_PTHREAD_MUTEX_INIT(&detached_requests_mutex, NULL);
 	_STARPU_PTHREAD_MUTEX_INIT(&detached_requests_mutex, NULL);
 	detached_requests = _starpu_mpi_req_list_new();
 	detached_requests = _starpu_mpi_req_list_new();
 
 
-        _STARPU_PTHREAD_MUTEX_INIT(&mutex_posted_requests, NULL);
+	_STARPU_PTHREAD_MUTEX_INIT(&mutex_posted_requests, NULL);
 
 
 	struct _starpu_mpi_argc_argv *argc_argv = malloc(sizeof(struct _starpu_mpi_argc_argv));
 	struct _starpu_mpi_argc_argv *argc_argv = malloc(sizeof(struct _starpu_mpi_argc_argv));
+	argc_argv->initialize_mpi = initialize_mpi;
 	argc_argv->argc = argc;
 	argc_argv->argc = argc;
 	argc_argv->argv = argv;
 	argc_argv->argv = argv;
-	_STARPU_PTHREAD_CREATE(&progress_thread, NULL, _starpu_mpi_progress_thread_func, argc_argv);
+	_STARPU_PTHREAD_CREATE("MPI progress", &progress_thread, NULL, _starpu_mpi_progress_thread_func, argc_argv);
 
 
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	while (!running)
 	while (!running)
@@ -875,25 +904,25 @@ int _starpu_mpi_initialize(int *argc, char ***argv)
 
 
 	_starpu_mpi_add_sync_point_in_fxt();
 	_starpu_mpi_add_sync_point_in_fxt();
 	_starpu_mpi_comm_amounts_init(MPI_COMM_WORLD);
 	_starpu_mpi_comm_amounts_init(MPI_COMM_WORLD);
-	_starpu_mpi_tables_init(MPI_COMM_WORLD);
+	_starpu_mpi_cache_init(MPI_COMM_WORLD);
 	return 0;
 	return 0;
 }
 }
 
 
-int starpu_mpi_init(int *argc, char ***argv)
+int starpu_mpi_init(int *argc, char ***argv, int initialize_mpi)
 {
 {
-        return _starpu_mpi_initialize(argc, argv);
+	return _starpu_mpi_initialize(argc, argv, initialize_mpi);
 }
 }
 
 
 int starpu_mpi_initialize(void)
 int starpu_mpi_initialize(void)
 {
 {
-        return _starpu_mpi_initialize(NULL, NULL);
+	return _starpu_mpi_initialize(NULL, NULL, 0);
 }
 }
 
 
 int starpu_mpi_initialize_extended(int *rank, int *world_size)
 int starpu_mpi_initialize_extended(int *rank, int *world_size)
 {
 {
 	int ret;
 	int ret;
 
 
-        ret = _starpu_mpi_initialize(NULL, NULL);
+	ret = _starpu_mpi_initialize(NULL, NULL, 1);
 	if (ret == 0)
 	if (ret == 0)
 	{
 	{
 		_STARPU_DEBUG("Calling MPI_Comm_rank\n");
 		_STARPU_DEBUG("Calling MPI_Comm_rank\n");
@@ -908,7 +937,7 @@ int starpu_mpi_shutdown(void)
 	void *value;
 	void *value;
 	int rank, world_size;
 	int rank, world_size;
 
 
-	/* We need to get the  rank before calling MPI_Finalize to pass to _starpu_mpi_comm_amounts_display() */
+	/* We need to get the rank before calling MPI_Finalize to pass to _starpu_mpi_comm_amounts_display() */
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 
 
@@ -930,7 +959,7 @@ int starpu_mpi_shutdown(void)
 
 
 	_starpu_mpi_comm_amounts_display(rank);
 	_starpu_mpi_comm_amounts_display(rank);
 	_starpu_mpi_comm_amounts_free();
 	_starpu_mpi_comm_amounts_free();
-	_starpu_mpi_tables_free(world_size);
+	_starpu_mpi_cache_free(world_size);
 
 
 	return 0;
 	return 0;
 }
 }

+ 5 - 7
mpi/src/starpu_mpi_collective.c

@@ -34,6 +34,7 @@ void _callback_collective(void *arg)
 	if (callback_arg->nb == callback_arg->count)
 	if (callback_arg->nb == callback_arg->count)
 	{
 	{
 		callback_arg->callback(callback_arg->arg);
 		callback_arg->callback(callback_arg->arg);
+		free(callback_arg);
 	}
 	}
 }
 }
 
 
@@ -46,9 +47,6 @@ int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, i
 
 
 	MPI_Comm_rank(comm, &rank);
 	MPI_Comm_rank(comm, &rank);
 
 
-#ifdef STARPU_DEVEL
-#warning TODO: callback_arg needs to be free-ed
-#endif
 	callback_func = _callback_collective;
 	callback_func = _callback_collective;
 	callback_arg = malloc(sizeof(struct _callback_arg));
 	callback_arg = malloc(sizeof(struct _callback_arg));
 	callback_arg->count = 0;
 	callback_arg->count = 0;
@@ -64,7 +62,7 @@ int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, i
 
 
 	if (callback_arg)
 	if (callback_arg)
 	{
 	{
-		for(x = 0; x < count ;  x++)
+		for(x = 0; x < count ; x++)
 		{
 		{
 			if (data_handles[x])
 			if (data_handles[x])
 			{
 			{
@@ -83,7 +81,7 @@ int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, i
 		}
 		}
 	}
 	}
 
 
-	for(x = 0; x < count ;  x++)
+	for(x = 0; x < count ; x++)
 	{
 	{
 		if (data_handles[x])
 		if (data_handles[x])
 		{
 		{
@@ -132,7 +130,7 @@ int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, in
 
 
 	if (callback_arg)
 	if (callback_arg)
 	{
 	{
-		for(x = 0; x < count ;  x++)
+		for(x = 0; x < count ; x++)
 		{
 		{
 			if (data_handles[x])
 			if (data_handles[x])
 			{
 			{
@@ -151,7 +149,7 @@ int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, in
 		}
 		}
 	}
 	}
 
 
-	for(x = 0; x < count ;  x++)
+	for(x = 0; x < count ; x++)
 	{
 	{
 		if (data_handles[x])
 		if (data_handles[x])
 		{
 		{

+ 65 - 17
mpi/src/starpu_mpi_datatype.c

@@ -17,13 +17,14 @@
 
 
 #include <starpu_mpi_datatype.h>
 #include <starpu_mpi_datatype.h>
 
 
-typedef int (*handle_to_datatype_func)(starpu_data_handle_t, MPI_Datatype *);
+typedef void (*handle_to_datatype_func)(starpu_data_handle_t, MPI_Datatype *);
+typedef void (*handle_free_datatype_func)(MPI_Datatype *);
 
 
 /*
 /*
  * 	Matrix
  * 	Matrix
  */
  */
 
 
-static int handle_to_datatype_matrix(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+static void handle_to_datatype_matrix(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
 {
 {
 	int ret;
 	int ret;
 
 
@@ -37,15 +38,13 @@ static int handle_to_datatype_matrix(starpu_data_handle_t data_handle, MPI_Datat
 
 
 	ret = MPI_Type_commit(datatype);
 	ret = MPI_Type_commit(datatype);
 	STARPU_ASSERT(ret == MPI_SUCCESS);
 	STARPU_ASSERT(ret == MPI_SUCCESS);
-
-	return 0;
 }
 }
 
 
 /*
 /*
  * 	Block
  * 	Block
  */
  */
 
 
-static int handle_to_datatype_block(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+static void handle_to_datatype_block(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
 {
 {
 	int ret;
 	int ret;
 
 
@@ -68,15 +67,13 @@ static int handle_to_datatype_block(starpu_data_handle_t data_handle, MPI_Dataty
 
 
 	ret = MPI_Type_commit(datatype);
 	ret = MPI_Type_commit(datatype);
 	STARPU_ASSERT(ret == MPI_SUCCESS);
 	STARPU_ASSERT(ret == MPI_SUCCESS);
-
-	return 0;
 }
 }
 
 
 /*
 /*
  * 	Vector
  * 	Vector
  */
  */
 
 
-static int handle_to_datatype_vector(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+static void handle_to_datatype_vector(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
 {
 {
 	int ret;
 	int ret;
 
 
@@ -88,15 +85,13 @@ static int handle_to_datatype_vector(starpu_data_handle_t data_handle, MPI_Datat
 
 
 	ret = MPI_Type_commit(datatype);
 	ret = MPI_Type_commit(datatype);
 	STARPU_ASSERT(ret == MPI_SUCCESS);
 	STARPU_ASSERT(ret == MPI_SUCCESS);
-
-	return 0;
 }
 }
 
 
 /*
 /*
  * 	Variable
  * 	Variable
  */
  */
 
 
-static int handle_to_datatype_variable(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+static void handle_to_datatype_variable(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
 {
 {
 	int ret;
 	int ret;
 
 
@@ -107,8 +102,6 @@ static int handle_to_datatype_variable(starpu_data_handle_t data_handle, MPI_Dat
 
 
 	ret = MPI_Type_commit(datatype);
 	ret = MPI_Type_commit(datatype);
 	STARPU_ASSERT(ret == MPI_SUCCESS);
 	STARPU_ASSERT(ret == MPI_SUCCESS);
-
-	return 0;
 }
 }
 
 
 /*
 /*
@@ -127,21 +120,76 @@ static handle_to_datatype_func handle_to_datatype_funcs[STARPU_MAX_INTERFACE_ID]
 	[STARPU_MULTIFORMAT_INTERFACE_ID] = NULL,
 	[STARPU_MULTIFORMAT_INTERFACE_ID] = NULL,
 };
 };
 
 
-int starpu_mpi_handle_to_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+void _starpu_mpi_handle_allocate_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype, int *user_datatype)
 {
 {
 	enum starpu_data_interface_id id = starpu_handle_get_interface_id(data_handle);
 	enum starpu_data_interface_id id = starpu_handle_get_interface_id(data_handle);
 
 
-	if (id <= STARPU_MULTIFORMAT_INTERFACE_ID)
+	if (id < STARPU_MAX_INTERFACE_ID)
 	{
 	{
 		handle_to_datatype_func func = handle_to_datatype_funcs[id];
 		handle_to_datatype_func func = handle_to_datatype_funcs[id];
 		STARPU_ASSERT(func);
 		STARPU_ASSERT(func);
 		func(data_handle, datatype);
 		func(data_handle, datatype);
-		return 0;
+		*user_datatype = 0;
 	}
 	}
 	else
 	else
 	{
 	{
 		/* The datatype is not predefined by StarPU */
 		/* The datatype is not predefined by StarPU */
 		*datatype = MPI_BYTE;
 		*datatype = MPI_BYTE;
-		return 1;
+		*user_datatype = 1;
+	}
+}
+
+static void _starpu_mpi_handle_free_simple_datatype(MPI_Datatype *datatype)
+{
+	MPI_Type_free(datatype);
+}
+
+static void _starpu_mpi_handle_free_complex_datatype(MPI_Datatype *datatype)
+{
+	int num_ints, num_adds, num_datatypes, combiner, i;
+	int *array_of_ints;
+	MPI_Aint *array_of_adds;
+	MPI_Datatype *array_of_datatypes;
+
+	MPI_Type_get_envelope(*datatype, &num_ints, &num_adds, &num_datatypes, &combiner);
+	if (combiner != MPI_COMBINER_NAMED)
+	{
+		array_of_ints = (int *) malloc(num_ints * sizeof(int));
+		array_of_adds = (MPI_Aint *) malloc(num_adds * sizeof(MPI_Aint));
+		array_of_datatypes = (MPI_Datatype *) malloc(num_datatypes * sizeof(MPI_Datatype));
+		MPI_Type_get_contents(*datatype, num_ints, num_adds, num_datatypes, array_of_ints, array_of_adds, array_of_datatypes);
+		for(i=0 ; i<num_datatypes ; i++)
+		{
+			_starpu_mpi_handle_free_complex_datatype(&array_of_datatypes[i]);
+		}
+		MPI_Type_free(datatype);
+		free(array_of_ints);
+		free(array_of_adds);
+		free(array_of_datatypes);
+	}
+}
+
+static handle_free_datatype_func handle_free_datatype_funcs[STARPU_MAX_INTERFACE_ID] =
+{
+	[STARPU_MATRIX_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
+	[STARPU_BLOCK_INTERFACE_ID]	= _starpu_mpi_handle_free_complex_datatype,
+	[STARPU_VECTOR_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
+	[STARPU_CSR_INTERFACE_ID]	= NULL,
+	[STARPU_BCSR_INTERFACE_ID]	= NULL,
+	[STARPU_VARIABLE_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
+	[STARPU_VOID_INTERFACE_ID]      = NULL,
+	[STARPU_MULTIFORMAT_INTERFACE_ID] = NULL,
+};
+
+void _starpu_mpi_handle_free_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+{
+	enum starpu_data_interface_id id = starpu_handle_get_interface_id(data_handle);
+
+	if (id < STARPU_MAX_INTERFACE_ID)
+	{
+		handle_free_datatype_func func = handle_free_datatype_funcs[id];
+		STARPU_ASSERT(func);
+		func(datatype);
 	}
 	}
+	/* else the datatype is not predefined by StarPU */
 }
 }

+ 2 - 1
mpi/src/starpu_mpi_datatype.h

@@ -24,7 +24,8 @@
 extern "C" {
 extern "C" {
 #endif
 #endif
 
 
-int starpu_mpi_handle_to_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype);
+void _starpu_mpi_handle_allocate_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype, int *user_datatype);
+void _starpu_mpi_handle_free_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype);
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }

+ 118 - 76
mpi/src/starpu_mpi_insert_task.c

@@ -35,22 +35,22 @@ struct _starpu_data_entry
 	void *data;
 	void *data;
 };
 };
 
 
-static struct _starpu_data_entry **sent_data = NULL;
-static struct _starpu_data_entry **received_data = NULL;
-static int cache_enabled=1;
+static struct _starpu_data_entry **_cache_sent_data = NULL;
+static struct _starpu_data_entry **_cache_received_data = NULL;
+static int _cache_enabled=1;
 
 
-void _starpu_mpi_tables_init(MPI_Comm comm)
+void _starpu_mpi_cache_init(MPI_Comm comm)
 {
 {
 	int nb_nodes;
 	int nb_nodes;
 	int i;
 	int i;
 
 
-	cache_enabled = starpu_get_env_number("STARPU_MPI_CACHE");
-	if (cache_enabled == -1)
+	_cache_enabled = starpu_get_env_number("STARPU_MPI_CACHE");
+	if (_cache_enabled == -1)
 	{
 	{
-		cache_enabled = 1;
+		_cache_enabled = 1;
 	}
 	}
 
 
-	if (cache_enabled == 0)
+	if (_cache_enabled == 0)
 	{
 	{
 		if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU MPI Communication cache is disabled\n");
 		if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU MPI Communication cache is disabled\n");
 		return;
 		return;
@@ -58,36 +58,119 @@ void _starpu_mpi_tables_init(MPI_Comm comm)
 
 
 	MPI_Comm_size(comm, &nb_nodes);
 	MPI_Comm_size(comm, &nb_nodes);
 	_STARPU_MPI_DEBUG("Initialising htable for cache\n");
 	_STARPU_MPI_DEBUG("Initialising htable for cache\n");
-	sent_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
-	for(i=0 ; i<nb_nodes ; i++) sent_data[i] = NULL;
-	received_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
-	for(i=0 ; i<nb_nodes ; i++) received_data[i] = NULL;
+	_cache_sent_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
+	for(i=0 ; i<nb_nodes ; i++) _cache_sent_data[i] = NULL;
+	_cache_received_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
+	for(i=0 ; i<nb_nodes ; i++) _cache_received_data[i] = NULL;
 }
 }
 
 
-void _starpu_mpi_tables_free(int world_size)
+void _starpu_mpi_cache_empty_tables(int world_size)
 {
 {
 	int i;
 	int i;
 
 
-	if (cache_enabled == 0) return;
+	if (_cache_enabled == 0) return;
 
 
 	_STARPU_MPI_DEBUG("Clearing htable for cache\n");
 	_STARPU_MPI_DEBUG("Clearing htable for cache\n");
 
 
 	for(i=0 ; i<world_size ; i++)
 	for(i=0 ; i<world_size ; i++)
 	{
 	{
 		struct _starpu_data_entry *entry, *tmp;
 		struct _starpu_data_entry *entry, *tmp;
-		HASH_ITER(hh, sent_data[i], entry, tmp)
+		HASH_ITER(hh, _cache_sent_data[i], entry, tmp)
 		{
 		{
-			HASH_DEL(sent_data[i], entry);
+			HASH_DEL(_cache_sent_data[i], entry);
 			free(entry);
 			free(entry);
 		}
 		}
-		HASH_ITER(hh, received_data[i], entry, tmp)
+		HASH_ITER(hh, _cache_received_data[i], entry, tmp)
 		{
 		{
-			HASH_DEL(received_data[i], entry);
+			HASH_DEL(_cache_received_data[i], entry);
 			free(entry);
 			free(entry);
 		}
 		}
 	}
 	}
-	free(sent_data);
-	free(received_data);
+}
+
+void _starpu_mpi_cache_free(int world_size)
+{
+	if (_cache_enabled == 0) return;
+
+	_starpu_mpi_cache_empty_tables(world_size);
+	free(_cache_sent_data);
+	free(_cache_received_data);
+}
+
+void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
+{
+	int nb_nodes;
+
+	if (_cache_enabled == 0) return;
+
+	MPI_Comm_size(comm, &nb_nodes);
+	_starpu_mpi_cache_empty_tables(nb_nodes);
+}
+
+void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
+{
+	struct _starpu_data_entry *avail;
+	int i, nb_nodes;
+
+	if (_cache_enabled == 0) return;
+
+	MPI_Comm_size(comm, &nb_nodes);
+	for(i=0 ; i<nb_nodes ; i++)
+	{
+		HASH_FIND_PTR(_cache_sent_data[i], &data_handle, avail);
+		if (avail)
+		{
+			_STARPU_MPI_DEBUG("Clearing send cache for data %p\n", data_handle);
+			HASH_DEL(_cache_sent_data[i], avail);
+		}
+		HASH_FIND_PTR(_cache_received_data[i], &data_handle, avail);
+		if (avail)
+		{
+			_STARPU_MPI_DEBUG("Clearing send cache for data %p\n", data_handle);
+			HASH_DEL(_cache_received_data[i], avail);
+		}
+	}
+}
+
+static
+void *_starpu_mpi_already_received(starpu_data_handle_t data, int mpi_rank)
+{
+	if (_cache_enabled == 0) return NULL;
+
+	struct _starpu_data_entry *already_received;
+	HASH_FIND_PTR(_cache_received_data[mpi_rank], &data, already_received);
+	if (already_received == NULL)
+	{
+		struct _starpu_data_entry *entry = (struct _starpu_data_entry *)malloc(sizeof(*entry));
+		entry->data = data;
+		HASH_ADD_PTR(_cache_received_data[mpi_rank], data, entry);
+	}
+	else
+	{
+		_STARPU_MPI_DEBUG("Do not receive data %p from node %d as it is already available\n", data, mpi_rank);
+	}
+	return already_received;
+}
+
+static
+void *_starpu_mpi_already_sent(starpu_data_handle_t data, int dest)
+{
+	if (_cache_enabled == 0) return NULL;
+
+	struct _starpu_data_entry *already_sent;
+	HASH_FIND_PTR(_cache_sent_data[dest], &data, already_sent);
+	if (already_sent == NULL)
+	{
+		struct _starpu_data_entry *entry = (struct _starpu_data_entry *)malloc(sizeof(*entry));
+		entry->data = data;
+		HASH_ADD_PTR(_cache_sent_data[dest], data, entry);
+		_STARPU_MPI_DEBUG("Noting that data %p has already been sent to %d\n", data, dest);
+	}
+	else
+	{
+		_STARPU_MPI_DEBUG("Do not send data %p to node %d as it has already been sent\n", data, dest);
+	}
+	return already_sent;
 }
 }
 
 
 static
 static
@@ -150,47 +233,6 @@ int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_access
 }
 }
 
 
 static
 static
-void *_starpu_mpi_already_received(starpu_data_handle_t data, int mpi_rank)
-{
-	if (cache_enabled == 0) return NULL;
-
-	struct _starpu_data_entry *already_received;
-	HASH_FIND_PTR(received_data[mpi_rank], &data, already_received);
-	if (already_received == NULL)
-	{
-		struct _starpu_data_entry *entry = (struct _starpu_data_entry *)malloc(sizeof(*entry));
-		entry->data = data;
-		HASH_ADD_PTR(received_data[mpi_rank], data, entry);
-	}
-	else
-	{
-		_STARPU_MPI_DEBUG("Do not receive data %p from node %d as it is already available\n", data, mpi_rank);
-	}
-	return already_received;
-}
-
-static
-void *_starpu_mpi_already_sent(starpu_data_handle_t data, int dest)
-{
-	if (cache_enabled == 0) return NULL;
-
-	struct _starpu_data_entry *already_sent;
-	HASH_FIND_PTR(sent_data[dest], &data, already_sent);
-	if (already_sent == NULL)
-	{
-		struct _starpu_data_entry *entry = (struct _starpu_data_entry *)malloc(sizeof(*entry));
-		entry->data = data;
-		HASH_ADD_PTR(sent_data[dest], data, entry);
-		_STARPU_MPI_DEBUG("Noting that data %p has already been sent to %d\n", data, dest);
-	}
-	else
-	{
-		_STARPU_MPI_DEBUG("Do not send data %p to node %d as it has already been sent\n", data, dest);
-	}
-	return already_sent;
-}
-
-static
 void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int dest, int do_execute, MPI_Comm comm)
 void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int dest, int do_execute, MPI_Comm comm)
 {
 {
 	if (data && mode & STARPU_R)
 	if (data && mode & STARPU_R)
@@ -266,9 +308,9 @@ void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum s
 
 
 void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int do_execute, MPI_Comm comm)
 void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int do_execute, MPI_Comm comm)
 {
 {
-	if (cache_enabled)
+	if (_cache_enabled)
 	{
 	{
-		if (mode & STARPU_W)
+		if (mode & STARPU_W || mode & STARPU_REDUX)
 		{
 		{
 			if (do_execute)
 			if (do_execute)
 			{
 			{
@@ -278,11 +320,11 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
 				for(n=0 ; n<size ; n++)
 				for(n=0 ; n<size ; n++)
 				{
 				{
 					struct _starpu_data_entry *already_sent;
 					struct _starpu_data_entry *already_sent;
-					HASH_FIND_PTR(sent_data[n], &data, already_sent);
+					HASH_FIND_PTR(_cache_sent_data[n], &data, already_sent);
 					if (already_sent)
 					if (already_sent)
 					{
 					{
 						_STARPU_MPI_DEBUG("Clearing send cache for data %p\n", data);
 						_STARPU_MPI_DEBUG("Clearing send cache for data %p\n", data);
-						HASH_DEL(sent_data[n], already_sent);
+						HASH_DEL(_cache_sent_data[n], already_sent);
 					}
 					}
 				}
 				}
 			}
 			}
@@ -290,14 +332,14 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
 			{
 			{
 				int mpi_rank = starpu_data_get_rank(data);
 				int mpi_rank = starpu_data_get_rank(data);
 				struct _starpu_data_entry *already_received;
 				struct _starpu_data_entry *already_received;
-				HASH_FIND_PTR(received_data[mpi_rank], &data, already_received);
+				HASH_FIND_PTR(_cache_received_data[mpi_rank], &data, already_received);
 				if (already_received)
 				if (already_received)
 				{
 				{
 #ifdef STARPU_DEVEL
 #ifdef STARPU_DEVEL
 #  warning TODO: Somebody else will write to the data, so discard our cached copy if any. starpu_mpi could just remember itself.
 #  warning TODO: Somebody else will write to the data, so discard our cached copy if any. starpu_mpi could just remember itself.
 #endif
 #endif
 					_STARPU_MPI_DEBUG("Clearing receive cache for data %p\n", data);
 					_STARPU_MPI_DEBUG("Clearing receive cache for data %p\n", data);
-					HASH_DEL(received_data[mpi_rank], already_received);
+					HASH_DEL(_cache_received_data[mpi_rank], already_received);
 					starpu_data_invalidate_submit(data);
 					starpu_data_invalidate_submit(data);
 				}
 				}
 			}
 			}
@@ -324,7 +366,7 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 	int me, do_execute, xrank, nb_nodes;
 	int me, do_execute, xrank, nb_nodes;
 	size_t *size_on_nodes;
 	size_t *size_on_nodes;
 	size_t arg_buffer_size = 0;
 	size_t arg_buffer_size = 0;
-	char *arg_buffer;
+	char *arg_buffer = NULL;
 	int dest=0, inconsistent_execute;
 	int dest=0, inconsistent_execute;
 	int current_data = 0;
 	int current_data = 0;
 
 
@@ -339,8 +381,11 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 	va_start(varg_list, codelet);
 	va_start(varg_list, codelet);
 	arg_buffer_size = _starpu_insert_task_get_arg_size(varg_list);
 	arg_buffer_size = _starpu_insert_task_get_arg_size(varg_list);
 
 
-	va_start(varg_list, codelet);
-	_starpu_codelet_pack_args(arg_buffer_size, &arg_buffer, varg_list);
+	if (arg_buffer_size)
+	{
+		va_start(varg_list, codelet);
+		_starpu_codelet_pack_args(arg_buffer_size, &arg_buffer, varg_list);
+	}
 
 
 	/* Find out whether we are to execute the data because we own the data to be written to. */
 	/* Find out whether we are to execute the data because we own the data to be written to. */
 	inconsistent_execute = 0;
 	inconsistent_execute = 0;
@@ -437,13 +482,13 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 				xrank = i;
 				xrank = i;
 			}
 			}
 		}
 		}
-		free(size_on_nodes);
 		if (xrank != -1)
 		if (xrank != -1)
 		{
 		{
 			_STARPU_MPI_DEBUG("Node %d is having the most R data\n", xrank);
 			_STARPU_MPI_DEBUG("Node %d is having the most R data\n", xrank);
 			do_execute = 1;
 			do_execute = 1;
 		}
 		}
 	}
 	}
+	free(size_on_nodes);
 
 
 	STARPU_ASSERT_MSG(do_execute != -1, "StarPU needs to see a W or a REDUX data which will tell it where to execute the task");
 	STARPU_ASSERT_MSG(do_execute != -1, "StarPU needs to see a W or a REDUX data which will tell it where to execute the task");
 
 
@@ -452,7 +497,6 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 		if (xrank == -1)
 		if (xrank == -1)
 		{
 		{
 			_STARPU_MPI_DEBUG("Different tasks are owning W data. Needs to specify which one is to execute the codelet, using STARPU_EXECUTE_ON_NODE or STARPU_EXECUTE_ON_DATA\n");
 			_STARPU_MPI_DEBUG("Different tasks are owning W data. Needs to specify which one is to execute the codelet, using STARPU_EXECUTE_ON_NODE or STARPU_EXECUTE_ON_DATA\n");
-			free(size_on_nodes);
 			return -EINVAL;
 			return -EINVAL;
 		}
 		}
 		else
 		else
@@ -665,13 +709,11 @@ void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t da
 	tag = starpu_data_get_tag(data_handle);
 	tag = starpu_data_get_tag(data_handle);
 	if (rank == -1)
 	if (rank == -1)
 	{
 	{
-		fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
-		STARPU_ABORT();
+		_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
 	}
 	}
 	if (tag == -1)
 	if (tag == -1)
 	{
 	{
-		fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
-		STARPU_ABORT();
+		_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
 	}
 	}
 	MPI_Comm_rank(comm, &me);
 	MPI_Comm_rank(comm, &me);
 
 

+ 2 - 2
mpi/src/starpu_mpi_insert_task.h

@@ -23,8 +23,8 @@
 extern "C" {
 extern "C" {
 #endif
 #endif
 
 
-void _starpu_mpi_tables_init(MPI_Comm comm);
-void _starpu_mpi_tables_free(int world_size);
+void _starpu_mpi_cache_init(MPI_Comm comm);
+void _starpu_mpi_cache_free(int world_size);
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }

+ 3 - 3
mpi/src/starpu_mpi_private.h

@@ -77,7 +77,7 @@ LIST_TYPE(_starpu_mpi_req,
 	MPI_Datatype datatype;
 	MPI_Datatype datatype;
 	void *ptr;
 	void *ptr;
 	size_t count;
 	size_t count;
-	int needs_unpacking;
+	int user_datatype;
 
 
 	/* who are we talking to ? */
 	/* who are we talking to ? */
 	int srcdst;
 	int srcdst;
@@ -91,8 +91,8 @@ LIST_TYPE(_starpu_mpi_req,
 	int *flag;
 	int *flag;
 
 
 	int ret;
 	int ret;
-	pthread_mutex_t req_mutex;
-	pthread_cond_t req_cond;
+	_starpu_pthread_mutex_t req_mutex;
+	_starpu_pthread_cond_t req_cond;
 
 
 	enum _starpu_mpi_request_type request_type; /* 0 send, 1 recv */
 	enum _starpu_mpi_request_type request_type; /* 0 send, 1 recv */
 
 

+ 32 - 9
mpi/tests/Makefile.am

@@ -17,16 +17,33 @@
 CC=$(MPICC)
 CC=$(MPICC)
 CCLD=$(MPICC)
 CCLD=$(MPICC)
 
 
-if STARPU_MPI_CHECK
+if STARPU_HAVE_WINDOWS
+LOADER_BIN		=
+else
+loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+LOADER			=	loader
+LOADER_BIN		=	$(abs_top_builddir)/mpi/tests/$(LOADER)
+loader_SOURCES		=	../../tests/loader.c
+endif
+
+if STARPU_QUICK_CHECK
+MPI			=	$(MPIEXEC) -np 2
+else
+MPI			=	$(MPIEXEC) -np 4
+endif
+
 if STARPU_HAVE_AM111
 if STARPU_HAVE_AM111
-LOG_COMPILER	 	=	$(MPIEXEC) -np 4
+TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
+LOG_COMPILER	 	=	$(MPI) $(LOADER_BIN)
 else
 else
-TESTS_ENVIRONMENT 	=	$(MPIEXEC) -np 4
+TESTS_ENVIRONMENT 	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
 endif
 endif
-TESTS			=	$(check_PROGRAMS)
+
+if STARPU_MPI_CHECK
+TESTS			=	$(starpu_mpi_TESTS)
 endif
 endif
 
 
-check_PROGRAMS =
+check_PROGRAMS = $(LOADER) $(starpu_mpi_TESTS)
 
 
 BUILT_SOURCES =
 BUILT_SOURCES =
 
 
@@ -49,14 +66,14 @@ endif
 
 
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS)
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS)
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
-AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/src -I$(top_builddir)/src
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/src -I$(top_builddir)/src -I$(top_srcdir)/examples/
 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)
 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)
 
 
 ########################
 ########################
 # Unit testcases       #
 # Unit testcases       #
 ########################
 ########################
 
 
-check_PROGRAMS +=				\
+starpu_mpi_TESTS =				\
 	pingpong				\
 	pingpong				\
 	mpi_test				\
 	mpi_test				\
 	mpi_isend				\
 	mpi_isend				\
@@ -77,7 +94,8 @@ check_PROGRAMS +=				\
 	insert_task_owner_data			\
 	insert_task_owner_data			\
 	multiple_send				\
 	multiple_send				\
 	mpi_scatter_gather			\
 	mpi_scatter_gather			\
-	mpi_reduction
+	mpi_reduction				\
+	user_defined_datatype
 
 
 noinst_PROGRAMS =				\
 noinst_PROGRAMS =				\
 	pingpong				\
 	pingpong				\
@@ -100,7 +118,8 @@ noinst_PROGRAMS =				\
 	insert_task_owner_data			\
 	insert_task_owner_data			\
 	multiple_send				\
 	multiple_send				\
 	mpi_scatter_gather			\
 	mpi_scatter_gather			\
-	mpi_reduction
+	mpi_reduction				\
+	user_defined_datatype
 
 
 mpi_isend_LDADD =					\
 mpi_isend_LDADD =					\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
@@ -144,6 +163,8 @@ mpi_scatter_gather_LDADD =			\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 mpi_reduction_LDADD =			\
 mpi_reduction_LDADD =			\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+user_defined_datatype_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 
 
 ring_SOURCES = ring.c
 ring_SOURCES = ring.c
 ring_async_SOURCES = ring_async.c
 ring_async_SOURCES = ring_async.c
@@ -155,6 +176,8 @@ ring_async_implicit_SOURCES += ring_kernel.cu
 endif
 endif
 mpi_reduction_SOURCES = mpi_reduction.c
 mpi_reduction_SOURCES = mpi_reduction.c
 mpi_reduction_SOURCES += mpi_reduction_kernels.c
 mpi_reduction_SOURCES += mpi_reduction_kernels.c
+user_defined_datatype_SOURCES = user_defined_datatype.c
+user_defined_datatype_SOURCES += $(top_srcdir)/examples/interface/complex_interface.c
 
 
 showcheck:
 showcheck:
 	-cat $(TEST_LOGS) /dev/null
 	-cat $(TEST_LOGS) /dev/null

+ 6 - 1
mpi/tests/block_interface.c

@@ -43,7 +43,7 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(&argc, &argv);
+	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	/* Node 0 will allocate a big block and only register an inner part of
 	/* Node 0 will allocate a big block and only register an inner part of
@@ -132,6 +132,11 @@ int main(int argc, char **argv)
 	FPRINTF(stdout, "Rank %d is done\n", rank);
 	FPRINTF(stdout, "Rank %d is done\n", rank);
 	fflush(stdout);
 	fflush(stdout);
 
 
+	if (rank == 0 || rank == 1)
+	{
+		starpu_data_unregister(block_handle);
+		free(block);
+	}
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
 
 

+ 7 - 1
mpi/tests/block_interface_pinned.c

@@ -43,7 +43,7 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(&argc, &argv);
+	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	/* Node 0 will allocate a big block and only register an inner part of
 	/* Node 0 will allocate a big block and only register an inner part of
@@ -132,6 +132,12 @@ int main(int argc, char **argv)
 
 
 	}
 	}
 
 
+	if (rank == 0 || rank == 1)
+	{
+		starpu_data_unregister(block_handle);
+		starpu_free(block);
+	}
+
 	FPRINTF(stdout, "Rank %d is done\n", rank);
 	FPRINTF(stdout, "Rank %d is done\n", rank);
 	fflush(stdout);
 	fflush(stdout);
 
 

+ 60 - 58
mpi/tests/insert_task.c

@@ -23,15 +23,15 @@ void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 	unsigned *x = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *x = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
 	unsigned *y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
 
-        FPRINTF(stdout, "VALUES: %u %u\n", *x, *y);
-        *x = (*x + *y) / 2;
+	FPRINTF(stdout, "VALUES: %u %u\n", *x, *y);
+	*x = (*x + *y) / 2;
 }
 }
 
 
 struct starpu_codelet mycodelet =
 struct starpu_codelet mycodelet =
 {
 {
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.cpu_funcs = {func_cpu, NULL},
-        .nbuffers = 2,
+	.nbuffers = 2,
 	.modes = {STARPU_RW, STARPU_R}
 	.modes = {STARPU_RW, STARPU_R}
 };
 };
 
 
@@ -41,99 +41,101 @@ struct starpu_codelet mycodelet =
 /* Returns the MPI node number where data indexes index is */
 /* Returns the MPI node number where data indexes index is */
 int my_distrib(int x, int y, int nb_nodes)
 int my_distrib(int x, int y, int nb_nodes)
 {
 {
-        return x % nb_nodes;
+	return x % nb_nodes;
 }
 }
 
 
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-        int rank, size, x, y;
-        int value=0, ret;
-        unsigned matrix[X][Y];
-        starpu_data_handle_t data_handles[X][Y];
+	int rank, size, x, y;
+	int value=0, ret;
+	unsigned matrix[X][Y];
+	starpu_data_handle_t data_handles[X][Y];
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(&argc, &argv);
+	ret = starpu_mpi_init(&argc, &argv, 1);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
-        for(x = 0; x < X; x++)
+	for(x = 0; x < X; x++)
 	{
 	{
-                for (y = 0; y < Y; y++)
+		for (y = 0; y < Y; y++)
 		{
 		{
-                        matrix[x][y] = (rank+1)*10 + value;
-                        value++;
-                }
-        }
+			matrix[x][y] = (rank+1)*10 + value;
+			value++;
+		}
+	}
 #if 0
 #if 0
-        for(x = 0; x < X; x++) {
-                FPRINTF(stdout, "[%d] ", rank);
-                for (y = 0; y < Y; y++) {
-                        FPRINTF(stdout, "%3d ", matrix[x][y]);
-                }
-                FPRINTF(stdout, "\n");
-        }
+	for(x = 0; x < X; x++)
+	{
+		FPRINTF(stdout, "[%d] ", rank);
+		for (y = 0; y < Y; y++)
+		{
+			FPRINTF(stdout, "%3d ", matrix[x][y]);
+		}
+		FPRINTF(stdout, "\n");
+	}
 #endif
 #endif
 
 
-        for(x = 0; x < X; x++)
+	for(x = 0; x < X; x++)
 	{
 	{
-                for (y = 0; y < Y; y++)
+		for (y = 0; y < Y; y++)
 		{
 		{
-                        int mpi_rank = my_distrib(x, y, size);
-                        if (mpi_rank == rank)
+			int mpi_rank = my_distrib(x, y, size);
+			if (mpi_rank == rank)
 			{
 			{
-                                //FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
-                                starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
-                        }
-                        else
+				//FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+				starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
+			}
+			else
 			{
 			{
-                                /* I don't own that index, but will need it for my computations */
-                                //FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
-                                starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
-                        }
-                        if (data_handles[x][y])
+				/* I don't own that index, but will need it for my computations */
+				//FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+				starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
+			}
+			if (data_handles[x][y])
 			{
 			{
-                                starpu_data_set_rank(data_handles[x][y], mpi_rank);
-                                starpu_data_set_tag(data_handles[x][y], (y*X)+x);
+				starpu_data_set_rank(data_handles[x][y], mpi_rank);
+				starpu_data_set_tag(data_handles[x][y], (y*X)+x);
 			}
 			}
-                }
-        }
+		}
+	}
 
 
-        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1][1], STARPU_R, data_handles[0][1], 0);
+	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1][1], STARPU_R, data_handles[0][1], 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
-        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
+	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
-        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0][1], STARPU_R, data_handles[0][0], 0);
+	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0][1], STARPU_R, data_handles[0][0], 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
-        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
+	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 
 
-        FPRINTF(stderr, "Waiting ...\n");
-        starpu_task_wait_for_all();
+	FPRINTF(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
 
 
-        for(x = 0; x < X; x++)
+	for(x = 0; x < X; x++)
 	{
 	{
-                for (y = 0; y < Y; y++)
+		for (y = 0; y < Y; y++)
 		{
 		{
-                        if (data_handles[x][y])
-                                starpu_data_unregister(data_handles[x][y]);
-                }
-        }
+			if (data_handles[x][y])
+				starpu_data_unregister(data_handles[x][y]);
+		}
+	}
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
 
 
 #if 0
 #if 0
-        for(x = 0; x < X; x++)
+	for(x = 0; x < X; x++)
 	{
 	{
-                FPRINTF(stdout, "[%d] ", rank);
-                for (y = 0; y < Y; y++)
+		FPRINTF(stdout, "[%d] ", rank);
+		for (y = 0; y < Y; y++)
 		{
 		{
-                        FPRINTF(stdout, "%3d ", matrix[x][y]);
-                }
-                FPRINTF(stdout, "\n");
-        }
+			FPRINTF(stdout, "%3d ", matrix[x][y]);
+		}
+		FPRINTF(stdout, "\n");
+	}
 #endif
 #endif
 
 
 	return 0;
 	return 0;

+ 72 - 70
mpi/tests/insert_task_block.c

@@ -25,137 +25,139 @@ void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 	int ny = (int)STARPU_MATRIX_GET_NY(descr[0]);
 	int ny = (int)STARPU_MATRIX_GET_NY(descr[0]);
 	int ld = (int)STARPU_MATRIX_GET_LD(descr[0]);
 	int ld = (int)STARPU_MATRIX_GET_LD(descr[0]);
 
 
-        int i, j;
-        unsigned sum=0;
+	int i, j;
+	unsigned sum=0;
 
 
 	for (i = 0; i < nx; i++)
 	for (i = 0; i < nx; i++)
 	{
 	{
 		for (j = 0; j < ny; j++)
 		for (j = 0; j < ny; j++)
 		{
 		{
-                        sum += matrix[i+j*ld];
-                }
-        }
+			sum += matrix[i+j*ld];
+		}
+	}
 	for (i = 0; i < nx; i++)
 	for (i = 0; i < nx; i++)
 	{
 	{
 		for (j = 0; j < ny; j++)
 		for (j = 0; j < ny; j++)
 		{
 		{
-                        matrix[i+j*ld] = sum;///(nx*ny);
-                }
-        }
+			matrix[i+j*ld] = sum;///(nx*ny);
+		}
+	}
 }
 }
 
 
 struct starpu_codelet mycodelet =
 struct starpu_codelet mycodelet =
 {
 {
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.cpu_funcs = {func_cpu, NULL},
-        .nbuffers = 1,
+	.nbuffers = 1,
 	.modes = {STARPU_RW}
 	.modes = {STARPU_RW}
 };
 };
 
 
-#define SIZE       6
-#define BLOCKS     3
+#define SIZE 6
+#define BLOCKS 3
 
 
 /* Returns the MPI node number where data indexes index is */
 /* Returns the MPI node number where data indexes index is */
 int my_distrib(int x, int y, int nb_nodes)
 int my_distrib(int x, int y, int nb_nodes)
 {
 {
-        return x % nb_nodes;
+	return x % nb_nodes;
 }
 }
 
 
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-        int rank, size, x, y;
-        int ret, value=0;
-        unsigned matrix[SIZE*SIZE];
-        starpu_data_handle_t data_handles[SIZE][SIZE];
+	int rank, size, x, y;
+	int ret, value=0;
+	unsigned matrix[SIZE*SIZE];
+	starpu_data_handle_t data_handles[SIZE][SIZE];
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(&argc, &argv);
+	ret = starpu_mpi_init(&argc, &argv, 1);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
-        for(x = 0; x < SIZE; x++)
+	for(x = 0; x < SIZE; x++)
 	{
 	{
-                for (y = 0; y < SIZE; y++)
+		for (y = 0; y < SIZE; y++)
 		{
 		{
-                        matrix[x+y*SIZE] = rank*100 + value;
-                        value++;
-                }
-        }
+			matrix[x+y*SIZE] = rank*100 + value;
+			value++;
+		}
+	}
 #if 1
 #if 1
-        for(x = 0; x < SIZE; x++) {
-                FPRINTF(stdout, "[%d] ", rank);
-                for (y = 0; y < SIZE; y++) {
-                        FPRINTF(stdout, "%3u ", matrix[x+y*SIZE]);
-                }
-                FPRINTF(stdout, "\n");
-        }
+	for(x = 0; x < SIZE; x++)
+	{
+		FPRINTF(stdout, "[%d] ", rank);
+		for (y = 0; y < SIZE; y++)
+		{
+			FPRINTF(stdout, "%3u ", matrix[x+y*SIZE]);
+		}
+		FPRINTF(stdout, "\n");
+	}
 #endif
 #endif
 
 
-        for(x = 0; x < BLOCKS ;  x++)
+	for(x = 0; x < BLOCKS ; x++)
 	{
 	{
-                for (y = 0; y < BLOCKS; y++)
+		for (y = 0; y < BLOCKS; y++)
 		{
 		{
-                        int mpi_rank = my_distrib(x, y, size);
-                        if (mpi_rank == rank)
+			int mpi_rank = my_distrib(x, y, size);
+			if (mpi_rank == rank)
 			{
 			{
-                                //FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
-                                starpu_matrix_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[((SIZE/BLOCKS)*x) + ((SIZE/BLOCKS)*y) * SIZE]),
-                                                            SIZE, SIZE/BLOCKS, SIZE/BLOCKS, sizeof(unsigned));
-                        }
-                        else
+				//FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+				starpu_matrix_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[((SIZE/BLOCKS)*x) + ((SIZE/BLOCKS)*y) * SIZE]),
+							    SIZE, SIZE/BLOCKS, SIZE/BLOCKS, sizeof(unsigned));
+			}
+			else
 			{
 			{
-                                /* I don't own that index, but will need it for my computations */
-                                //FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
-                                starpu_matrix_data_register(&data_handles[x][y], -1, (uintptr_t)&(matrix[((SIZE/BLOCKS)*x) + ((SIZE/BLOCKS)*y) * SIZE]),
-                                                            SIZE, SIZE/BLOCKS, SIZE/BLOCKS, sizeof(unsigned));
-                        }
-                        if (data_handles[x][y])
+				/* I don't own that index, but will need it for my computations */
+				//FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+				starpu_matrix_data_register(&data_handles[x][y], -1, (uintptr_t)&(matrix[((SIZE/BLOCKS)*x) + ((SIZE/BLOCKS)*y) * SIZE]),
+							    SIZE, SIZE/BLOCKS, SIZE/BLOCKS, sizeof(unsigned));
+			}
+			if (data_handles[x][y])
 			{
 			{
-                                starpu_data_set_rank(data_handles[x][y], mpi_rank);
-                                starpu_data_set_tag(data_handles[x][y], (y*BLOCKS)+x);
+				starpu_data_set_rank(data_handles[x][y], mpi_rank);
+				starpu_data_set_tag(data_handles[x][y], (y*BLOCKS)+x);
 			}
 			}
-                }
-        }
+		}
+	}
 
 
-        for(x = 0; x < BLOCKS; x++)
+	for(x = 0; x < BLOCKS; x++)
 	{
 	{
-                for (y = 0; y < BLOCKS; y++)
+		for (y = 0; y < BLOCKS; y++)
 		{
 		{
-                        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+			ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
 						     STARPU_RW, data_handles[x][y],
 						     STARPU_RW, data_handles[x][y],
 						     0);
 						     0);
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+		}
+	}
 
 
-                }
-        }
-
-        FPRINTF(stderr, "Waiting ...\n");
-        starpu_task_wait_for_all();
+	FPRINTF(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
 
 
-        for(x = 0; x < BLOCKS; x++)
+	for(x = 0; x < BLOCKS; x++)
 	{
 	{
-                for (y = 0; y < BLOCKS; y++)
+		for (y = 0; y < BLOCKS; y++)
 		{
 		{
-                        if (data_handles[x][y])
-                                starpu_data_unregister(data_handles[x][y]);
-                }
-        }
+			if (data_handles[x][y])
+				starpu_data_unregister(data_handles[x][y]);
+		}
+	}
 
 
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
 
 
 #if 1
 #if 1
-        for(x = 0; x < SIZE; x++)
+	for(x = 0; x < SIZE; x++)
 	{
 	{
-                FPRINTF(stdout, "[%d] ", rank);
-                for (y = 0; y < SIZE; y++) {
-                        FPRINTF(stdout, "%3u ", matrix[x+y*SIZE]);
-                }
-                FPRINTF(stdout, "\n");
-        }
+		FPRINTF(stdout, "[%d] ", rank);
+		for (y = 0; y < SIZE; y++)
+		{
+			FPRINTF(stdout, "%3u ", matrix[x+y*SIZE]);
+		}
+		FPRINTF(stdout, "\n");
+	}
 #endif
 #endif
 
 
 	return 0;
 	return 0;

+ 18 - 13
mpi/tests/insert_task_cache.c

@@ -35,7 +35,7 @@ struct starpu_codelet mycodelet =
 {
 {
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.cpu_funcs = {func_cpu, NULL},
-        .nbuffers = 2,
+	.nbuffers = 2,
 	.modes = {STARPU_RW, STARPU_R}
 	.modes = {STARPU_RW, STARPU_R}
 };
 };
 
 
@@ -44,15 +44,15 @@ struct starpu_codelet mycodelet =
 /* Returns the MPI node number where data indexes index is */
 /* Returns the MPI node number where data indexes index is */
 int my_distrib(int x)
 int my_distrib(int x)
 {
 {
-        return x;
+	return x;
 }
 }
 
 
 void test_cache(int rank, int size, int enabled, size_t *comm_amount)
 void test_cache(int rank, int size, int enabled, size_t *comm_amount)
 {
 {
-        int i;
-        int ret;
+	int i;
+	int ret;
 	unsigned v[2][N];
 	unsigned v[2][N];
-        starpu_data_handle_t data_handles[2];
+	starpu_data_handle_t data_handles[2];
 	char *string;
 	char *string;
 
 
 	string = malloc(50);
 	string = malloc(50);
@@ -61,10 +61,10 @@ void test_cache(int rank, int size, int enabled, size_t *comm_amount)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL);
+	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
-        for(i = 0; i < 2; i++)
+	for(i = 0; i < 2; i++)
 	{
 	{
 		int mpi_rank = my_distrib(i);
 		int mpi_rank = my_distrib(i);
 		if (mpi_rank == rank)
 		if (mpi_rank == rank)
@@ -80,30 +80,31 @@ void test_cache(int rank, int size, int enabled, size_t *comm_amount)
 		}
 		}
 		starpu_data_set_rank(data_handles[i], mpi_rank);
 		starpu_data_set_rank(data_handles[i], mpi_rank);
 		starpu_data_set_tag(data_handles[i], i);
 		starpu_data_set_tag(data_handles[i], i);
-        }
+	}
 
 
-        for(i = 0; i < 5; i++)
+	for(i = 0; i < 5; i++)
 	{
 	{
 		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0], STARPU_R, data_handles[1], 0);
 		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0], STARPU_R, data_handles[1], 0);
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 	}
 	}
 
 
-        for(i = 0; i < 5; i++)
+	for(i = 0; i < 5; i++)
 	{
 	{
 		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1], STARPU_R, data_handles[0], 0);
 		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1], STARPU_R, data_handles[0], 0);
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 	}
 	}
 
 
-        starpu_task_wait_for_all();
+	starpu_task_wait_for_all();
 
 
-        for(i = 0; i < 2; i++)
+	for(i = 0; i < 2; i++)
 	{
 	{
 		starpu_data_unregister(data_handles[i]);
 		starpu_data_unregister(data_handles[i]);
-        }
+	}
 
 
 	starpu_mpi_comm_amounts_retrieve(comm_amount);
 	starpu_mpi_comm_amounts_retrieve(comm_amount);
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
+	free(string);
 }
 }
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
@@ -137,6 +138,10 @@ int main(int argc, char **argv)
 	else
 	else
 		result = 1;
 		result = 1;
 
 
+	free(comm_amount_without_cache);
+	free(comm_amount_with_cache);
+	free(string);
+
 	MPI_Finalize();
 	MPI_Finalize();
 	return !result;
 	return !result;
 }
 }

+ 52 - 49
mpi/tests/insert_task_owner.c

@@ -23,7 +23,7 @@ void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 	int node;
 	int node;
 	int rank;
 	int rank;
 
 
-        starpu_codelet_unpack_args(_args, &node);
+	starpu_codelet_unpack_args(_args, &node);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	FPRINTF(stderr, "Expected node: %d - Actual node: %d\n", node, rank);
 	FPRINTF(stderr, "Expected node: %d - Actual node: %d\n", node, rank);
 
 
@@ -34,7 +34,7 @@ struct starpu_codelet mycodelet_r_w =
 {
 {
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.cpu_funcs = {func_cpu, NULL},
-        .nbuffers = 2,
+	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_W}
 	.modes = {STARPU_R, STARPU_W}
 };
 };
 
 
@@ -42,7 +42,7 @@ struct starpu_codelet mycodelet_rw_r =
 {
 {
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.cpu_funcs = {func_cpu, NULL},
-        .nbuffers = 2,
+	.nbuffers = 2,
 	.modes = {STARPU_RW, STARPU_R}
 	.modes = {STARPU_RW, STARPU_R}
 };
 };
 
 
@@ -50,7 +50,7 @@ struct starpu_codelet mycodelet_rw_rw =
 {
 {
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.cpu_funcs = {func_cpu, NULL},
-        .nbuffers = 2,
+	.nbuffers = 2,
 	.modes = {STARPU_RW, STARPU_RW}
 	.modes = {STARPU_RW, STARPU_RW}
 };
 };
 
 
@@ -58,7 +58,7 @@ struct starpu_codelet mycodelet_w_r =
 {
 {
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.cpu_funcs = {func_cpu, NULL},
-        .nbuffers = 2,
+	.nbuffers = 2,
 	.modes = {STARPU_W, STARPU_R}
 	.modes = {STARPU_W, STARPU_R}
 };
 };
 
 
@@ -66,109 +66,112 @@ struct starpu_codelet mycodelet_r_r =
 {
 {
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.cpu_funcs = {func_cpu, NULL},
-        .nbuffers = 2,
+	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_R}
 	.modes = {STARPU_R, STARPU_R}
 };
 };
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-        int ret, rank, size, err, node;
-        int x0=32, x1=23;
-        starpu_data_handle_t data_handlesx0;
-        starpu_data_handle_t data_handlesx1;
+	int ret, rank, size, err, node;
+	int x0=32, x1=23;
+	starpu_data_handle_t data_handlesx0;
+	starpu_data_handle_t data_handlesx1;
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(&argc, &argv);
+	ret = starpu_mpi_init(&argc, &argv, 1);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
-        if (rank == 0)
+	if (rank != 0 && rank != 1) goto end;
+
+	if (rank == 0)
 	{
 	{
-                starpu_variable_data_register(&data_handlesx0, 0, (uintptr_t)&x0, sizeof(x0));
-                starpu_data_set_rank(data_handlesx0, rank);
+		starpu_variable_data_register(&data_handlesx0, 0, (uintptr_t)&x0, sizeof(x0));
+		starpu_data_set_rank(data_handlesx0, rank);
 		starpu_data_set_tag(data_handlesx0, 0);
 		starpu_data_set_tag(data_handlesx0, 0);
-                starpu_variable_data_register(&data_handlesx1, -1, (uintptr_t)NULL, sizeof(int));
-                starpu_data_set_rank(data_handlesx1, 1);
+		starpu_variable_data_register(&data_handlesx1, -1, (uintptr_t)NULL, sizeof(int));
+		starpu_data_set_rank(data_handlesx1, 1);
 		starpu_data_set_tag(data_handlesx1, 1);
 		starpu_data_set_tag(data_handlesx1, 1);
-        }
-        else if (rank == 1)
+	}
+	else if (rank == 1)
 	{
 	{
-                starpu_variable_data_register(&data_handlesx1, 0, (uintptr_t)&x1, sizeof(x1));
-                starpu_data_set_rank(data_handlesx1, rank);
+		starpu_variable_data_register(&data_handlesx1, 0, (uintptr_t)&x1, sizeof(x1));
+		starpu_data_set_rank(data_handlesx1, rank);
 		starpu_data_set_tag(data_handlesx1, 1);
 		starpu_data_set_tag(data_handlesx1, 1);
-                starpu_variable_data_register(&data_handlesx0, -1, (uintptr_t)NULL, sizeof(int));
-                starpu_data_set_rank(data_handlesx0, 0);
+		starpu_variable_data_register(&data_handlesx0, -1, (uintptr_t)NULL, sizeof(int));
+		starpu_data_set_rank(data_handlesx0, 0);
 		starpu_data_set_tag(data_handlesx0, 0);
 		starpu_data_set_tag(data_handlesx0, 0);
-        }
-
-	if (rank != 0 && rank != 1) goto end;
+	}
 
 
 	node = starpu_data_get_rank(data_handlesx1);
 	node = starpu_data_get_rank(data_handlesx1);
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_w,
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_w,
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_R, data_handlesx0, STARPU_W, data_handlesx1,
 				     STARPU_R, data_handlesx0, STARPU_W, data_handlesx1,
 				     0);
 				     0);
-        assert(err == 0);
+	assert(err == 0);
 
 
 	node = starpu_data_get_rank(data_handlesx0);
 	node = starpu_data_get_rank(data_handlesx0);
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_r,
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_r,
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_RW, data_handlesx0, STARPU_R, data_handlesx1,
 				     STARPU_RW, data_handlesx0, STARPU_R, data_handlesx1,
 				     0);
 				     0);
-        assert(err == 0);
+	assert(err == 0);
 
 
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1,
 				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1,
 				     0);
 				     0);
-        assert(err == -EINVAL);
+	assert(err == -EINVAL);
 
 
 	node = 1;
 	node = 1;
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
 				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
 				     0);
 				     0);
-        assert(err == 0);
+	assert(err == 0);
 
 
 	node = 0;
 	node = 0;
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
 				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
 				     0);
 				     0);
-        assert(err == 0);
+	assert(err == 0);
 
 
 	node = 0;
 	node = 0;
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_r,
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_r,
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_R, data_handlesx0, STARPU_R, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
 				     STARPU_R, data_handlesx0, STARPU_R, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
 				     0);
 				     0);
-        assert(err == 0);
+	assert(err == 0);
 
 
-        /* Here the value specified by the property STARPU_EXECUTE_ON_NODE is
-           going to overwrite the node even though the data model clearly specifies
-           which node is going to execute the codelet */
+	/* Here the value specified by the property STARPU_EXECUTE_ON_NODE is
+	   going to overwrite the node even though the data model clearly specifies
+	   which node is going to execute the codelet */
 	node = 0;
 	node = 0;
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_w,
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_w,
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_R, data_handlesx0, STARPU_W, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
 				     STARPU_R, data_handlesx0, STARPU_W, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
 				     0);
 				     0);
-        assert(err == 0);
+	assert(err == 0);
 
 
-        /* Here the value specified by the property STARPU_EXECUTE_ON_NODE is
-           going to overwrite the node even though the data model clearly specifies
-           which node is going to execute the codelet */
+	/* Here the value specified by the property STARPU_EXECUTE_ON_NODE is
+	   going to overwrite the node even though the data model clearly specifies
+	   which node is going to execute the codelet */
 	node = 0;
 	node = 0;
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_w_r,
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_w_r,
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_W, data_handlesx0, STARPU_R, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
 				     STARPU_W, data_handlesx0, STARPU_R, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
 				     0);
 				     0);
-        assert(err == 0);
+	assert(err == 0);
 
 
-end:
 	fprintf(stderr, "Waiting ...\n");
 	fprintf(stderr, "Waiting ...\n");
-        starpu_task_wait_for_all();
+	starpu_task_wait_for_all();
+	starpu_data_unregister(data_handlesx0);
+	starpu_data_unregister(data_handlesx1);
+
+end:
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
 
 

+ 48 - 47
mpi/tests/insert_task_owner2.c

@@ -25,66 +25,66 @@ void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 	int *x2 = (int *)STARPU_VARIABLE_GET_PTR(descr[2]);
 	int *x2 = (int *)STARPU_VARIABLE_GET_PTR(descr[2]);
 	int *y = (int *)STARPU_VARIABLE_GET_PTR(descr[3]);
 	int *y = (int *)STARPU_VARIABLE_GET_PTR(descr[3]);
 
 
-//        FPRINTF(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
-//
-//        *x2 = 45;
-//        *y = 144;
-//
-        FPRINTF(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
-        *y = (*x0 + *x1) * 100;
-        *x1 = 12;
-        *x2 = 24;
-        *x0 = 36;
-        FPRINTF(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
+	//FPRINTF(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
+	//*x2 = 45;
+	//*y = 144;
+
+	FPRINTF(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
+	*y = (*x0 + *x1) * 100;
+	*x1 = 12;
+	*x2 = 24;
+	*x0 = 36;
+	FPRINTF(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
 }
 }
 
 
 struct starpu_codelet mycodelet =
 struct starpu_codelet mycodelet =
 {
 {
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.cpu_funcs = {func_cpu, NULL},
-        .nbuffers = 4,
+	.nbuffers = 4,
 	.modes = {STARPU_R, STARPU_RW, STARPU_W, STARPU_W}
 	.modes = {STARPU_R, STARPU_RW, STARPU_W, STARPU_W}
 };
 };
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-        int rank, size, err;
-        int x[3], y=0;
-        int i, ret;
-        starpu_data_handle_t data_handles[4];
+	int rank, size, err;
+	int x[3], y=0;
+	int i, ret;
+	starpu_data_handle_t data_handles[4];
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(&argc, &argv);
+	ret = starpu_mpi_init(&argc, &argv, 1);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
-        if (rank == 0)
+	if (rank == 0)
 	{
 	{
-                for(i=0 ; i<3 ; i++)
+		for(i=0 ; i<3 ; i++)
 		{
 		{
-                        x[i] = 10*(i+1);
-                        starpu_variable_data_register(&data_handles[i], 0, (uintptr_t)&x[i], sizeof(x[i]));
-                }
-                y = -1;
-                starpu_variable_data_register(&data_handles[3], -1, (uintptr_t)NULL, sizeof(int));
-        }
-        else if (rank == 1)
+			x[i] = 10*(i+1);
+			starpu_variable_data_register(&data_handles[i], 0, (uintptr_t)&x[i], sizeof(x[i]));
+		}
+		y = -1;
+		starpu_variable_data_register(&data_handles[3], -1, (uintptr_t)NULL, sizeof(int));
+	}
+	else if (rank == 1)
 	{
 	{
-                for(i=0 ; i<3 ; i++)
+		for(i=0 ; i<3 ; i++)
 		{
 		{
-                        x[i] = -1;
-                        starpu_variable_data_register(&data_handles[i], -1, (uintptr_t)NULL, sizeof(int));
-                }
-                y=200;
-                starpu_variable_data_register(&data_handles[3], 0, (uintptr_t)&y, sizeof(int));
-        } else
+			x[i] = -1;
+			starpu_variable_data_register(&data_handles[i], -1, (uintptr_t)NULL, sizeof(int));
+		}
+		y=200;
+		starpu_variable_data_register(&data_handles[3], 0, (uintptr_t)&y, sizeof(int));
+	}
+	else
 	{
 	{
-                for(i=0 ; i<4 ; i++)
-                        starpu_variable_data_register(&data_handles[i], -1, (uintptr_t)NULL, sizeof(int));
+		for(i=0 ; i<4 ; i++)
+			starpu_variable_data_register(&data_handles[i], -1, (uintptr_t)NULL, sizeof(int));
 	}
 	}
-        FPRINTF(stderr, "[%d][init] VALUES: %d %d %d %d\n", rank, x[0], x[1], x[2], y);
+	FPRINTF(stderr, "[%d][init] VALUES: %d %d %d %d\n", rank, x[0], x[1], x[2], y);
 
 
 	for(i=0 ; i<3 ; i++)
 	for(i=0 ; i<3 ; i++)
 	{
 	{
@@ -94,23 +94,24 @@ int main(int argc, char **argv)
 	starpu_data_set_rank(data_handles[3], 1);
 	starpu_data_set_rank(data_handles[3], 1);
 	starpu_data_set_tag(data_handles[3], 3);
 	starpu_data_set_tag(data_handles[3], 3);
 
 
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
-                                     STARPU_R, data_handles[0], STARPU_RW, data_handles[1],
-                                     STARPU_W, data_handles[2],
-                                     STARPU_W, data_handles[3],
-                                     STARPU_EXECUTE_ON_NODE, 1, 0);
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+				     STARPU_R, data_handles[0], STARPU_RW, data_handles[1],
+				     STARPU_W, data_handles[2],
+				     STARPU_W, data_handles[3],
+				     STARPU_EXECUTE_ON_NODE, 1, 0);
 	STARPU_CHECK_RETURN_VALUE(err, "starpu_mpi_insert_task");
 	STARPU_CHECK_RETURN_VALUE(err, "starpu_mpi_insert_task");
-        starpu_task_wait_for_all();
+	starpu_task_wait_for_all();
 
 
-        int *values = malloc(4 * sizeof(int *));
-        for(i=0 ; i<4 ; i++)
+	int *values = malloc(4 * sizeof(int *));
+	for(i=0 ; i<4 ; i++)
 	{
 	{
-                starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[i], 0, NULL, NULL);
-		if (rank == 0) {
+		starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[i], 0, NULL, NULL);
+		if (rank == 0)
+		{
 			starpu_data_acquire(data_handles[i], STARPU_R);
 			starpu_data_acquire(data_handles[i], STARPU_R);
 			values[i] = *((int *)starpu_handle_get_local_ptr(data_handles[i]));
 			values[i] = *((int *)starpu_handle_get_local_ptr(data_handles[i]));
 		}
 		}
-        }
+	}
         FPRINTF(stderr, "[%d][local ptr] VALUES: %d %d %d %d\n", rank, values[0], values[1], values[2], values[3]);
         FPRINTF(stderr, "[%d][local ptr] VALUES: %d %d %d %d\n", rank, values[0], values[1], values[2], values[3]);
         FPRINTF(stderr, "[%d][end] VALUES: %d %d %d %d\n", rank, x[0], x[1], x[2], y);
         FPRINTF(stderr, "[%d][end] VALUES: %d %d %d %d\n", rank, x[0], x[1], x[2], y);
 
 

+ 26 - 21
mpi/tests/insert_task_owner_data.c

@@ -31,68 +31,73 @@ struct starpu_codelet mycodelet =
 {
 {
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.cpu_funcs = {func_cpu, NULL},
-        .nbuffers = 2,
+	.nbuffers = 2,
 	.modes = {STARPU_RW, STARPU_RW}
 	.modes = {STARPU_RW, STARPU_RW}
 };
 };
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-        int rank, size, err;
-        int x[2];
-        int ret, i;
-        starpu_data_handle_t data_handles[2];
+	int rank, size, err;
+	int x[2];
+	int ret, i;
+	starpu_data_handle_t data_handles[2];
 	int values[2];
 	int values[2];
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(&argc, &argv);
+	ret = starpu_mpi_init(&argc, &argv, 1);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 
-        if (rank == 0)
+	if (rank == 0)
 	{
 	{
 		x[0] = 11;
 		x[0] = 11;
 		starpu_variable_data_register(&data_handles[0], 0, (uintptr_t)&x[0], sizeof(x[0]));
 		starpu_variable_data_register(&data_handles[0], 0, (uintptr_t)&x[0], sizeof(x[0]));
 		starpu_variable_data_register(&data_handles[1], -1, (uintptr_t)NULL, sizeof(x[1]));
 		starpu_variable_data_register(&data_handles[1], -1, (uintptr_t)NULL, sizeof(x[1]));
-        }
-        else if (rank == 1)
+	}
+	else if (rank == 1)
 	{
 	{
 		x[1] = 12;
 		x[1] = 12;
 		starpu_variable_data_register(&data_handles[0], -1, (uintptr_t)NULL, sizeof(x[0]));
 		starpu_variable_data_register(&data_handles[0], -1, (uintptr_t)NULL, sizeof(x[0]));
 		starpu_variable_data_register(&data_handles[1], 0, (uintptr_t)&x[1], sizeof(x[1]));
 		starpu_variable_data_register(&data_handles[1], 0, (uintptr_t)&x[1], sizeof(x[1]));
-        }
+	}
 	else
 	else
 	{
 	{
 		starpu_variable_data_register(&data_handles[0], -1, (uintptr_t)NULL, sizeof(x[0]));
 		starpu_variable_data_register(&data_handles[0], -1, (uintptr_t)NULL, sizeof(x[0]));
 		starpu_variable_data_register(&data_handles[1], -1, (uintptr_t)NULL, sizeof(x[1]));
 		starpu_variable_data_register(&data_handles[1], -1, (uintptr_t)NULL, sizeof(x[1]));
-        }
+	}
 
 
 	starpu_data_set_rank(data_handles[0], 0);
 	starpu_data_set_rank(data_handles[0], 0);
 	starpu_data_set_tag(data_handles[0], 0);
 	starpu_data_set_tag(data_handles[0], 0);
 	starpu_data_set_rank(data_handles[1], 1);
 	starpu_data_set_rank(data_handles[1], 1);
 	starpu_data_set_tag(data_handles[1], 1);
 	starpu_data_set_tag(data_handles[1], 1);
 
 
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
-                                     STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
-                                     STARPU_EXECUTE_ON_DATA, data_handles[1],
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+				     STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
+				     STARPU_EXECUTE_ON_DATA, data_handles[1],
 				     0);
 				     0);
-        assert(err == 0);
-        starpu_task_wait_for_all();
+	assert(err == 0);
+	starpu_task_wait_for_all();
 
 
-        for(i=0 ; i<2 ; i++)
+	for(i=0 ; i<2 ; i++)
 	{
 	{
-                starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[i], 0, NULL, NULL);
-		if (rank == 0) {
+		starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[i], 0, NULL, NULL);
+		if (rank == 0)
+		{
 			starpu_data_acquire(data_handles[i], STARPU_R);
 			starpu_data_acquire(data_handles[i], STARPU_R);
 			values[i] = *((int *)starpu_handle_get_local_ptr(data_handles[i]));
 			values[i] = *((int *)starpu_handle_get_local_ptr(data_handles[i]));
+			starpu_data_release(data_handles[i]);
 		}
 		}
-        }
-        FPRINTF(stderr, "[%d][local ptr] VALUES: %d %d\n", rank, values[0], values[1]);
+	}
+	FPRINTF(stderr, "[%d][local ptr] VALUES: %d %d\n", rank, values[0], values[1]);
 	ret = 0;
 	ret = 0;
 	if (rank == 0 && (values[0] != 12 || values[1] != 144))
 	if (rank == 0 && (values[0] != 12 || values[1] != 144))
 		ret = EXIT_FAILURE;
 		ret = EXIT_FAILURE;
 
 
+	starpu_data_unregister(data_handles[0]);
+	starpu_data_unregister(data_handles[1]);
+
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
 
 

+ 6 - 2
mpi/tests/mpi_detached_tag.c

@@ -18,7 +18,11 @@
 #include <starpu_mpi.h>
 #include <starpu_mpi.h>
 #include "helper.h"
 #include "helper.h"
 
 
-#define NITER	2048
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
 #define SIZE	16
 #define SIZE	16
 
 
 float *tab;
 float *tab;
@@ -43,7 +47,7 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL);
+	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	tab = malloc(SIZE*sizeof(float));
 	tab = malloc(SIZE*sizeof(float));

+ 6 - 2
mpi/tests/mpi_irecv.c

@@ -18,7 +18,11 @@
 #include <starpu_mpi.h>
 #include <starpu_mpi.h>
 #include "helper.h"
 #include "helper.h"
 
 
-#define NITER	2048
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
 #define SIZE	16
 #define SIZE	16
 
 
 float *tab;
 float *tab;
@@ -43,7 +47,7 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL);
+	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	tab = malloc(SIZE*sizeof(float));
 	tab = malloc(SIZE*sizeof(float));

+ 9 - 5
mpi/tests/mpi_irecv_detached.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -19,14 +19,18 @@
 #include <common/utils.h>
 #include <common/utils.h>
 #include "helper.h"
 #include "helper.h"
 
 
-#define NITER	2048
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
 #define SIZE	16
 #define SIZE	16
 
 
 float *tab;
 float *tab;
 starpu_data_handle_t tab_handle;
 starpu_data_handle_t tab_handle;
 
 
-static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
-static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
+static _starpu_pthread_mutex_t mutex = _STARPU_PTHREAD_MUTEX_INITIALIZER;
+static _starpu_pthread_cond_t cond = _STARPU_PTHREAD_COND_INITIALIZER;
 
 
 void callback(void *arg __attribute__((unused)))
 void callback(void *arg __attribute__((unused)))
 {
 {
@@ -58,7 +62,7 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL);
+	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	tab = malloc(SIZE*sizeof(float));
 	tab = malloc(SIZE*sizeof(float));

+ 6 - 2
mpi/tests/mpi_isend.c

@@ -18,7 +18,11 @@
 #include <starpu_mpi.h>
 #include <starpu_mpi.h>
 #include "helper.h"
 #include "helper.h"
 
 
-#define NITER	2048
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
 #define SIZE	16
 #define SIZE	16
 
 
 float *tab;
 float *tab;
@@ -43,7 +47,7 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL);
+	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	tab = malloc(SIZE*sizeof(float));
 	tab = malloc(SIZE*sizeof(float));

+ 21 - 13
mpi/tests/mpi_isend_detached.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -20,21 +20,22 @@
 #include <pthread.h>
 #include <pthread.h>
 #include "helper.h"
 #include "helper.h"
 
 
-#define NITER	2048
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
 #define SIZE	16
 #define SIZE	16
 
 
-static float *tab;
-static starpu_data_handle_t tab_handle;
+static _starpu_pthread_mutex_t mutex = _STARPU_PTHREAD_MUTEX_INITIALIZER;
+static _starpu_pthread_cond_t cond = _STARPU_PTHREAD_COND_INITIALIZER;
 
 
-static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
-static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
-
-void callback(void *arg __attribute__((unused)))
+void callback(void *arg)
 {
 {
-	unsigned *sent = arg;
+	unsigned *completed = arg;
 
 
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
-	*sent = 1;
+	*completed = 1;
 	_STARPU_PTHREAD_COND_SIGNAL(&cond);
 	_STARPU_PTHREAD_COND_SIGNAL(&cond);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 }
 }
@@ -42,6 +43,8 @@ void callback(void *arg __attribute__((unused)))
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
 	int ret, rank, size;
 	int ret, rank, size;
+	float *tab;
+	starpu_data_handle_t tab_handle;
 
 
 	MPI_Init(NULL, NULL);
 	MPI_Init(NULL, NULL);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
@@ -58,7 +61,7 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL);
+	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	tab = malloc(SIZE*sizeof(float));
 	tab = malloc(SIZE*sizeof(float));
@@ -83,8 +86,13 @@ int main(int argc, char **argv)
 		}
 		}
 		else
 		else
 		{
 		{
-			MPI_Status status;
-			starpu_mpi_recv(tab_handle, other_rank, loop, MPI_COMM_WORLD, &status);
+			int received = 0;
+			starpu_mpi_irecv_detached(tab_handle, other_rank, loop, MPI_COMM_WORLD, callback, &received);
+
+			_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+			while (!received)
+				_STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
+			_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 		}
 		}
 	}
 	}
 
 

+ 12 - 12
mpi/tests/mpi_reduction.c

@@ -65,16 +65,16 @@ int my_distrib(int x, int nb_nodes)
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
 	int my_rank, size, x, y, i;
 	int my_rank, size, x, y, i;
-        long int *vector;
+	long int *vector;
 	long int dot, sum=0;
 	long int dot, sum=0;
-        starpu_data_handle_t *handles;
+	starpu_data_handle_t *handles;
 	starpu_data_handle_t dot_handle;
 	starpu_data_handle_t dot_handle;
 
 
 	int nb_elements, step, loops;
 	int nb_elements, step, loops;
 
 
 	int ret = starpu_init(NULL);
 	int ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(&argc, &argv);
+	ret = starpu_mpi_init(&argc, &argv, 1);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
@@ -84,7 +84,7 @@ int main(int argc, char **argv)
 	loops = 5;
 	loops = 5;
 
 
 	vector = (long int *) malloc(nb_elements*sizeof(vector[0]));
 	vector = (long int *) malloc(nb_elements*sizeof(vector[0]));
-        for(x = 0; x < nb_elements; x+=step)
+	for(x = 0; x < nb_elements; x+=step)
 	{
 	{
 		int mpi_rank = my_distrib(x/step, size);
 		int mpi_rank = my_distrib(x/step, size);
 		if (mpi_rank == my_rank)
 		if (mpi_rank == my_rank)
@@ -94,7 +94,7 @@ int main(int argc, char **argv)
 				vector[x+y] = x+y+1;
 				vector[x+y] = x+y+1;
 			}
 			}
 		}
 		}
-        }
+	}
 	if (my_rank == 0) {
 	if (my_rank == 0) {
 		dot = 14;
 		dot = 14;
 		sum = (nb_elements * (nb_elements + 1)) / 2;
 		sum = (nb_elements * (nb_elements + 1)) / 2;
@@ -109,7 +109,7 @@ int main(int argc, char **argv)
 
 
 
 
 	handles = (starpu_data_handle_t *) malloc(nb_elements*sizeof(handles[0]));
 	handles = (starpu_data_handle_t *) malloc(nb_elements*sizeof(handles[0]));
-        for(x = 0; x < nb_elements; x+=step)
+	for(x = 0; x < nb_elements; x+=step)
 	{
 	{
 		int mpi_rank = my_distrib(x/step, size);
 		int mpi_rank = my_distrib(x/step, size);
 		if (mpi_rank == my_rank)
 		if (mpi_rank == my_rank)
@@ -146,10 +146,10 @@ int main(int argc, char **argv)
 		starpu_mpi_insert_task(MPI_COMM_WORLD, &display_codelet, STARPU_R, dot_handle, 0);
 		starpu_mpi_insert_task(MPI_COMM_WORLD, &display_codelet, STARPU_R, dot_handle, 0);
 	}
 	}
 
 
-        fprintf(stderr, "Waiting ...\n");
-        starpu_task_wait_for_all();
+	fprintf(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
 
 
-        for(x = 0; x < nb_elements; x+=step)
+	for(x = 0; x < nb_elements; x+=step)
 	{
 	{
 		if (handles[x]) starpu_data_unregister(handles[x]);
 		if (handles[x]) starpu_data_unregister(handles[x]);
 	}
 	}
@@ -165,10 +165,10 @@ int main(int argc, char **argv)
 
 
 	if (my_rank == 0)
 	if (my_rank == 0)
 	{
 	{
-                fprintf(stderr, "[%d] sum=%ld\n", my_rank, sum);
-                fprintf(stderr, "[%d] dot=%ld\n", my_rank, dot);
+		fprintf(stderr, "[%d] sum=%ld\n", my_rank, sum);
+		fprintf(stderr, "[%d] dot=%ld\n", my_rank, dot);
 		fprintf(stderr, "%s when computing reduction\n", (sum == dot) ? "Success" : "Error");
 		fprintf(stderr, "%s when computing reduction\n", (sum == dot) ? "Success" : "Error");
-        }
+	}
 
 
 	return 0;
 	return 0;
 }
 }

+ 15 - 15
mpi/tests/mpi_scatter_gather.c

@@ -19,7 +19,7 @@
 /* Returns the MPI node number where data indexes index is */
 /* Returns the MPI node number where data indexes index is */
 int my_distrib(int x, int y, int nb_nodes)
 int my_distrib(int x, int y, int nb_nodes)
 {
 {
-        return (x+y) % nb_nodes;
+	return (x+y) % nb_nodes;
 }
 }
 
 
 void cpu_codelet(void *descr[], void *_args)
 void cpu_codelet(void *descr[], void *_args)
@@ -32,7 +32,7 @@ void cpu_codelet(void *descr[], void *_args)
 	float factor;
 	float factor;
 
 
 	block = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
 	block = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
-        starpu_codelet_unpack_args(_args, &rank);
+	starpu_codelet_unpack_args(_args, &rank);
 	factor = block[0];
 	factor = block[0];
 
 
 	//fprintf(stderr,"rank %d factor %f\n", rank, factor);
 	//fprintf(stderr,"rank %d factor %f\n", rank, factor);
@@ -68,9 +68,9 @@ void rcallback(void *arg __attribute__((unused)))
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-        int rank, nodes;
+	int rank, nodes;
 	float ***bmat = NULL;
 	float ***bmat = NULL;
-        starpu_data_handle_t *data_handles;
+	starpu_data_handle_t *data_handles;
 
 
 	unsigned i,j,x,y;
 	unsigned i,j,x,y;
 
 
@@ -81,7 +81,7 @@ int main(int argc, char **argv)
 
 
 	int ret = starpu_init(NULL);
 	int ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(&argc, &argv);
+	ret = starpu_mpi_init(&argc, &argv, 1);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
 	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
@@ -135,10 +135,10 @@ int main(int argc, char **argv)
 #endif
 #endif
 
 
 	/* Allocate data handles and register data to StarPU */
 	/* Allocate data handles and register data to StarPU */
-        data_handles = malloc(nblocks*nblocks*sizeof(starpu_data_handle_t *));
-        for(x = 0; x < nblocks ;  x++)
+	data_handles = malloc(nblocks*nblocks*sizeof(starpu_data_handle_t *));
+	for(x = 0; x < nblocks ; x++)
 	{
 	{
-                for (y = 0; y < nblocks; y++)
+		for (y = 0; y < nblocks; y++)
 		{
 		{
 			int mpi_rank = my_distrib(x, y, nodes);
 			int mpi_rank = my_distrib(x, y, nodes);
 			if (rank == 0)
 			if (rank == 0)
@@ -158,19 +158,19 @@ int main(int argc, char **argv)
 				/* I know it's useless to allocate anything for this */
 				/* I know it's useless to allocate anything for this */
 				data_handles[x+y*nblocks] = NULL;
 				data_handles[x+y*nblocks] = NULL;
 			}
 			}
-                        if (data_handles[x+y*nblocks])
+			if (data_handles[x+y*nblocks])
 			{
 			{
-                                starpu_data_set_rank(data_handles[x+y*nblocks], mpi_rank);
-                                starpu_data_set_tag(data_handles[x+y*nblocks], (y*nblocks)+x);
+				starpu_data_set_rank(data_handles[x+y*nblocks], mpi_rank);
+				starpu_data_set_tag(data_handles[x+y*nblocks], (y*nblocks)+x);
 			}
 			}
-                }
-        }
+		}
+	}
 
 
 	/* Scatter the matrix among the nodes */
 	/* Scatter the matrix among the nodes */
 	starpu_mpi_scatter_detached(data_handles, nblocks*nblocks, 0, MPI_COMM_WORLD, scallback, "scatter", NULL, NULL);
 	starpu_mpi_scatter_detached(data_handles, nblocks*nblocks, 0, MPI_COMM_WORLD, scallback, "scatter", NULL, NULL);
 
 
 	/* Calculation */
 	/* Calculation */
-	for(x = 0; x < nblocks*nblocks ;  x++)
+	for(x = 0; x < nblocks*nblocks ; x++)
 	{
 	{
 		if (data_handles[x])
 		if (data_handles[x])
 		{
 		{
@@ -222,7 +222,7 @@ int main(int argc, char **argv)
 #endif
 #endif
 
 
 	// Free memory
 	// Free memory
-        free(data_handles);
+	free(data_handles);
 	if (rank == 0)
 	if (rank == 0)
 	{
 	{
 		for(x=0 ; x<nblocks ; x++)
 		for(x=0 ; x<nblocks ; x++)

+ 8 - 3
mpi/tests/mpi_test.c

@@ -18,7 +18,12 @@
 #include <starpu_mpi.h>
 #include <starpu_mpi.h>
 #include "helper.h"
 #include "helper.h"
 
 
-#define NITER	2048
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
+
 #define SIZE	16
 #define SIZE	16
 
 
 float *tab;
 float *tab;
@@ -43,7 +48,7 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL);
+	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	tab = malloc(SIZE*sizeof(float));
 	tab = malloc(SIZE*sizeof(float));
@@ -60,7 +65,7 @@ int main(int argc, char **argv)
 
 
 		if ((loop % 2) == (rank%2))
 		if ((loop % 2) == (rank%2))
 		{
 		{
-                        starpu_mpi_isend(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
+			starpu_mpi_isend(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
 		}
 		}
 		else
 		else
 		{
 		{

+ 41 - 36
mpi/tests/multiple_send.c

@@ -22,15 +22,15 @@
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
 	int ret, rank, size;
 	int ret, rank, size;
-        unsigned send[2] = {42, 11};
-        unsigned recv[2] = {33, 33};
-        starpu_mpi_req req[2];
-        starpu_data_handle_t send_handle[2];
-        starpu_data_handle_t recv_handle[2];
+	unsigned send[2] = {42, 11};
+	unsigned recv[2] = {33, 33};
+	starpu_mpi_req req[2];
+	starpu_data_handle_t send_handle[2];
+	starpu_data_handle_t recv_handle[2];
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(&argc, &argv);
+	ret = starpu_mpi_init(&argc, &argv, 1);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
@@ -40,8 +40,8 @@ int main(int argc, char **argv)
 		if (rank == 0)
 		if (rank == 0)
 			FPRINTF(stderr, "We need at least 2 processes.\n");
 			FPRINTF(stderr, "We need at least 2 processes.\n");
 
 
-                starpu_mpi_shutdown();
-                starpu_shutdown();
+		starpu_mpi_shutdown();
+		starpu_shutdown();
 		return STARPU_TEST_SKIPPED;
 		return STARPU_TEST_SKIPPED;
 	}
 	}
 
 
@@ -50,42 +50,47 @@ int main(int argc, char **argv)
 	starpu_variable_data_register(&recv_handle[0], 0, (uintptr_t)&recv[0], sizeof(unsigned));
 	starpu_variable_data_register(&recv_handle[0], 0, (uintptr_t)&recv[0], sizeof(unsigned));
 	starpu_variable_data_register(&recv_handle[1], 0, (uintptr_t)&recv[1], sizeof(unsigned));
 	starpu_variable_data_register(&recv_handle[1], 0, (uintptr_t)&recv[1], sizeof(unsigned));
 
 
-        if (rank == 0)
+	if (rank == 0)
 	{
 	{
-                starpu_mpi_isend(send_handle[0], &(req[0]), 1, 12, MPI_COMM_WORLD);
-                starpu_mpi_isend(send_handle[1], &(req[1]), 1, 13, MPI_COMM_WORLD);
-        }
-        else if (rank == 1)
+		starpu_mpi_isend(send_handle[0], &(req[0]), 1, 12, MPI_COMM_WORLD);
+		starpu_mpi_isend(send_handle[1], &(req[1]), 1, 13, MPI_COMM_WORLD);
+	}
+	else if (rank == 1)
 	{
 	{
-                starpu_mpi_irecv(recv_handle[0], &(req[0]), 0, 12, MPI_COMM_WORLD);
-                starpu_mpi_irecv(recv_handle[1], &(req[1]), 0, 13, MPI_COMM_WORLD);
-        }
+		starpu_mpi_irecv(recv_handle[0], &(req[0]), 0, 12, MPI_COMM_WORLD);
+		starpu_mpi_irecv(recv_handle[1], &(req[1]), 0, 13, MPI_COMM_WORLD);
+	}
 
 
-        if (rank == 0 || rank == 1)
+	if (rank == 0 || rank == 1)
 	{
 	{
-                int nb_req=2;
-                while (nb_req)
+		int nb_req=2;
+		while (nb_req)
 		{
 		{
-                        int r=0;
-                        for(r=0 ; r<2 ; r++)
+			int r=0;
+			for(r=0 ; r<2 ; r++)
 			{
 			{
-                                if (req[r])
+				if (req[r])
 				{
 				{
-                                        int finished = 0;
-                                        MPI_Status status;
-                                        starpu_mpi_test(&req[r], &finished, &status);
-                                        STARPU_ASSERT(finished != -1);
-                                        if (finished)
+					int finished = 0;
+					MPI_Status status;
+					starpu_mpi_test(&req[r], &finished, &status);
+					STARPU_ASSERT(finished != -1);
+					if (finished)
 					{
 					{
-                                                FPRINTF(stderr, "[%d] Request %d finished\n", rank, r);
-                                                req[r] = NULL;
-                                                nb_req--;
-                                        }
-                                }
-                        }
-                }
-        }
-        FPRINTF(stderr, "[%d] All requests finished\n", rank);
+						FPRINTF(stderr, "[%d] Request %d finished\n", rank, r);
+						req[r] = NULL;
+						nb_req--;
+					}
+				}
+			}
+		}
+	}
+	FPRINTF(stderr, "[%d] All requests finished\n", rank);
+
+	starpu_data_unregister(send_handle[0]);
+	starpu_data_unregister(send_handle[1]);
+	starpu_data_unregister(recv_handle[0]);
+	starpu_data_unregister(recv_handle[1]);
 
 
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();

+ 10 - 3
mpi/tests/pingpong.c

@@ -18,7 +18,12 @@
 #include <starpu_mpi.h>
 #include <starpu_mpi.h>
 #include "helper.h"
 #include "helper.h"
 
 
-#define NITER	2048
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
+
 #define SIZE	16
 #define SIZE	16
 
 
 float *tab;
 float *tab;
@@ -43,7 +48,7 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL);
+	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	tab = malloc(SIZE*sizeof(float));
 	tab = malloc(SIZE*sizeof(float));
@@ -69,9 +74,11 @@ int main(int argc, char **argv)
 		}
 		}
 	}
 	}
 
 
+	starpu_data_unregister(tab_handle);
+	free(tab);
+
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
-
 	MPI_Finalize();
 	MPI_Finalize();
 
 
 	return 0;
 	return 0;

+ 2 - 1
mpi/tests/ring.c

@@ -79,7 +79,7 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL);
+	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
 	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
@@ -119,6 +119,7 @@ int main(int argc, char **argv)
 		}
 		}
 	}
 	}
 
 
+	starpu_data_unregister(token_handle);
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
 
 

+ 2 - 1
mpi/tests/ring_async.c

@@ -79,7 +79,7 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL);
+	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 
 	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
 	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
@@ -123,6 +123,7 @@ int main(int argc, char **argv)
 		}
 		}
 	}
 	}
 
 
+	starpu_data_unregister(token_handle);
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
 
 

+ 4 - 3
mpi/tests/ring_async_implicit.c

@@ -65,7 +65,7 @@ int main(int argc, char **argv)
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL);
+	ret = starpu_mpi_init(NULL, NULL, 1);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
@@ -118,13 +118,14 @@ int main(int argc, char **argv)
 
 
 	starpu_task_wait_for_all();
 	starpu_task_wait_for_all();
 
 
+	starpu_data_unregister(token_handle);
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
 
 
 	if (rank == last_rank)
 	if (rank == last_rank)
 	{
 	{
-                FPRINTF(stderr, "[%d] token = %u == %u * %d ?\n", rank, token, nloops, size);
-                STARPU_ASSERT(token == nloops*size);
+		FPRINTF(stderr, "[%d] token = %u == %u * %d ?\n", rank, token, nloops, size);
+		STARPU_ASSERT(token == nloops*size);
 	}
 	}
 
 
 	return 0;
 	return 0;

+ 97 - 0
mpi/tests/user_defined_datatype.c

@@ -0,0 +1,97 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <interface/complex_interface.h>
+#include <interface/complex_codelet.h>
+
+void display_double_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	double *foo = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	fprintf(stderr, "foo = %f\n", *foo);
+}
+
+struct starpu_codelet double_display =
+{
+	.cpu_funcs = {display_double_codelet, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_R}
+};
+
+void test_handle(starpu_data_handle_t handle, struct starpu_codelet *codelet, int rank)
+{
+	starpu_data_set_rank(handle, 1);
+	starpu_data_set_tag(handle, 42);
+
+	if (rank == 0)
+	{
+		starpu_insert_task(codelet, STARPU_R, handle, 0);
+	}
+	starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, handle, 0, NULL, NULL);
+	if (rank == 0)
+	{
+		starpu_insert_task(codelet, STARPU_R, handle, 0);
+	}
+}
+
+int main(int argc, char **argv)
+{
+	int rank, nodes;
+	int ret;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
+
+	if (nodes < 2)
+	{
+		fprintf(stderr, "This program needs at least 2 nodes\n");
+		ret = 77;
+	}
+	else
+	{
+		double real[2] = {0.0, 0.0};
+		double imaginary[2] = {0.0, 0.0};
+		double foo=8;
+		starpu_data_handle_t handle_complex;
+		starpu_data_handle_t handle_var;
+
+		if (rank == 1)
+		{
+			foo = 42;
+			real[0] = 12.0;
+			real[1] = 45.0;
+			imaginary[0] = 7.0;
+			imaginary[1] = 42.0;
+		}
+		starpu_complex_data_register(&handle_complex, 0, real, imaginary, 2);
+		starpu_variable_data_register(&handle_var, 0, (uintptr_t)&foo, sizeof(double));
+
+		test_handle(handle_var, &double_display, rank);
+		test_handle(handle_complex, &cl_display, rank);
+
+		starpu_data_unregister(handle_complex);
+		starpu_data_unregister(handle_var);
+	}
+	starpu_task_wait_for_all();
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return 0;
+}

+ 1 - 0
socl/examples/Makefile.am

@@ -63,6 +63,7 @@ SOCL_EXAMPLES +=		\
 basic_basic_SOURCES = basic/basic.c
 basic_basic_SOURCES = basic/basic.c
 clinfo_clinfo_SOURCES = clinfo/clinfo.c
 clinfo_clinfo_SOURCES = clinfo/clinfo.c
 matmul_matmul_SOURCES = matmul/matmul.c
 matmul_matmul_SOURCES = matmul/matmul.c
+matmul_matmul_LDADD = -lm
 mansched_mansched_SOURCES = mansched/mansched.c
 mansched_mansched_SOURCES = mansched/mansched.c
 
 
 #mandelbrot_mandelbrot_CPPFLAGS = $(AM_CPPFLAGS) $(AM_CFLAGS)
 #mandelbrot_mandelbrot_CPPFLAGS = $(AM_CPPFLAGS) $(AM_CFLAGS)

+ 3 - 3
socl/examples/clinfo/clinfo.c

@@ -288,9 +288,9 @@ main(void) {
 
 
                GET_STRING(CL_DEVICE_NAME, "  Name:\t\t\t\t\t\t %s\n", 256);
                GET_STRING(CL_DEVICE_NAME, "  Name:\t\t\t\t\t\t %s\n", 256);
                GET_STRING(CL_DEVICE_VENDOR, "  Vendor:\t\t\t\t\t %s\n", 256);
                GET_STRING(CL_DEVICE_VENDOR, "  Vendor:\t\t\t\t\t %s\n", 256);
-               GET_STRING(CL_DRIVER_VERSION, "  Driver version:\t\t\t\t %s\n", 10);
-               GET_STRING(CL_DEVICE_PROFILE, "  Profile:\t\t\t\t\t %s\n", 30);
-               GET_STRING(CL_DEVICE_VERSION, "  Version:\t\t\t\t\t %s\n", 50);
+               GET_STRING(CL_DRIVER_VERSION, "  Driver version:\t\t\t\t %s\n", 256);
+               GET_STRING(CL_DEVICE_PROFILE, "  Profile:\t\t\t\t\t %s\n", 256);
+               GET_STRING(CL_DEVICE_VERSION, "  Version:\t\t\t\t\t %s\n", 256);
                GET_STRING(CL_DEVICE_EXTENSIONS, "  Extensions:\t\t\t\t\t %s\n", 4096);
                GET_STRING(CL_DEVICE_EXTENSIONS, "  Extensions:\t\t\t\t\t %s\n", 4096);
 
 
                printf("\n");
                printf("\n");

+ 54 - 59
socl/src/cl_createbuffer.c

@@ -54,6 +54,8 @@ soclCreateBuffer(cl_context   context,
                void *       host_ptr,
                void *       host_ptr,
                cl_int *     errcode_ret) CL_API_SUFFIX__VERSION_1_0
                cl_int *     errcode_ret) CL_API_SUFFIX__VERSION_1_0
 {
 {
+   cl_mem mem;
+
    if (errcode_ret != NULL)
    if (errcode_ret != NULL)
       *errcode_ret = CL_SUCCESS;
       *errcode_ret = CL_SUCCESS;
 
 
@@ -81,68 +83,61 @@ soclCreateBuffer(cl_context   context,
       return NULL;
       return NULL;
    }
    }
 
 
-   {
-      cl_mem mem;
 
 
-      //Alloc cl_mem structure
-      mem = (cl_mem)gc_entity_alloc(sizeof(struct _cl_mem), release_callback_memobject);
-      if (mem == NULL) {
+   //Alloc cl_mem structure
+   mem = (cl_mem)gc_entity_alloc(sizeof(struct _cl_mem), release_callback_memobject);
+   if (mem == NULL) {
+      if (errcode_ret != NULL)
+         *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+      return NULL;
+   }
+   
+   mem->ptr = NULL;
+   mem->map_count = 0;
+   gc_entity_store(&mem->context, context);
+   mem->flags = flags;
+   mem->size = size;
+   mem->host_ptr = host_ptr;
+
+   #ifdef DEBUG
+   static int id = 0;
+   mem->id = id++;
+   #endif
+
+   mem_object_store(mem);
+
+   //TODO: we shouldn't allocate the buffer ourselves. StarPU allocates it if a NULL pointer is given
+
+   // If not MEM_USE_HOST_PTR, we need to alloc the buffer ourselves
+   if (!(flags & CL_MEM_USE_HOST_PTR)) {
+      mem->ptr = valloc(size);
+      if (mem->ptr == NULL) {
          if (errcode_ret != NULL)
          if (errcode_ret != NULL)
-            *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+            *errcode_ret = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+         free(mem);
          return NULL;
          return NULL;
       }
       }
-      
-      mem->ptr = NULL;
-      mem->map_count = 0;
-      gc_entity_store(&mem->context, context);
-      mem->flags = flags;
-      mem->size = size;
-      mem->host_ptr = host_ptr;
-
-      #ifdef DEBUG
-      static int id = 0;
-      mem->id = id++;
-      #endif
-
-      mem_object_store(mem);
-
-      //TODO: we shouldn't allocate the buffer ourselves. StarPU allocates it if a NULL pointer is given
-
-      // If not MEM_USE_HOST_PTR, we need to alloc the buffer ourselves
-      if (!(flags & CL_MEM_USE_HOST_PTR)) {
-         mem->ptr = valloc(size);
-         if (mem->ptr == NULL) {
-            if (errcode_ret != NULL)
-               *errcode_ret = CL_MEM_OBJECT_ALLOCATION_FAILURE;
-            free(mem);
-            return NULL;
-         }
-         //The buffer doesn't contain meaningful data
-         mem->scratch = 1;
-      }
-      else {
-         //The buffer may contain meaningful data
-         mem->scratch = 0;
-         mem->ptr = host_ptr;
-      }
-
-      // Access mode
-      if (flags & CL_MEM_READ_ONLY)
-         mem->mode = CL_MEM_READ_ONLY;
-      else if (flags & CL_MEM_WRITE_ONLY)
-         mem->mode = CL_MEM_WRITE_ONLY;
-      else
-         mem->mode = CL_MEM_READ_WRITE;
-
-      // Perform data copy if necessary
-      if (flags & CL_MEM_COPY_HOST_PTR)
-         memcpy(mem->ptr, host_ptr, size);
-      
-      // Create StarPU buffer (on home node? what's this?)
-      starpu_variable_data_register(&mem->handle, 0, (uintptr_t)mem->ptr, size); 
-
-      DEBUG_MSG("[Buffer %d] Initialized (cl_mem %p handle %p)\n", mem->id, mem, mem->handle);
-      
-      return mem;
+      //The buffer doesn't contain meaningful data
+      mem->scratch = 1;
+   }
+   else {
+      //The buffer may contain meaningful data
+      mem->scratch = 0;
+      mem->ptr = host_ptr;
    }
    }
+
+   // Access mode
+   mem->mode = flags & CL_MEM_READ_ONLY  ? CL_MEM_READ_ONLY :
+               flags & CL_MEM_WRITE_ONLY ? CL_MEM_WRITE_ONLY : CL_MEM_READ_WRITE;
+
+   // Perform data copy if necessary
+   if (flags & CL_MEM_COPY_HOST_PTR)
+      memcpy(mem->ptr, host_ptr, size);
+   
+   // Create StarPU buffer (on home node? what's this?)
+   starpu_variable_data_register(&mem->handle, 0, (uintptr_t)mem->ptr, size); 
+
+   DEBUG_MSG("[Buffer %d] Initialized (cl_mem %p handle %p)\n", mem->id, mem, mem->handle);
+   
+   return mem;
 }
 }

+ 0 - 0
socl/src/cl_enqueuendrangekernel.c


Some files were not shown because too many files changed in this diff