rollback to r7157

Andra Hugo, 12 years ago
commit 125960a47b
100 changed files with 613 additions and 2906 deletions
  1. .gitignore (+0 -2)
  2. AUTHORS (+2 -5)
  3. ChangeLog (+0 -62)
  4. Makefile.am (+2 -6)
  5. STARPU-VERSION (+0 -15)
  6. configure.ac (+25 -168)
  7. doc/Makefile.am (+4 -19)
  8. doc/chapters/advanced-api.texi (+3 -94)
  9. doc/chapters/advanced-examples.texi (+21 -158)
  10. doc/chapters/basic-api.texi (+82 -395)
  11. doc/chapters/basic-examples.texi (+11 -6)
  12. doc/chapters/c-extensions.texi (+8 -14)
  13. doc/chapters/configuration.texi (+98 -111)
  14. doc/chapters/mpi-support.texi (+16 -83)
  15. doc/chapters/perf-feedback.texi (+3 -3)
  16. doc/chapters/perf-optimization.texi (+3 -90)
  17. doc/chapters/sched_ctx_hypervisor.texi (+5 -5)
  18. doc/chapters/vector_scal_c.texi (+1 -0)
  19. doc/chapters/vector_scal_cuda.texi (+2 -1)
  20. doc/chapters/vector_scal_opencl.texi (+2 -1)
  21. doc/starpu.texi (+1 -1)
  22. doc/tutorial/README (+2 -2)
  23. doc/tutorial/vector_scal.c (+1 -0)
  24. doc/tutorial/vector_scal_cuda.cu (+2 -1)
  25. doc/tutorial/vector_scal_opencl.c (+2 -1)
  26. examples/Makefile.am (+13 -67)
  27. examples/audio/starpu_audio_processing.c (+3 -10)
  28. examples/axpy/axpy.c (+4 -1)
  29. examples/axpy/axpy_opencl.c (+1 -0)
  30. examples/basic_examples/block.c (+1 -0)
  31. examples/basic_examples/block_cuda.cu (+1 -0)
  32. examples/basic_examples/block_opencl.c (+1 -0)
  33. examples/basic_examples/multiformat.c (+3 -0)
  34. examples/basic_examples/multiformat_conversion_codelets_cuda.cu (+1 -0)
  35. examples/basic_examples/multiformat_conversion_codelets_opencl.c (+1 -0)
  36. examples/basic_examples/multiformat_cuda.cu (+1 -0)
  37. examples/basic_examples/multiformat_opencl.c (+1 -0)
  38. examples/basic_examples/variable.c (+1 -0)
  39. examples/basic_examples/variable_kernels.cu (+2 -1)
  40. examples/basic_examples/variable_kernels_opencl.c (+2 -1)
  41. examples/basic_examples/vector_scal.c (+2 -10)
  42. examples/basic_examples/vector_scal_c.c (+1 -0)
  43. examples/basic_examples/vector_scal_cpu.c (+62 -3)
  44. examples/basic_examples/vector_scal_cpu_icc.icc (+0 -26)
  45. examples/basic_examples/vector_scal_cpu_icc.icc (+1 -0)
  46. examples/basic_examples/vector_scal_cpu_template.h (+0 -93)
  47. examples/basic_examples/vector_scal_cuda.cu (+2 -1)
  48. examples/basic_examples/vector_scal_opencl.c (+2 -1)
  49. examples/binary/binary.c (+0 -118)
  50. examples/cg/cg.h (+3 -0)
  51. examples/cg/cg_dot_kernel.cu (+2 -1)
  52. examples/cholesky/cholesky.h (+4 -3)
  53. examples/cholesky/cholesky_kernels.c (+8 -5)
  54. examples/common/blas.c (+2 -2)
  55. examples/common/blas.h (+2 -2)
  56. examples/cpp/incrementer_cpp.cpp (+1 -0)
  57. examples/filters/custom_mf/conversion.cu (+1 -0)
  58. examples/filters/custom_mf/conversion_opencl.c (+1 -0)
  59. examples/filters/custom_mf/cuda.cu (+1 -0)
  60. examples/filters/custom_mf/custom_interface.c (+47 -11)
  61. examples/filters/custom_mf/custom_mf_filter.c (+3 -0)
  62. examples/filters/custom_mf/custom_opencl.c (+1 -0)
  63. examples/filters/fblock.c (+1 -0)
  64. examples/filters/fblock_cuda.cu (+2 -1)
  65. examples/filters/fblock_opencl.c (+2 -1)
  66. examples/filters/fvector.c (+0 -1)
  67. examples/filters/shadow.c (+0 -189)
  68. examples/filters/shadow2d.c (+0 -291)
  69. examples/filters/shadow3d.c (+0 -331)
  70. examples/gl_interop/gl_interop.c (+0 -131)
  71. examples/gl_interop/gl_interop_idle.c (+0 -154)
  72. examples/heat/dw_factolu.h (+6 -3)
  73. examples/heat/dw_factolu_kernels.c (+10 -10)
  74. examples/heat/dw_sparse_cg.h (+2 -1)
  75. examples/heat/heat.c (+0 -1)
  76. examples/heat/heat.h (+2 -1)
  77. examples/incrementer/incrementer.c (+2 -1)
  78. examples/incrementer/incrementer_kernels.cu (+3 -2)
  79. examples/incrementer/incrementer_kernels_opencl.c (+2 -1)
  80. examples/interface/complex.c (+55 -1)
  81. examples/interface/complex_codelet.h (+0 -76)
  82. examples/interface/complex_interface.c (+33 -96)
  83. examples/interface/complex_interface.h (+0 -4)
  84. examples/interface/complex_kernels.cu (+1 -0)
  85. examples/interface/complex_kernels_opencl.c (+1 -0)
  86. examples/lu/clu.c (+1 -1)
  87. examples/lu/clu_implicit.c (+1 -1)
  88. examples/lu/clu_implicit_pivot.c (+1 -1)
  89. examples/lu/clu_kernels.c (+1 -1)
  90. examples/lu/clu_pivot.c (+1 -1)
  91. examples/lu/dlu.c (+1 -1)
  92. examples/lu/dlu_implicit.c (+1 -1)
  93. examples/lu/dlu_implicit_pivot.c (+1 -1)
  94. examples/lu/dlu_kernels.c (+1 -1)
  95. examples/lu/dlu_pivot.c (+1 -1)
  96. examples/lu/double.h (+0 -0)
  97. examples/lu/float.h (+0 -0)
  98. examples/lu/lu_example.c (+2 -0)
  99. examples/lu/lu_example_complex_double.c (+1 -1)
  100. examples/lu/lu_example_complex_float.c (+0 -0)

+ 0 - 2
.gitignore

@@ -286,5 +286,3 @@ starpu.log
 /tools/starpu_workers_activity
 /tests/datawizard/interfaces/copy_interfaces
 /gcc-plugin/tests/release
-/gcc-plugin/tests/opencl
-/gcc-plugin/tests/registered

+ 2 - 5
AUTHORS

@@ -1,19 +1,16 @@
 Cédric Augonnet <cedric.augonnet@inria.fr>
 Nicolas Collin <nicolas.collin@inria.fr>
 Jérôme Clet-Ortega <jerome.clet-ortega@labri.fr>
-Nicolas Collin <nicolas.collin@inria.fr>
 Nathalie Furmento <nathalie.furmento@labri.fr>
 Sylvain Henry <sylvain.henry@inria.fr>
-Cyril Roelandt <cyril.roelandt@inria.fr>
+Mehdi Juhoor <mjuhoor@gmail.com>
 François Tessier <francois.tessier@inria.fr>
 Samuel Thibault <samuel.thibault@labri.fr>
-Pierre André Wacrenier <wacrenier@labri.fr>
 William Braik <wbraik@gmail.com>
 Yann Courtois <yann.courtois33@gmail.com>
 Jean-Marie Couteyen <jm.couteyen@gmail.com>
-Mehdi Juhoor <mjuhoor@gmail.com>
 Anthony Roy <theanthony33@gmail.com>
 David Gómez <david_gomez1380@yahoo.com.mx>
 Nguyen Quôc Dinh <nguyen.quocdinh@gmail.com>
 Antoine Lucas <antoine.lucas.33@gmail.com>
-
+Pierre André Wacrenier <wacrenier@labri.fr>

+ 0 - 62
ChangeLog

@@ -14,62 +14,6 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
-StarPU 1.1.0 (svn revision xxxx)
-==============================================
-
-New features:
-  * OpenGL interoperability support.
-  * Capability to store compiled OpenCL kernels on the file system
-  * Capability to load compiled OpenCL kernels
-  * Performance models measurements can now be provided explicitly by
-    applications.
-  * Capability to emit communication statistics when running MPI code
-  * Add starpu_unregister_submit, starpu_data_acquire_on_node and
-    starpu_data_invalidate_submit
-  * New functionnality to wrapper starpu_insert_task to pass a array of
-	data_handles via the parameter STARPU_DATA_ARRAY
-  * Enable GPU-GPU direct transfers.
-  * GCC plug-in
-	- Add `registered' attribute
-
-Changes:
-  * The FxT code can now be used on systems other than Linux.
-  * Keep only one hashtable implementation common/uthash.h
-  * The cache of starpu_mpi_insert_task is fixed and thus now enabled by
-    default.
-  * Standardize objects name in the performance model API
-
-Small changes:
-  * STARPU_NCPU should now be used instead of STARPU_NCPUS. STARPU_NCPUS is
-	still available for compatibility reasons.
-  * include/starpu.h includes all include/starpu_*.h files, applications
-	therefore only need to have #include <starpu.h>
-
-StarPU 1.0.2 (svn revision xxx)
-==============================================
-
-Changes:
-  * Add starpu_block_shadow_filter_func_vector and an example.
-  * Add tag dependency in trace-generated DAG.
-  * Fix CPU binding for optimized CPU-GPU transfers.
-  * Fix parallel tasks CPU binding and combined worker generation.
-  * Fix generating FXT traces bigger than 64MiB.
-
-StarPU 1.0.1 (svn revision 6659)
-==============================================
-
-Changes:
-  * hwloc support. Warn users when hwloc is not found on the system and
-	produce error when not explicitely disabled.
-  * Several bug fixes
-  * GCC plug-in
-	- Add `#pragma starpu release'
-	- Fix bug when using `acquire' pragma with function parameters
-	- Slightly improve test suite coverage
-	- Relax the GCC version check
-  * Update SOCL to use new API
-  * Documentation improvement.
-
 StarPU 1.0.0 (svn revision 6306)
 ==============================================
 The extensions-again release
@@ -269,9 +213,3 @@ Changes:
    - transparent data coherency management
    - High-level expressive interface
 
-
-# Local Variables:
-# mode: text
-# coding: utf-8
-# ispell-local-dictionary: "american"
-# End:

+ 2 - 6
Makefile.am

@@ -28,9 +28,7 @@ if BUILD_SOCL
 SUBDIRS += socl
 endif
 
-if BUILD_EXAMPLES
 SUBDIRS += examples
-endif
 
 if BUILD_GCC_PLUGIN
 SUBDIRS += gcc-plugin
@@ -55,7 +53,6 @@ versinclude_HEADERS = 				\
 	include/starpu_task.h			\
 	include/starpu_task_bundle.h		\
 	include/starpu_task_list.h		\
-	include/starpu_task_util.h		\
 	include/starpu_data.h			\
 	include/starpu_perfmodel.h		\
 	include/starpu_util.h			\
@@ -69,8 +66,7 @@ versinclude_HEADERS = 				\
 	include/starpu_top.h			\
 	include/starpu_deprecated_api.h         \
 	include/starpu_hash.h			\
-	include/starpu_rand.h			\
-	include/starpu_cublas.h
+	include/starpu_rand.h
 
 nodist_versinclude_HEADERS = 			\
 	include/starpu_config.h
@@ -109,7 +105,7 @@ else
 txtdir = ${docdir}
 endif
 txt_DATA = AUTHORS COPYING.LGPL README
-EXTRA_DIST = AUTHORS COPYING.LGPL README STARPU-VERSION build-aux/svn2cl.xsl
+EXTRA_DIST = AUTHORS COPYING.LGPL README STARPU-VERSION
 
 include starpu-top/extradist
 

+ 0 - 15
STARPU-VERSION

@@ -2,21 +2,6 @@
 
 # Versioning (SONAMEs) for StarPU libraries.
 
-# http://www.gnu.org/software/libtool/manual/html_node/Updating-version-info.html#Updating-version-info
-# Here are a set of rules to help you update your library version information:
-# Start with version information of ‘0:0:0’ for each libtool library.
-# Update the version information only immediately before a public
-# release of your software. More frequent updates are unnecessary, and
-# only guarantee that the current interface number gets larger faster.
-# - If the library source code has changed at all since the last
-#   update, then increment revision (‘c:r:a’ becomes ‘c:r+1:a’).
-# - If any interfaces have been added, removed, or changed since the
-#   last update, increment current, and set revision to 0.
-# - If any interfaces have been added since the last public release,
-#   then increment age.
-# - If any interfaces have been removed or changed since the last
-#   public release, then set age to 0. change
-
 # Libtool interface versioning (info "(libtool) Versioning").
 LIBSTARPU_INTERFACE_CURRENT=1	# increment upon ABI change
 LIBSTARPU_INTERFACE_REVISION=0	# increment upon implementation change

+ 25 - 168
configure.ac

@@ -16,7 +16,7 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
-AC_INIT([StarPU],1.1.0, [starpu-devel@lists.gforge.inria.fr], starpu)
+AC_INIT([StarPU],1.0.0, [starpu-devel@lists.gforge.inria.fr], starpu)
 AC_CONFIG_SRCDIR(include/starpu.h)
 AC_CONFIG_AUX_DIR([build-aux])
 
@@ -59,9 +59,7 @@ m4_ifdef([AM_SILENT_RULES],
 
 AC_PREREQ(2.60)
 
-m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
 AC_PROG_CC
-AM_PROG_CC_C_O
 AC_PROG_CXX
 AC_PROG_CPP
 AC_PROG_SED
@@ -134,9 +132,9 @@ else
 fi
 
 AC_COMPILE_IFELSE(
-  [AC_LANG_PROGRAM([[
+  AC_LANG_PROGRAM([[
     #include <pthread.h>
-  ]], [[ pthread_t t; pthread_create(&t, NULL, NULL, NULL); ]])],,
+  ]], [[ pthread_t t; pthread_create(&t, NULL, NULL, NULL); ]]),,
   AC_MSG_ERROR([pthread_create unavailable]))
 AC_SEARCH_LIBS([sqrt],[m],,AC_MSG_ERROR([math library unavailable]))
 AC_HAVE_LIBRARY([ws2_32])
@@ -171,18 +169,9 @@ fi
 # Some systems do not define strerror_r
 AC_CHECK_FUNC([strerror_r], [AC_DEFINE([STARPU_HAVE_STRERROR_R], [1], [Define to 1 if the function strerro_r is available.])])
 
-# Some systems may not define setenv
-AC_CHECK_FUNC([setenv], [AC_DEFINE([STARPU_HAVE_SETENV], [1], [Define to 1 if the function setenv is available.])])
-
 # Some systems do not define unsetenv
 AC_CHECK_FUNC([unsetenv], [AC_DEFINE([STARPU_HAVE_UNSETENV], [1], [Define to 1 if the function unsetenv is available.])])
 
-# Some systems do not define nearbyintf...
-AC_CHECK_FUNC([nearbyintf], [AC_DEFINE([STARPU_HAVE_NEARBYINTF], [1], [Define to 1 if the function nearbyintf is available.])])
-
-# ... but they may define rintf.
-AC_CHECK_FUNC([rintf], [AC_DEFINE([STARPU_HAVE_RINTF], [1], [Define to 1 if the function rintf is available.])])
-
 # Define slow machine
 AC_ARG_ENABLE(slow-machine, [AS_HELP_STRING([--enable-slow-machine],
 				   [Lower default values for the testcases run by make check])],
@@ -271,6 +260,7 @@ AC_ARG_ENABLE(cpu, [AS_HELP_STRING([--disable-cpu],
 			enable_cpu=$enableval, enable_cpu=yes)
 AC_MSG_RESULT($enable_cpu)
 AC_SUBST(STARPU_USE_CPU, $enable_cpu)
+
 AM_CONDITIONAL(STARPU_USE_CPU, test x$enable_cpu = xyes)
 
 if test x$enable_cpu = xyes; then
@@ -439,30 +429,14 @@ if test x$enable_cuda = xyes -o x$enable_cuda = xmaybe; then
     if test "$have_valid_cuda" = "yes" ; then
         SAVED_CPPFLAGS="${CPPFLAGS}"
         CPPFLAGS="${CPPFLAGS} ${STARPU_CUDA_CPPFLAGS}"
-	SAVED_LDFLAGS="${LDFLAGS}"
-	LDFLAGS="${LDFLAGS} ${STARPU_CUDA_LDFLAGS} -lcuda"
 	AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
 		[[#include <cuda.h>]],
 		[[]]
-		)],
-	    [
-	      AC_RUN_IFELSE([AC_LANG_PROGRAM(
-	        [[#include <cuda.h>]],
-		[[]]
-		)],
-		[have_valid_cuda="yes"],
-		[
-	          AC_MSG_RESULT([CUDA found and can be compiled, but compiled application can not be run, is the CUDA path missing in LD_LIBRARY_PATH?])
-	          have_valid_cuda="no"
-		])
-	    ],
-	    [
-	    AC_MSG_ERROR([CUDA found, but cuda.h could not be compiled])
-	    have_valid_cuda="no"
-	    ]
-	)
+		),
+	    [have_valid_cuda="yes"],
+	    [have_valid_cuda="no"]
+	])
         CPPFLAGS="${SAVED_CPPFLAGS}"
-	LDFLAGS="${SAVED_LDFLAGS}"
     fi
     AC_MSG_RESULT($have_valid_cuda)
 
@@ -508,8 +482,6 @@ if test x$enable_cuda = xyes; then
 		NVCCFLAGS="${NVCCFLAGS} -m64"
 		AC_SUBST(NVCCFLAGS)
 	fi
-
-	AC_CHECK_HEADERS([cuda_gl_interop.h])
 fi
 
 have_magma=no
@@ -732,51 +704,6 @@ fi
 
 ###############################################################################
 #                                                                             #
-# General GPU settings                                                        #
-#                                                                             #
-###############################################################################
-AC_MSG_CHECKING(whether asynchronous copy should be disabled)
-AC_ARG_ENABLE(asynchronous-copy, [AS_HELP_STRING([--disable-asynchronous-copy],
-			[disable asynchronous copy between CPU and GPU])],
-			enable_asynchronous_copy=$enableval, enable_asynchronous_copy=yes)
-disable_asynchronous_copy=no
-if test x$enable_asynchronous_copy = xno ; then
-   disable_asynchronous_copy=yes
-fi
-AC_MSG_RESULT($disable_asynchronous_copy)
-if test x$disable_asynchronous_copy = xyes ; then
-   AC_DEFINE([STARPU_DISABLE_ASYNCHRONOUS_COPY], [1], [Define to 1 to disable asynchronous copy between CPU and GPU devices])
-fi
-
-AC_MSG_CHECKING(whether asynchronous CUDA copy should be disabled)
-AC_ARG_ENABLE(asynchronous-cudacopy, [AS_HELP_STRING([--disable-asynchronous-cuda-copy],
-			[disable asynchronous copy between CPU and CUDA devices])],
-			enable_asynchronous_cuda_copy=$enableval, enable_asynchronous_cuda_copy=yes)
-disable_asynchronous_cuda_copy=no
-if test x$enable_asynchronous_cuda_copy = xno ; then
-   disable_asynchronous_cuda_copy=yes
-fi
-AC_MSG_RESULT($disable_asynchronous_cuda_copy)
-if test x$disable_asynchronous_cuda_copy = xyes ; then
-   AC_DEFINE([STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY], [1], [Define to 1 to disable asynchronous copy between CPU and CUDA devices])
-fi
-
-AC_MSG_CHECKING(whether asynchronous OpenCL copy should be disabled)
-AC_ARG_ENABLE(asynchronous-openclcopy, [AS_HELP_STRING([--disable-asynchronous-opencl-copy],
-			[disable asynchronous copy between CPU and OPENCL devices])],
-			enable_asynchronous_opencl_copy=$enableval, enable_asynchronous_opencl_copy=yes)
-disable_asynchronous_opencl_copy=no
-if test x$enable_asynchronous_opencl_copy = xno ; then
-   disable_asynchronous_opencl_copy=yes
-fi
-AC_MSG_RESULT($disable_asynchronous_opencl_copy)
-if test x$disable_asynchronous_opencl_copy = xyes ; then
-   AC_DEFINE([STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY], [1], [Define to 1 to disable asynchronous copy between CPU and OpenCL devices])
-fi
-
-
-###############################################################################
-#                                                                             #
 #                                 Cell settings                               #
 #                                                                             #
 ###############################################################################
@@ -823,7 +750,6 @@ if test x$enable_gordon = xyes -o x$enable_gordon = xmaybe; then
 
 	# now we enable Gordon if and only if a proper setup is available
 	enable_gordon=$have_valid_gordon
-	AC_DEFINE(STARPU_MAXGORDONDEVS, [1], [maximum number of GORDON devices])
 fi
 
 AC_MSG_CHECKING(whether GORDON should be used)
@@ -862,7 +788,7 @@ if test x$enable_debug = xyes; then
 	CFLAGS="$CFLAGS -O0"
 	AC_DEFINE(STARPU_SPINLOCK_CHECK, [1], [check spinlock use])
 else
-	CFLAGS="-O3 $CFLAGS"
+	CFLAGS="$CFLAGS -O3"
 fi
 CFLAGS+=" -gdwarf-2 -g3 "
 
@@ -942,26 +868,14 @@ if test x$use_fxt = xyes; then
 	AC_DEFINE(CONFIG_FUT, [1], [enable FUT traces])
 
 	if test x$use_fxt_from_system = xno; then
-		FXT_CFLAGS="-I$fxtdir/include/ "
-		FXT_LDFLAGS="-L$fxtdir/lib/"
-		AC_ARG_VAR(FXT_LDFLAGS)
-		FXT_LIBS="-lfxt"
+		CPPFLAGS="${CPPFLAGS} -I$fxtdir/include/ "
+		LDFLAGS="${LDFLAGS} -L$fxtdir/lib/ -lfxt"
 	else
 	    PKG_CHECK_MODULES([FXT],  [fxt])
 	fi
-	save_LIBS="$LIBS"
-	LIBS="$LIBS $FXT_LIBS"
-	save_LDFLAGS="$LDFLAGS"
-	LDFLAGS="$LDFLAGS $FXT_LDFLAGS"
-   	AC_CHECK_FUNCS([enable_fut_flush])
-   	AC_CHECK_FUNCS([fut_set_filename])
-	LDFLAGS="$save_LDFLAGS"
-	LIBS="$save_LIBS"
-	save_CFLAGS="$CFLAGS"
-	CFLAGS="$CFLAGS $FXT_CFLAGS"
-	AC_CHECK_DECLS([enable_fut_flush])
-	AC_CHECK_DECLS([fut_set_filename])
-	CFLAGS="$save_CFLAGS"
+
+	# if we use monotonic clocks, FxT uses -lrt
+	AC_CHECK_LIB(rt, clock_gettime,,AC_MSG_ERROR([cannot find clock_gettime]))
 fi
 
 AC_MSG_CHECKING(whether performance debugging should be enabled)
@@ -1081,7 +995,7 @@ AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of worker
 AC_MSG_CHECKING(maximum number of implementations)
 AC_ARG_ENABLE(maximplementations, [AS_HELP_STRING([--enable-maximplementations=<number>],
 		[maximum number of implementations])],
-		maximplementations=$enableval, maximplementations=8)
+		maximplementations=$enableval, maximplementations=4)
 AC_MSG_RESULT($maximplementations)
 AC_DEFINE_UNQUOTED(STARPU_MAXIMPLEMENTATIONS, [$maximplementations],
 		[maximum number of implementations])
@@ -1120,12 +1034,6 @@ AC_MSG_RESULT($mpicc_path)
 AC_SUBST(MPICC, $mpicc_path)
 if test x$use_mpi = xyes; then
 	cc_or_mpicc=$mpicc_path
-        # For some reason, libtool uses gcc instead of mpicc when linking
-        # libstarpumpi.
-        # On Darwin (and maybe other systems ?) the linker will fail (undefined
-        # references to MPI_*). We manually add the required flags to fix this
-        # issue.
-        AC_SUBST(MPICC_LDFLAGS, `$mpicc_path --showme:link`)
 else
 	cc_or_mpicc=$CC
 fi
@@ -1185,18 +1093,6 @@ if test x$use_mpi = xyes; then
 	AC_DEFINE(STARPU_USE_MPI,[],[whether the StarPU MPI library is available])
 fi
 
-AC_MSG_CHECKING(whether communication statistics should be generated)
-AC_ARG_ENABLE(comm-stats, [AS_HELP_STRING([--enable-comm-stats],
-			[enable communication statistics (only valid with the StarPU MPI library])],
-			enable_comm_stats=$enableval, enable_comm_stats=no)
-AC_MSG_RESULT($enable_comm_stats)
-AC_SUBST(STATS, $enable_comm_stats)
-AC_SUBST(STARPU_COMM_STATS, $enable_comm_stats)
-
-if test x$enable_comm_stats = xyes; then
-        AC_DEFINE(STARPU_COMM_STATS, [1], [enable communication statistics])
-fi
-
 ###############################################################################
 #                                                                             #
 #                               StarPU-Top                                    #
@@ -1206,13 +1102,13 @@ fi
 AC_ARG_ENABLE([starpu-top],
   [AS_HELP_STRING([--disable-starpu-top],
     [build StarPU-Top])],
-  [enable_starpu_top="$enableval"],
+  [enable_starpu_top="no"],
   [enable_starpu_top="maybe"])
 
 # Check whether StarPU-Top can be built
 AC_MSG_CHECKING(for StarPU-Top)
 
-if test "x$enable_starpu_top" != "xno" ; then
+if test "x$enable_starpu_top" = "xmaybe" ; then
 	can_build_starpu_top=no
 	AC_PATH_PROGS([QMAKE], [qmake-qt4 qmake], [not-found])
 	if test x$QMAKE != xnot-found; then
@@ -1254,7 +1150,7 @@ if test "x$enable_starpu_top" != "xno" ; then
 	fi
 fi
 
-if test "x$enable_starpu_top" != "xno" ; then
+if test "x$enable_starpu_top" = "xmaybe" ; then
   build_starpu_top=$can_build_starpu_top
 else
   build_starpu_top=no
@@ -1378,8 +1274,7 @@ fi
 AM_MISSING_PROG([YACC], [bison])
 
 AM_CONDITIONAL([BUILD_GCC_PLUGIN], [test "x$build_gcc_plugin" = "xyes"])
-AM_CONDITIONAL([RUN_GCC_PLUGIN_TESTS],
-  [test "x$run_gcc_plugin_test_suite" = "xyes"])
+AM_CONDITIONAL([HAVE_GUILE], [test "x$GUILE" != "x"])
 
 ###############################################################################
 #                                                                             #
@@ -1435,11 +1330,6 @@ fi
 #                                                                             #
 ###############################################################################
 
-AC_ARG_ENABLE(build-examples, [AS_HELP_STRING([--disable-build-examples],
-			[disable building of examples])],
-			enable_build_examples=$enableval, enable_build_examples=yes)
-# check stuff for examples (todo)
-AM_CONDITIONAL(BUILD_EXAMPLES, [test x$enable_build_examples != xno])
 AC_ARG_ENABLE(opengl-render, [AS_HELP_STRING([--enable-opengl-render],
 			[enable OpenGL rendering of some examples])],
 			enable_opengl_render=$enableval, enable_opengl_render=no)
@@ -1455,7 +1345,6 @@ fi
 AC_MSG_CHECKING(whether OpenGL rendering is enabled)
 AC_SUBST(STARPU_OPENGL_RENDER, $enable_opengl_render)
 AC_MSG_RESULT($enable_opengl_render)
-AM_CONDITIONAL([HAVE_OPENGL], [test "x$enable_opengl_render" = xyes])
 
 AC_PATH_XTRA
 if test "x$no_x" != "xyes"; then
@@ -1563,12 +1452,6 @@ if test x$blas_lib = xmaybe; then
      if test x$use_system_blas = xyes; then
         AC_DEFINE(STARPU_SYSTEM_BLAS, [1], [use refblas library])
 	blas_lib=system
-     elif test x"$BLAS_LIBS" != x; then
-        AC_DEFINE(STARPU_SYSTEM_BLAS, [1], [use user defined library])
-        STARPU_BLAS_LDFLAGS="$BLAS_LIBS"
-        AC_SUBST(STARPU_BLAS_LDFLAGS)
-        blas_lib=system
-        AC_ARG_VAR([BLAS_LIBS], [linker flags for blas])
      else
 	blas_lib=none
      fi
@@ -1627,10 +1510,7 @@ AM_CONDITIONAL(BUILD_STARPUFFT, [test x$fft_support = xyes])
 # hwloc                                  #
 ##########################################
 
-AC_ARG_WITH([hwloc],
-	[AS_HELP_STRING([--without-hwloc],
-	[Disable hwloc (enabled by default)])],
-	[hwloc_dir="$withval"])
+AC_ARG_WITH([hwloc], [AS_HELP_STRING([--without-hwloc], [Disable hwloc (enabled by default)])])
 SAVED_LDFLAGS="${LDFLAGS}"
 SAVED_CPPFLAGS="${CPPFLAGS}"
 AS_IF([test "x$with_hwloc" != "xno"], [
@@ -1640,9 +1520,10 @@ AS_IF([test "x$with_hwloc" != "xno"], [
 		have_valid_hwloc=no
 		have_pkgconfig_hwloc=no])
 	AS_IF([test "$have_valid_hwloc" = "no"], [
-		if test "$hwloc_dir" != "" ; then CPPFLAGS="${SAVED_CPPFLAGS} -I$hwloc_dir/include" ; fi
+		hwloc_dir="$withval"
+		CPPFLAGS="${SAVED_CPPFLAGS} -I$hwloc_dir/include"
 		AC_CHECK_HEADER([hwloc.h],[have_valid_hwloc=yes],[have_valid_hwloc=no])
-		if test "$hwloc_dir" != "" ; then LDFLAGS="${SAVED_LDFLAGS} -L$hwloc_dir/lib" ; fi
+		LDFLAGS="${SAVED_LDFLAGS} -L$hwloc_dir/lib"
 		AC_HAVE_LIBRARY([hwloc],[have_valid_hwloc=yes],[have_valid_hwloc=no])
 		])
     ],
@@ -1657,10 +1538,6 @@ AS_IF([test "$have_valid_hwloc" = "yes"], [
 	])
 LDFLAGS="${SAVED_LDFLAGS}"
 CPPFLAGS="${SAVED_CPPFLAGS}"
-
-if test "$have_valid_hwloc" = "no" -a "$hwloc_dir" != "no" ; then
-   AC_MSG_ERROR([hwloc was not found on your system. If the target machine is hyperthreaded the performance may be impacted a lot.  It is strongly recommended to install hwloc. However, if you really want to use StarPU without enabling hwloc, please restart configure by specifying the option '--without-hwloc'.])
-fi
 AC_MSG_CHECKING(whether hwloc should be used)
 AC_MSG_RESULT($have_valid_hwloc)
 AC_SUBST(HWLOC_REQUIRES)
@@ -1687,11 +1564,11 @@ if test "$enable_cuda" = "yes" -a "$ICC" != ""; then
    OLD_CFLAGS="$CFLAGS"
    CFLAGS="-I$PWD/include -I$srcdir/include"
    AC_COMPILE_IFELSE(
-       [AC_LANG_PROGRAM(
+       AC_LANG_PROGRAM(
 	   [[#include <cuda.h>
 	   #include <starpu.h>]],
 	   [[]]
-	   )],
+	   ),
        AC_MSG_RESULT(yes),
        [ICC=""
            AC_MSG_RESULT(no)]
@@ -1725,18 +1602,6 @@ AC_CHECK_MEMBER([struct cudaDeviceProp.pciBusID],
   AC_DEFINE([STARPU_HAVE_BUSID],[1],[Define to 1 if CUDA device properties include BusID]),
   , [[#include <cuda_runtime_api.h>]])
 
-dnl Set this condition when Automake 1.11 or later is being used.
-dnl Automake 1.11 introduced `silent-rules', hence the check.
-m4_ifdef([AM_SILENT_RULES],
-  AM_CONDITIONAL([STARPU_HAVE_AM111], [true]),
-  AM_CONDITIONAL([STARPU_HAVE_AM111], [false]))
-
-###############################################################################
-#                                                                             #
-#                                Final settings                               #
-#                                                                             #
-###############################################################################
-
 # File configuration
 AC_CONFIG_COMMANDS([executable-scripts], [
   chmod +x tests/regression/regression.sh
@@ -1816,11 +1681,3 @@ AC_MSG_NOTICE([
 	       SOCL enabled:  $build_socl
                Scheduler Hypervisor: $build_sched_ctx_hypervisor
 ])
-
-if test x"$have_valid_hwloc" = xno
-then
-  AC_MSG_NOTICE([
-WARNING: hwloc was not enabled.  If the target machine is hyperthreaded the
-performance may be impacted a lot.  It is strongly recommended to install
-hwloc])
-fi

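For reference, the AC_LANG_PROGRAM pthread probe whose quoting this hunk reverts boils down to compiling the following throwaway C program (an editorial reconstruction; autoconf wraps the body in main(), and configure only compiles and links the result, it never runs it):

    #include <pthread.h>

    /* Probe body as generated by AC_LANG_PROGRAM above. The NULL start
     * routine is harmless because the binary is only compiled and linked,
     * never executed. */
    int main(void)
    {
        pthread_t t;
        pthread_create(&t, NULL, NULL, NULL);
        return 0;
    }
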
+ 4 - 19
doc/Makefile.am

@@ -34,33 +34,18 @@ starpu_TEXINFOS = chapters/advanced-api.texi \
 	chapters/using.texi \
 	chapters/vector_scal_opencl.texi \
 	chapters/socl.texi \
-	chapters/sched_ctx_hypervisor.texi \
-	chapters/version.texi
+	chapters/sched_ctx_hypervisor.texi
 
 MAINTAINERCLEANFILES = starpu.pdf
 
-EXTRA_DIST = starpu.css
-
-dist_pdf_DATA = starpu.pdf
+EXTRA_DIST = starpu.pdf \
+	starpu.css
 
 AM_MAKEINFOHTMLFLAGS = --css-include=$(top_srcdir)/doc/starpu.css --no-headers --no-split
 
 uninstall-local:
 	$(RM) $(DESTDIR)$(infodir)/dir
 
-#TODO: when stat is not available on the machine, insert "unknown date"
-chapters/version.texi:
-	@for f in $(starpu_TEXINFOS) ; do \
-                if test -f $(top_srcdir)/doc/$$f ; then stat --format=%Y $(top_srcdir)/doc/$$f ; fi \
-        done | sort -r | head -1 > timestamp
-	@LC_ALL=C date --date=@`cat timestamp` +"%d %B %Y" > timestamp_updated
-	@LC_ALL=C date --date=@`cat timestamp` +"%B %Y" > timestamp_updated_month
-	@echo "@set UPDATED " `cat timestamp_updated` > $(top_srcdir)/doc/chapters/version.texi
-	@echo "@set UPDATED-MONTH" `cat timestamp_updated_month` >> $(top_srcdir)/doc/chapters/version.texi
-	@echo "@set EDITION $(VERSION)" >> $(top_srcdir)/doc/chapters/version.texi
-	@echo "@set VERSION $(VERSION)" >> $(top_srcdir)/doc/chapters/version.texi
-	@$(RM) timestamp timestamp_updated timestamp_updated_month
-
 #$(top_srcdir)/doc/starpu.texi: vector_scal_c.texi vector_scal_cuda.texi vector_scal_opencl.texi vector_scal_opencl_codelet.texi
 #vector_scal_c.texi: $(top_srcdir)/examples/basic_examples/vector_scal.c
 #	cat $< | sed 's/{/@{/g' | sed 's/}/@}/g' | sed 's/\t/    /g' > $@
@@ -75,7 +60,7 @@ chapters/version.texi:
 #	vector_scal_c.texi vector_scal_cuda.texi vector_scal_opencl.texi vector_scal_opencl_codelet.texi
 
 # Rule to update documentation on web server. Should only be used locally.
-PUBLISHHOST	?= sync
+PUBLISHHOST	:= sync
 update-web: starpu.html
 	sed -i 's/gcc\.html#Attribute-Syntax/http:\/\/gcc.gnu.org\/onlinedocs\/gcc\/Attribute-Syntax.html#Attribute-Syntax/' starpu.html
 	scp starpu.pdf starpu.html $(PUBLISHHOST):/web/runtime/html/StarPU

+ 3 - 94
doc/chapters/advanced-api.texi

@@ -11,10 +11,9 @@
 * Multiformat Data Interface::  
 * Task Bundles::                
 * Task Lists::                  
-* Using Parallel Tasks::        
-* Scheduling Contexts::         
+* Using Parallel Tasks::       
+* Scheduling Contexts::
 * Defining a new scheduling policy::  
-* Running drivers::             
 * Expert mode::                 
 @end menu
 
@@ -70,18 +69,6 @@ An identifier that is unique to each interface.
 @item @code{size_t interface_size}
 The size of the interface data descriptor.
 
-@item @code{int is_multiformat}
-todo
-
-@item @code{struct starpu_multiformat_data_interface_ops* (*get_mf_ops)(void *data_interface)}
-todo
-
-@item @code{int (*pack_data)(starpu_data_handle_t handle, uint32_t node, void **ptr)}
-Pack the data handle into a contiguous buffer at the address @code{ptr}
-
-@item @code{int (*unpack_data)(starpu_data_handle_t handle, uint32_t node, void *ptr)}
-Unpack the data handle from the contiguous buffer at the address @code{ptr}
-
 @end table
 @end deftp
 
@@ -173,7 +160,7 @@ struct starpu_complex_interface
 @end cartouche
 
 Registering such a data to StarPU is easily done using the function
-@code{starpu_data_register} (@pxref{Basic Data Management API}). The last
+@code{starpu_data_register} (@pxref{Basic Data Library API}). The last
 parameter of the function, @code{interface_complex_ops}, will be
 described below.
 
@@ -371,17 +358,6 @@ This function mustn't be called if @var{bundle} is already closed and/or @var{ta
 Inform the runtime that the user won't modify @var{bundle} anymore, it means no more inserting or removing task. Thus the runtime can destroy it when possible.
 @end deftypefun
 
-@deftypefun double starpu_task_bundle_expected_length (starpu_task_bundle_t @var{bundle}, {enum starpu_perf_archtype} @var{arch}, unsigned @var{nimpl})
-Return the expected duration of the entire task bundle in µs.
-@end deftypefun
-
-@deftypefun double starpu_task_bundle_expected_power (starpu_task_bundle_t @var{bundle}, enum starpu_perf_archtype @var{arch}, unsigned @var{nimpl})
-Return the expected power consumption of the entire task bundle in J.
-@end deftypefun
-
-@deftypefun double starpu_task_bundle_expected_data_transfer_time (starpu_task_bundle_t @var{bundle}, unsigned @var{memory_node})
-Return the time (in µs) expected to transfer all data used within the bundle.
-@end deftypefun
 
 @node Task Lists
 @section Task Lists
@@ -816,73 +792,6 @@ static struct starpu_sched_policy dummy_sched_policy = @{
 @end smallexample
 @end cartouche
 
-@node Running drivers
-@section Running drivers
-
-@menu
-* Driver API::
-* Running drivers Example::
-@end menu
-
-@node Driver API
-@subsection Driver API
-
-@deftypefun int starpu_driver_run ({struct starpu_driver *}@var{d})
-Initialize the given driver, run it until it receives a request to terminate,
-deinitialize it and return 0 on success. It returns -EINVAL if @code{d->type}
-is not a valid StarPU device type (STARPU_CPU_WORKER, STARPU_CUDA_WORKER or
-STARPU_OPENCL_WORKER). This is the same as using the following
-functions: calling @code{starpu_driver_init()}, then calling
-@code{starpu_driver_run_once()} in a loop, and eventually
-@code{starpu_driver_deinit()}.
-@end deftypefun
-
-@deftypefun int starpu_driver_init (struct starpu_driver *@var{d})
-Initialize the given driver. Returns 0 on success, -EINVAL if
-@code{d->type} is not a valid StarPU device type (STARPU_CPU_WORKER,
-STARPU_CUDA_WORKER or STARPU_OPENCL_WORKER).
-@end deftypefun
-
-@deftypefun int starpu_driver_run_once (struct starpu_driver *@var{d})
-Run the driver once, then returns 0 on success, -EINVAL if
-@code{d->type} is not a valid StarPU device type (STARPU_CPU_WORKER,
-STARPU_CUDA_WORKER or STARPU_OPENCL_WORKER).
-@end deftypefun
-
-@deftypefun int starpu_driver_deinit (struct starpu_driver *@var{d})
-Deinitialize the given driver. Returns 0 on success, -EINVAL if
-@code{d->type} is not a valid StarPU device type (STARPU_CPU_WORKER,
-STARPU_CUDA_WORKER or STARPU_OPENCL_WORKER).
-@end deftypefun
-
-@deftypefun void starpu_drivers_request_termination (void)
-Notify all running drivers they should terminate.
-@end deftypefun
-
-@node Running drivers Example
-@subsection Example
-
-@cartouche
-@smallexample
-int ret;
-struct starpu_driver d = @{
-    .type = STARPU_CUDA_WORKER,
-    .id.cuda_id = 0
-@};
-ret = starpu_driver_init(&d);
-if (ret != 0)
-    error();
-while (some_condition) @{
-    ret = starpu_driver_run_once(&d);
-    if (ret != 0)
-        error();
-@}
-ret = starpu_driver_deinit(&d);
-if (ret != 0)
-    error();
-@end smallexample
-@end cartouche
-
 @node Expert mode
 @section Expert mode
 

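To make the custom data interface discussion in the hunk above concrete: registering one complex number with starpu_data_register could be wrapped as sketched below. The struct fields (real, imaginary, nx) and the helper name starpu_complex_data_register follow the examples/interface sources this commit touches, but since the struct body is truncated in this hunk they should be read as assumptions.

    #include <starpu.h>

    /* Assumed layout of the interface descriptor from examples/interface. */
    struct starpu_complex_interface
    {
        double *real;
        double *imaginary;
        int nx;
    };

    /* Ops structure that the text above says is "described below". */
    extern struct starpu_data_interface_ops interface_complex_ops;

    /* Sketch: build the descriptor on the stack and hand it to
     * starpu_data_register, which copies its contents. */
    void starpu_complex_data_register(starpu_data_handle_t *handle,
                                      uint32_t home_node,
                                      double *real, double *imaginary, int nx)
    {
        struct starpu_complex_interface complex =
        {
            .real      = real,
            .imaginary = imaginary,
            .nx        = nx
        };

        starpu_data_register(handle, home_node, &complex, &interface_complex_ops);
    }
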
+ 21 - 158
doc/chapters/advanced-examples.texi

@@ -15,7 +15,6 @@
 * Theoretical lower bound on execution time::  
 * Insert Task Utility::          
 * Data reduction::  
-* Temporary buffers::  
 * Parallel Tasks::
 * Debugging::
 * The multiformat interface::
@@ -236,7 +235,7 @@ starpu_data_partition(handle, &f);
 @end smallexample
 @end cartouche
 
-The task submission then uses @code{starpu_data_get_sub_data} to retrieve the
+The task submission then uses @code{starpu_data_get_sub_data} to retrive the
 sub-handles to be passed as tasks parameters.
 
 @cartouche
@@ -294,10 +293,6 @@ __kernel void opencl_kernel(__global int *vector, unsigned offset)
 @end smallexample
 @end cartouche
 
-StarPU provides various interfaces and filters for matrices, vectors, etc.,
-but applications can also write their own data interfaces and filters, see
-@code{examples/interface} and @code{examples/filters/custom_mf} for an example.
-
 @node Performance model example
 @section Performance model example
 
@@ -307,9 +302,7 @@ a performance model, by defining a @code{starpu_perfmodel} structure and
 providing its address in the @code{model} field of the @code{struct starpu_codelet}
 structure. The @code{symbol} and @code{type} fields of @code{starpu_perfmodel}
 are mandatory, to give a name to the model, and the type of the model, since
-there are several kinds of performance models. For compatibility, make sure to
-initialize the whole structure to zero, either by using explicit memset, or by
-letting the compiler implicitly do it as examplified below.
+there are several kinds of performance models.
 
 @itemize
 @item
@@ -324,13 +317,9 @@ and ouput sizes as an index.
 It will also save it in @code{~/.starpu/sampling/codelets}
 for further executions, and can be observed by using the
 @code{starpu_perfmodel_display} command, or drawn by using
-the @code{starpu_perfmodel_plot} (@pxref{Performance model calibration}).  The
-models are indexed by machine name. To
+the @code{starpu_perfmodel_plot}.  The models are indexed by machine name. To
 share the models between machines (e.g. for a homogeneous cluster), use
-@code{export STARPU_HOSTNAME=some_global_name}. Measurements are only done
-when using a task scheduler which makes use of it, such as @code{heft} or
-@code{dmda}. Measurements can also be provided explicitly by the application, by
-using the @code{starpu_perfmodel_update_history} function.
+@code{export STARPU_HOSTNAME=some_global_name}. Measurements are only done when using a task scheduler which makes use of it, such as @code{heft} or @code{dmda}.
 
 The following is a small code example.
 
@@ -360,37 +349,19 @@ struct starpu_codelet cl = @{
 
 @item
 Measured at runtime and refined by regression (@code{STARPU_*REGRESSION_BASED}
-model type). This still assumes performance regularity, but works
+model type). This still assumes performance regularity, but can work
 with various data input sizes, by applying regression over observed
 execution times. STARPU_REGRESSION_BASED uses an a*n^b regression
 form, STARPU_NL_REGRESSION_BASED uses an a*n^b+c (more precise than
-STARPU_REGRESSION_BASED, but costs a lot more to compute).
-
-For instance,
+STARPU_REGRESSION_BASED, but costs a lot more to compute). For instance,
 @code{tests/perfmodels/regression_based.c} uses a regression-based performance
-model for the @code{memset} operation.
-
-Of course, the application has to issue
+model for the @code{memset} operation. Of course, the application has to issue
 tasks with varying size so that the regression can be computed. StarPU will not
 trust the regression unless there is at least 10% difference between the minimum
-and maximum observed input size. It can be useful to set the
-@code{STARPU_CALIBRATE} environment variable to @code{1} and run the application
-on varying input sizes, so as to feed the performance model for a variety of
-inputs, or to provide the measurements explictly by using
-@code{starpu_perfmodel_update_history}. The @code{starpu_perfmodel_display} and
-@code{starpu_perfmodel_plot}
-tools can be used to observe how much the performance model is calibrated (@pxref{Performance model calibration}); when
-their output look good, @code{STARPU_CALIBRATE} can be reset to @code{0} to let
-StarPU use the resulting performance model without recording new measures. If
-the data input sizes vary a lot, it is really important to set
-@code{STARPU_CALIBRATE} to @code{0}, otherwise StarPU will continue adding the
-measures, and result with a very big performance model, which will take time a
-lot of time to load and save.
-
-For non-linear regression, since computing it
+and maximum observed input size. For non-linear regression, since computing it
 is quite expensive, it is only done at termination of the application. This
-means that the first execution of the application will use only history-based
-performance model to perform scheduling, without using regression.
+means that the first execution uses history-based performance model to perform
+scheduling.
 
 @item
 Provided as an estimation from the application itself (@code{STARPU_COMMON} model type and @code{cost_function} field),
@@ -434,7 +405,7 @@ needs to be called to destroy the dummy task afterwards. See
 @node Theoretical lower bound on execution time
 @section Theoretical lower bound on execution time
 
-For kernels with history-based performance models (and provided that they are completely calibrated), StarPU can very easily provide a theoretical lower
+For kernels with history-based performance models, StarPU can very easily provide a theoretical lower
 bound for the execution time of a whole set of tasks. See for
 instance @code{examples/lu/lu_example.c}: before submitting tasks,
 call @code{starpu_bound_start}, and after complete execution, call
@@ -490,17 +461,12 @@ The arguments following the codelets can be of the following types:
 @item
 @code{STARPU_R}, @code{STARPU_W}, @code{STARPU_RW}, @code{STARPU_SCRATCH}, @code{STARPU_REDUX} an access mode followed by a data handle;
 @item
-@code{STARPU_DATA_ARRAY} followed by an array of data handles and its number of elements;
-@item
 the specific values @code{STARPU_VALUE}, @code{STARPU_CALLBACK},
 @code{STARPU_CALLBACK_ARG}, @code{STARPU_CALLBACK_WITH_ARG},
 @code{STARPU_PRIORITY}, followed by the appropriated objects as
 defined below.
 @end itemize
 
-When using @code{STARPU_DATA_ARRAY}, the access mode of the data
-handles is not defined.
-
 Parameters to be passed to the codelet implementation are defined
 through the type @code{STARPU_VALUE}. The function
 @code{starpu_codelet_unpack_args} must be called within the codelet
@@ -600,16 +566,6 @@ task->cl_arg_size = arg_buffer_size;
 int ret = starpu_task_submit(task);
 @end smallexample
 
-Here a similar call using @code{STARPU_DATA_ARRAY}.
-
-@smallexample
-starpu_insert_task(&mycodelet,
-                   STARPU_DATA_ARRAY, data_handles, 2,
-                   STARPU_VALUE, &ifactor, sizeof(ifactor),
-                   STARPU_VALUE, &ffactor, sizeof(ffactor),
-                   0);
-@end smallexample
-
 If some part of the task insertion depends on the value of some computation,
 the @code{STARPU_DATA_ACQUIRE_CB} macro can be very convenient. For
 instance, assuming that the index variable @code{i} was registered as handle
@@ -696,8 +652,8 @@ and dtq_handle can now be used in @code{STARPU_REDUX} mode for the dot products
 with partitioned vectors:
 
 @smallexample
-int dots(starpu_data_handle_t v1, starpu_data_handle_t v2,
-         starpu_data_handle_t s, unsigned nblocks)
+int dots(starpu_data_handle v1, starpu_data_handle v2,
+         starpu_data_handle s, unsigned nblocks)
 @{
     starpu_insert_task(&bzero_variable_cl, STARPU_W, s, 0);
     for (b = 0; b < nblocks; b++)
@@ -712,64 +668,6 @@ int dots(starpu_data_handle_t v1, starpu_data_handle_t v2,
 The @code{cg} example also uses reduction for the blocked gemv kernel, leading
 to yet more relaxed dependencies and more parallelism.
 
-@node Temporary buffers
-@section Temporary buffers
-
-There are two kinds of temporary buffers: temporary data which just pass results
-from a task to another, and scratch data which are needed only internally by
-tasks.
-
-@subsection Temporary data
-
-Data can sometimes be entirely produced by a task, and entirely consumed by
-another task, without the need for other parts of the application to access
-it. In such case, registration can be done without prior allocation, by using
-the special -1 memory node number, and passing a zero pointer. StarPU will
-actually allocate memory only when the task creating the content gets scheduled,
-and destroy it on unregistration.
-
-In addition to that, it can be tedious for the application to have to unregister
-the data, since it will not use its content anyway. The unregistration can be
-done lazily by using the @code{starpu_data_unregister_submit(handle)} function,
-which will record that no more tasks accessing the handle will be submitted, so
-that it can be freed as soon as the last task accessing it is over.
-
-The following code examplifies both points: it registers the temporary
-data, submits three tasks accessing it, and records the data for automatic
-unregistration.
-
-@smallexample
-starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
-starpu_insert_task(&produce_data, STARPU_W, handle, 0);
-starpu_insert_task(&compute_data, STARPU_RW, handle, 0);
-starpu_insert_task(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0);
-starpu_data_unregister_submit(handle);
-@end smallexample
-
-@subsection Scratch data
-
-Some kernels sometimes need temporary data to achieve the computations, i.e. a
-workspace. The application could allocate it at the start of the codelet
-function, and free it at the end, but that would be costly. It could also
-allocate one buffer per worker (similarly to @ref{Per-worker library
-initialization }), but that would make them systematic and permanent. A more
-optimized way is to use the SCRATCH data access mode, as examplified below,
-which provides per-worker buffers without content consistency.
-
-@smallexample
-starpu_vector_data_register(&workspace, -1, 0, sizeof(float));
-for (i = 0; i < N; i++)
-    starpu_insert_task(&compute, STARPU_R, input[i], STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);
-@end smallexample
-
-StarPU will make sure that the buffer is allocated before executing the task,
-and make this allocation per-worker: for CPU workers, notably, each worker has
-its own buffer. This means that each task submitted above will actually have its
-own workspace, which will actually be the same for all tasks running one after
-the other on the same worker. Also, if for instance GPU memory becomes scarce,
-StarPU will notice that it can free such buffers easily, since the content does
-not matter.
-
 @node Parallel Tasks
 @section Parallel Tasks
 
@@ -889,20 +787,6 @@ topology (NUMA node, socket, cache, ...) a combined worker will be created. If
 some nodes of the hierarchy have a big arity (e.g. many cores in a socket
 without a hierarchy of shared caches), StarPU will create combined workers of
 intermediate sizes.
-The user can give some hints to StarPU about combined workers sizes to favor.
-This can be done by using the environment variables @code{STARPU_MIN_WORKERSIZE}
-and @code{STARPU_MAX_WORKERSIZE}. When set, they will force StarPU to create the
-biggest combined workers possible without overstepping the defined boundaries.
-However, StarPU will create the remaining combined workers without abiding by
-the rules if not possible.
-For example : if the user specifies a minimum and maximum combined workers size
-of 3 on a machine containing 8 CPUs, StarPU will create a combined worker of
-size 2 beside the combined workers of size 3.
-
-The combined workers actually produced can be seen in the output of the
-@code{starpu_machine_display} tool (the @code{STARPU_SCHED} environment variable
-has to be set to a combined worker-aware scheduler such as @code{pheft} or
-@code{pgreedy}).
 
 @subsection Concurrent parallel tasks
 
@@ -1029,51 +913,30 @@ Graphical-oriented applications need to draw the result of their computations,
 typically on the very GPU where these happened. Technologies such as OpenGL/CUDA
 interoperability permit to let CUDA directly work on the OpenGL buffers, making
 them thus immediately ready for drawing, by mapping OpenGL buffer, textures or
-renderbuffer objects into CUDA.  CUDA however imposes some technical
-constraints: peer memcpy has to be disabled, and the thread that runs OpenGL has
-to be the one that runs CUDA computations for that GPU.
-
-To achieve this with StarPU, pass the @code{--disable-cuda-memcpy-peer} option
-to @code{./configure} (TODO: make it dynamic), OpenGL/GLUT has to be initialized
-first, and the interoperability mode has to
-be enabled by using the @code{cuda_opengl_interoperability} field of the
-@code{starpu_conf} structure, and the driver loop has to be run by
-the application, by using the @code{not_launched_drivers} field of
-@code{starpu_conf} to prevent StarPU from running it in a separate thread, and
-by using @code{starpu_driver_run} to run the loop. The @code{gl_interop} and
-@code{gl_interop_idle} examples shows how it articulates in a simple case, where
-rendering is done in task callbacks. The former uses @code{glutMainLoopEvent}
-to make GLUT progress from the StarPU driver loop, while the latter uses
-@code{glutIdleFunc} to make StarPU progress from the GLUT main loop.
-
-Then, to use an OpenGL buffer as a CUDA data, StarPU simply needs to be given
-the CUDA pointer at registration, for instance:
+renderbuffer objects into CUDA. To achieve this with StarPU, it simply needs to
+be given the CUDA pointer at registration, for instance:
 
 @cartouche
 @smallexample
-/* Get the CUDA worker id */
 for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
         if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)
                 break;
 
-/* Build a CUDA pointer pointing at the OpenGL buffer */
+cudaSetDevice(starpu_worker_get_devid(workerid));
 cudaGraphicsResourceGetMappedPointer((void**)&output, &num_bytes, resource);
-
-/* And register it to StarPU */
 starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid), output, num_bytes / sizeof(float4), sizeof(float4));
 
-/* The handle can now be used as usual */
 starpu_insert_task(&cl, STARPU_RW, handle, 0);
 
-/* ... */
-
-/* This gets back data into the OpenGL buffer */
 starpu_data_unregister(handle);
+
+cudaSetDevice(starpu_worker_get_devid(workerid));
+cudaGraphicsUnmapResources(1, &resource, 0);
+
+/* Now display it */
 @end smallexample
 @end cartouche
 
-and display it e.g. in the callback function.
-
 @node More examples
 @section More examples
 

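The dots() function in the reduction hunk above relies on dtq_handle having been declared as a reduction variable beforehand. A minimal sketch of that setup follows; the codelet names accumulate_variable_cl and bzero_variable_cl are taken from the full cg example and are assumptions here.

    #include <starpu.h>

    starpu_data_handle_t dtq_handle;

    extern struct starpu_codelet accumulate_variable_cl; /* assumed: combines two contributions */
    extern struct starpu_codelet bzero_variable_cl;      /* assumed: zero-initializes a copy */

    /* Sketch: register a lazily-allocated variable (home node -1, NULL
     * pointer, as the removed "Temporary buffers" text describes) and
     * declare how per-worker reduction copies are initialized and merged,
     * so that tasks may then access it in STARPU_REDUX mode as in dots(). */
    void declare_dot_reduction(void)
    {
        starpu_variable_data_register(&dtq_handle, -1, (uintptr_t) NULL, sizeof(double));
        starpu_data_set_reduction_methods(dtq_handle,
                                          &accumulate_variable_cl,
                                          &bzero_variable_cl);
    }
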
+ 82 - 395
doc/chapters/basic-api.texi

@@ -9,7 +9,7 @@
 @menu
 * Initialization and Termination::  Initialization and Termination methods
 * Workers' Properties::         Methods to enumerate workers' properties
-* Data Management::                Methods to manipulate data
+* Data Library::                Methods to manipulate data
 * Data Interfaces::
 * Data Partition::
 * Codelets and Tasks::          Methods to construct tasks
@@ -36,27 +36,9 @@ Upon successful completion, this function returns 0. Otherwise, @code{-ENODEV}
 indicates that no worker was available (so that StarPU was not initialized).
 @end deftypefun
 
-@deftp {Data Type} {struct starpu_driver}
-@table @asis
-@item @code{enum starpu_archtype type}
-The type of the driver. Only STARPU_CPU_DRIVER, STARPU_CUDA_DRIVER and
-STARPU_OPENCL_DRIVER are currently supported.
-@item @code{union id} Anonymous union
-@table @asis
-@item @code{unsigned cpu_id}
-Should only be used if type is STARPU_CPU_WORKER.
-@item @code{unsigned cuda_id}
-Should only be used if type is STARPU_CUDA_WORKER.
-@item @code{cl_device_id opencl_id}
-Should only be used if type is STARPU_OPENCL_WORKER.
-@end table
-@end table
-@end deftp
-
-
 @deftp {Data Type} {struct starpu_conf}
 This structure is passed to the @code{starpu_init} function in order
-to configure StarPU. It has to be initialized with @code{starpu_conf_init}.
+to configure StarPU.
 When the default value is used, StarPU automatically selects the number of
 processing units and takes the default scheduling policy. The environment
 variables overwrite the equivalent parameters.
@@ -72,7 +54,7 @@ if @code{sched_policy_name} is set.
 
 @item @code{int ncpus} (default = -1)
 This is the number of CPU cores that StarPU can use. This can also be
-specified with the @code{STARPU_NCPU} environment variable.
+specified with the @code{STARPU_NCPUS} environment variable.
 
 @item @code{int ncuda} (default = -1)
 This is the number of CUDA devices that StarPU can use. This can also
@@ -123,19 +105,11 @@ contains the logical identifiers of the OpenCL devices to be used.
 
 @item @code{int calibrate} (default = 0)
 If this flag is set, StarPU will calibrate the performance models when
-executing tasks. If this value is equal to @code{-1}, the default value is
-used. If the value is equal to @code{1}, it will force continuing
-calibration. If the value is equal to @code{2}, the existing performance
-models will be overwritten. This can also be specified with the
-@code{STARPU_CALIBRATE} environment variable.
-
-@item @code{int bus_calibrate} (default = 0)
-If this flag is set, StarPU will recalibrate the bus.  If this value is equal
-to @code{-1}, the default value is used.  This can also be specified with the
-@code{STARPU_BUS_CALIBRATE} environment variable.
+executing tasks. If this value is equal to -1, the default value is used. This
+can also be specified with the @code{STARPU_CALIBRATE} environment variable.
 
 @item @code{int single_combined_worker} (default = 0)
-By default, StarPU executes parallel tasks concurrently.
+By default, StarPU parallel tasks concurrently.
 Some parallel libraries (e.g. most OpenMP implementations) however do
 not support concurrent calls to parallel code. In such case, setting this flag
 makes StarPU only start one parallel task at a time.
@@ -143,46 +117,11 @@ This can also be specified with the @code{STARPU_SINGLE_COMBINED_WORKER} environ
 
 @item @code{int disable_asynchronous_copy} (default = 0)
 This flag should be set to 1 to disable asynchronous copies between
-CPUs and all accelerators. This can also be specified with the
+CPUs and accelerators. This can also be specified with the
 @code{STARPU_DISABLE_ASYNCHRONOUS_COPY} environment variable.
 The AMD implementation of OpenCL is known to
 fail when copying data asynchronously. When using this implementation,
 it is therefore necessary to disable asynchronous data transfers.
-This can also be specified at compilation time by giving to the
-configure script the option @code{--disable-asynchronous-copy}.
-
-@item @code{int disable_cuda_asynchronous_copy} (default = 0)
-This flag should be set to 1 to disable asynchronous copies between
-CPUs and CUDA accelerators. This can also be specified with the
-@code{STARPU_DISABLE_CUDA_ASYNCHRONOUS_COPY} environment variable.
-This can also be specified at compilation time by giving to the
-configure script the option @code{--disable-asynchronous-cuda-copy}.
-
-@item @code{int disable_opencl_asynchronous_copy} (default = 0)
-This flag should be set to 1 to disable asynchronous copies between
-CPUs and OpenCL accelerators. This can also be specified with the
-@code{STARPU_DISABLE_OPENCL_ASYNCHRONOUS_COPY} environment variable.
-The AMD implementation of OpenCL is known to
-fail when copying data asynchronously. When using this implementation,
-it is therefore necessary to disable asynchronous data transfers.
-This can also be specified at compilation time by giving to the
-configure script the option @code{--disable-asynchronous-opencl-copy}.
-
-@item @code{int *cuda_opengl_interoperability} (default = NULL)
-This can be set to an array of CUDA device identifiers for which
-@code{cudaGLSetGLDevice} should be called instead of @code{cudaSetDevice}. Its
-size is specified by the @code{n_cuda_opengl_interoperability} field below
-
-@item @code{int *n_cuda_opengl_interoperability} (default = 0)
-This has to be set to the size of the array pointed to by the
-@code{cuda_opengl_interoperability} field.
-
-@item @code{struct starpu_driver *not_launched_drivers}
-The drivers that should not be launched by StarPU.
-
-@item @code{unsigned nnot_launched_drivers}
-The number of StarPU drivers that should not be launched by StarPU.
-
 @end table
 @end deftp
 
@@ -192,7 +131,7 @@ with the default values. In case some configuration parameters are already
 specified through environment variables, @code{starpu_conf_init} initializes
 the fields of the structure according to the environment variables. For
 instance if @code{STARPU_CALIBRATE} is set, its value is put in the
-@code{.calibrate} field of the structure passed as argument.
+@code{.ncuda} field of the structure passed as argument.
 
 Upon successful completion, this function returns 0. Otherwise, @code{-EINVAL}
 indicates that the argument was NULL.
@@ -209,16 +148,6 @@ Return 1 if asynchronous data transfers between CPU and accelerators
 are disabled.
 @end deftypefun
 
-@deftypefun int starpu_asynchronous_cuda_copy_disabled ()
-Return 1 if asynchronous data transfers between CPU and CUDA accelerators
-are disabled.
-@end deftypefun
-
-@deftypefun int starpu_asynchronous_opencl_copy_disabled ()
-Return 1 if asynchronous data transfers between CPU and OpenCL accelerators
-are disabled.
-@end deftypefun
-
 @node Workers' Properties
 @section Workers' Properties
 
@@ -238,8 +167,8 @@ StarPU tasks). The returned value should be at most @code{STARPU_NMAXWORKERS}.
 @end deftypefun
 
 @deftypefun int starpu_worker_get_count_by_type ({enum starpu_archtype} @var{type})
-Returns the number of workers of the given @var{type}. A positive
-(or @code{NULL}) value is returned in case of success, @code{-EINVAL} indicates that
+Returns the number of workers of the given type indicated by the argument. A positive
+(or null) value is returned in case of success, @code{-EINVAL} indicates that
 the type is not valid otherwise.
 @end deftypefun
 
@@ -335,12 +264,12 @@ this function should be used in the allocation function to determine
 on which device the memory needs to be allocated.
 @end deftypefun
 
-@node Data Management
-@section Data Management
+@node Data Library
+@section Data Library
 
 @menu
-* Introduction to Data Management::
-* Basic Data Management API::
+* Introduction to Data Library::
+* Basic Data Library API::
 * Access registered data from the application::
 @end menu
 
@@ -349,7 +278,7 @@ This section describes the data management facilities provided by StarPU.
 We show how to use existing data interfaces in @ref{Data Interfaces}, but developers can
 design their own data interfaces if required.
 
-@node Introduction to Data Management
+@node Introduction to Data Library
 @subsection Introduction
 Data management is done at a high-level in StarPU: rather than accessing a mere
 list of contiguous buffers, the tasks may manipulate data that are described by
@@ -377,8 +306,8 @@ to StarPU, the specified memory node indicates where the piece of data
 initially resides (we also call this memory node the home node of a piece of
 data).
 
-@node Basic Data Management API
-@subsection Basic Data Management API
+@node Basic Data Library API
+@subsection Basic Data Library API
 
 @deftypefun int starpu_malloc (void **@var{A}, size_t @var{dim})
 This function allocates data of the given size in main memory. It will also try to pin it in
@@ -468,10 +397,6 @@ access to the handle must be performed in write-only mode. Accessing an
 invalidated data in read-mode results in undefined behaviour.
 @end deftypefun
 
-@deftypefun void starpu_data_invalidate_submit (starpu_data_handle_t @var{handle})
-Submits invalidation of the data handle after completion of previously submitted tasks.
-@end deftypefun
-
 @c TODO create a specific section about user interaction with the DSM?
 
 @deftypefun void starpu_data_set_wt_mask (starpu_data_handle_t @var{handle}, uint32_t @var{wt_mask})
@@ -527,7 +452,7 @@ be consistent with the access mode specified in the @var{mode} argument.
 access the piece of data anymore.  Note that implicit data
 dependencies are also enforced by @code{starpu_data_acquire}, i.e.
 @code{starpu_data_acquire} will wait for all tasks scheduled to work on
 the data, unless they have been disabled explicitly by calling
 @code{starpu_data_set_default_sequential_consistency_flag} or
 @code{starpu_data_set_sequential_consistency_flag}.
 @code{starpu_data_acquire} is a blocking call, so it cannot be called from
@@ -538,28 +463,18 @@ tasks or from their callbacks (in that case, @code{starpu_data_acquire} returns
 
 @deftypefun int starpu_data_acquire_cb (starpu_data_handle_t @var{handle}, {enum starpu_access_mode} @var{mode}, void (*@var{callback})(void *), void *@var{arg})
 @code{starpu_data_acquire_cb} is the asynchronous equivalent of
 @code{starpu_data_acquire}. When the data specified in the first argument is
 available in the appropriate access mode, the callback function is executed.
 The application may access the requested data during the execution of this
 callback. The callback function must call @code{starpu_data_release} once the
 application does not need to access the piece of data anymore.
 Note that implicit data dependencies are also enforced by
-@code{starpu_data_acquire_cb} in case they are not disabled.
+@code{starpu_data_acquire_cb} in case they are enabled.
  Contrary to @code{starpu_data_acquire}, this function is non-blocking and may
 be called from task callbacks. Upon successful completion, this function
 returns 0.
 @end deftypefun
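 
 The following sketch illustrates the pattern (assuming @code{vector_handle}
 was registered beforehand and is visible to the callback):
 
 @cartouche
 @smallexample
 static void callback(void *arg)
 @{
     /* The application may now safely access the data. */
     starpu_data_release(vector_handle);
 @}
 
 starpu_data_acquire_cb(vector_handle, STARPU_RW, callback, NULL);
 @end smallexample
 @end cartouche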
 
-@deftypefun int starpu_data_acquire_on_node (starpu_data_handle_t @var{handle}, unsigned @var{node}, {enum starpu_access_mode} @var{mode})
-This is the same as @code{starpu_data_acquire}, except that the data will be
-available on the given memory node instead of main memory.
-@end deftypefun
-
-@deftypefun int starpu_data_acquire_on_node_cb (starpu_data_handle_t @var{handle}, unsigned @var{node}, {enum starpu_access_mode} @var{mode}, void (*@var{callback})(void *), void *@var{arg})
-This is the same as @code{starpu_data_acquire_cb}, except that the data will be
-available on the given memory node instead of main memory.
-@end deftypefun
-
 @defmac STARPU_DATA_ACQUIRE_CB (starpu_data_handle_t @var{handle}, {enum starpu_access_mode} @var{mode}, code)
 @code{STARPU_DATA_ACQUIRE_CB} is the same as @code{starpu_data_acquire_cb},
 except that the code to be executed in a callback is directly provided as a
@@ -663,12 +578,12 @@ starpu_block_data_register(&block_handle, 0, (uintptr_t)block,
 @deftypefun void starpu_bcsr_data_register (starpu_data_handle_t *@var{handle}, uint32_t @var{home_node}, uint32_t @var{nnz}, uint32_t @var{nrow}, uintptr_t @var{nzval}, uint32_t *@var{colind}, uint32_t *@var{rowptr}, uint32_t @var{firstentry}, uint32_t @var{r}, uint32_t @var{c}, size_t @var{elemsize})
 This variant of @code{starpu_data_register} uses the BCSR (Blocked
 Compressed Sparse Row Representation) sparse matrix interface.
 Register the sparse matrix made of @var{nnz} non-zero blocks of elements of size
 @var{elemsize} stored in @var{nzval} and initializes @var{handle} to represent
 it. Blocks have size @var{r} * @var{c}. @var{nrow} is the number of rows (in
-terms of blocks), @code{colind[i]} is the block-column index for block @code{i}
-in @code{nzval}, @code{rowptr[i]} is the block-index (in nzval) of the first block of row @code{i}.
-@var{firstentry} is the index of the first entry of the given arrays (usually 0
+terms of blocks), @var{colind} gives the block-column positions of the
+non-zero entries in each row, @var{rowptr} gives the index (in @var{nzval})
+of the first entry of each row.
+@var{firstentry} is the index of the first entry of the given arrays (usually 0
 or 1).
 @end deftypefun
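 
 For instance (a sketch; @code{nnz}, @code{nrow}, @code{nzval}, @code{colind},
 @code{rowptr}, @code{r} and @code{c} are assumed to be set up by the
 application):
 
 @cartouche
 @smallexample
 starpu_data_handle_t bcsr_handle;
 starpu_bcsr_data_register(&bcsr_handle, 0, nnz, nrow,
                           (uintptr_t)nzval, colind, rowptr,
                           0, r, c, sizeof(float));
 @end smallexample
 @end cartouche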
 
@@ -732,23 +647,6 @@ if @var{handle}'s interface does not have data allocated locally
 Return the unique identifier of the interface associated with the given @var{handle}.
 @end deftypefun
 
-@deftypefun size_t starpu_handle_get_size (starpu_data_handle_t @var{handle})
-Return the size of the data associated with @var{handle}
-@end deftypefun
-
-@deftypefun int starpu_handle_pack_data (starpu_data_handle_t @var{handle}, {void **}@var{ptr})
-Allocates a buffer large enough at @var{ptr} and copy to the newly
-allocated buffer the data associated to @var{handle}. The interface of
-the data registered at @var{handle} must define a packing operation
-(@pxref{struct starpu_data_interface_ops}).
-@end deftypefun
-
-@deftypefun int starpu_handle_unpack_data (starpu_data_handle_t @var{handle}, {void *}@var{ptr})
-Copy in @var{handle} the data located at @var{ptr} as described by the
-interface of the data. The interface registered at @var{handle} must
-define a unpacking operation (@pxref{struct starpu_data_interface_ops}).
-@end deftypefun
-
 @node Accessing Variable Data Interfaces
 @subsubsection Variable Data Interfaces
 
@@ -1110,7 +1008,7 @@ subdata according to the filter @var{f}, as shown in the following example:
 @cartouche
 @smallexample
 struct starpu_data_filter f = @{
-    .filter_func = starpu_block_filter_func,
+    .filter_func = starpu_vertical_block_filter_func,
     .nchildren = nslicesx,
     .get_nchildren = NULL,
     .get_child_ops = NULL
@@ -1122,8 +1020,7 @@ starpu_data_partition(A_handle, &f);
 
 @deftypefun void starpu_data_unpartition (starpu_data_handle_t @var{root_data}, uint32_t @var{gathering_node})
 This unapplies one filter, thus unpartitioning the data. The pieces of data are
-collected back into one big piece in the @var{gathering_node} (usually 0). Tasks
-working on the partitioned data must be already finished when calling @code{starpu_data_unpartition}.
+collected back into one big piece in the @var{gathering_node} (usually 0).
 @cartouche
 @smallexample
 starpu_data_unpartition(A_handle, 0);
@@ -1176,16 +1073,38 @@ starpu_data_filter.
 @subsection Predefined filter functions
 
 @menu
-* Partitioning Vector Data::
-* Partitioning Matrix Data::
-* Partitioning 3D Matrix Data::
 * Partitioning BCSR Data::
+* Partitioning BLAS interface::
+* Partitioning Vector Data::
+* Partitioning Block Data::
 @end menu
 
 This section gives a partial list of the predefined partitioning functions.
 Examples on how to use them are shown in @ref{Partitioning Data}. The complete
 list can be found in @code{starpu_data_filters.h}.
 
+@node Partitioning BCSR Data
+@subsubsection Partitioning BCSR Data
+
+@deftypefun void starpu_canonical_block_filter_bcsr (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+This partitions a block-sparse matrix into dense matrices.
+@end deftypefun
+
+@deftypefun void starpu_vertical_block_filter_func_csr (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+This partitions a block-sparse matrix into vertical block-sparse matrices.
+@end deftypefun
+
+@node Partitioning BLAS interface
+@subsubsection Partitioning BLAS interface
+
+@deftypefun void starpu_block_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+This partitions a dense matrix into horizontal blocks.
+@end deftypefun
+
+@deftypefun void starpu_vertical_block_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+This partitions a dense matrix into vertical blocks.
+@end deftypefun
+
 @node Partitioning Vector Data
 @subsubsection Partitioning Vector Data
 
@@ -1195,18 +1114,6 @@ vector represented by @var{father_interface} once partitioned in
 @var{nparts} chunks of equal size.
 @end deftypefun
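 
 For instance, a vector can be split into @code{nparts} equal chunks as follows
 (a sketch; this assumes the filter described above is
 @code{starpu_block_filter_func_vector} and that @code{vector_handle} was
 registered beforehand):
 
 @cartouche
 @smallexample
 struct starpu_data_filter f = @{
     .filter_func = starpu_block_filter_func_vector,
     .nchildren = nparts
 @};
 starpu_data_partition(vector_handle, &f);
 @end smallexample
 @end cartouche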
 
-@deftypefun void starpu_block_shadow_filter_func_vector (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
-Return in @code{*@var{child_interface}} the @var{id}th element of the
-vector represented by @var{father_interface} once partitioned in
-@var{nparts} chunks of equal size with a shadow border @code{filter_arg_ptr}, thus getting a vector of size (n-2*shadow)/nparts+2*shadow 
-
-The @code{filter_arg_ptr} field must be the shadow size casted into @code{void*}.
-
-IMPORTANT: This can only be used for read-only access, as no coherency is
-enforced for the shadowed parts.
-
-A usage example is available in examples/filters/shadow.c
-@end deftypefun
 
 @deftypefun void starpu_vector_list_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
 Return in @code{*@var{child_interface}} the @var{id}th element of the
@@ -1227,107 +1134,11 @@ chunks of equal size, ignoring @var{nparts}.  Thus, @var{id} must be
 @end deftypefun
 
 
-@node Partitioning Matrix Data
-@subsubsection Partitioning Matrix Data
-
-@deftypefun void starpu_block_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
-This partitions a dense Matrix along the x dimension, thus getting (x/nparts,y)
-matrices. If nparts does not divide x, the last submatrix contains the
-remainder.
-@end deftypefun
-
-@deftypefun void starpu_block_shadow_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
-This partitions a dense Matrix along the x dimension, with a shadow border
-@code{filter_arg_ptr}, thus getting ((x-2*shadow)/nparts+2*shadow,y)
-matrices. If nparts does not divide x-2*shadow, the last submatrix contains the
-remainder.
-
-IMPORTANT: This can only be used for read-only access, as no coherency is
-enforced for the shadowed parts.
-
-A usage example is available in examples/filters/shadow2d.c
-@end deftypefun
-
-@deftypefun void starpu_vertical_block_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
-This partitions a dense Matrix along the y dimension, thus getting (x,y/nparts)
-matrices. If nparts does not divide y, the last submatrix contains the
-remainder.
-@end deftypefun
-
-@deftypefun void starpu_vertical_block_shadow_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
-This partitions a dense Matrix along the y dimension, with a shadow border
-@code{filter_arg_ptr}, thus getting (x,(y-2*shadow)/nparts+2*shadow)
-matrices. If nparts does not divide y-2*shadow, the last submatrix contains the
-remainder.
-
-IMPORTANT: This can only be used for read-only access, as no coherency is
-enforced for the shadowed parts.
-
-A usage example is available in examples/filters/shadow2d.c
-@end deftypefun
-
-@node Partitioning 3D Matrix Data
-@subsubsection Partitioning 3D Matrix Data
-
-A usage example is available in examples/filters/shadow3d.c
+@node Partitioning Block Data
+@subsubsection Partitioning Block Data
 
 @deftypefun void starpu_block_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
-This partitions a 3D matrix along the X dimension, thus getting (x/nparts,y,z)
-3D matrices. If nparts does not divide x, the last submatrix contains the
-remainder.
-@end deftypefun
-
-@deftypefun void starpu_block_shadow_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
-This partitions a 3D matrix along the X dimension, with a shadow border
-@code{filter_arg_ptr}, thus getting ((x-2*shadow)/nparts+2*shadow,y,z) 3D
-matrices. If nparts does not divide x, the last submatrix contains the
-remainder.
-
-IMPORTANT: This can only be used for read-only access, as no coherency is
-enforced for the shadowed parts.
-@end deftypefun
-
-@deftypefun void starpu_vertical_block_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
-This partitions a 3D matrix along the Y dimension, thus getting (x,y/nparts,z)
-3D matrices. If nparts does not divide y, the last submatrix contains the
-remainder.
-@end deftypefun
-
-@deftypefun void starpu_vertical_block_shadow_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
-This partitions a 3D matrix along the Y dimension, with a shadow border
-@code{filter_arg_ptr}, thus getting (x,(y-2*shadow)/nparts+2*shadow,z) 3D
-matrices. If nparts does not divide y, the last submatrix contains the
-remainder.
-
-IMPORTANT: This can only be used for read-only access, as no coherency is
-enforced for the shadowed parts.
-@end deftypefun
-
-@deftypefun void starpu_depth_block_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
-This partitions a 3D matrix along the Z dimension, thus getting (x,y,z/nparts)
-3D matrices. If nparts does not divide z, the last submatrix contains the
-remainder.
-@end deftypefun
-
-@deftypefun void starpu_depth_block_shadow_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
-This partitions a 3D matrix along the Z dimension, with a shadow border
-@code{filter_arg_ptr}, thus getting (x,y,(z-2*shadow)/nparts+2*shadow)
-3D matrices. If nparts does not divide z, the last submatrix contains the
-remainder.
-
-IMPORTANT: This can only be used for read-only access, as no coherency is
-enforced for the shadowed parts.
-@end deftypefun
-
-@node Partitioning BCSR Data
-@subsubsection Partitioning BCSR Data
-
-@deftypefun void starpu_canonical_block_filter_bcsr (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
-This partitions a block-sparse matrix into dense matrices.
-@end deftypefun
-
-@deftypefun void starpu_vertical_block_filter_func_csr (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
-This partitions a block-sparse matrix into vertical block-sparse matrices.
+This partitions a 3D matrix along the X axis.
 @end deftypefun
 
 @node Codelets and Tasks
@@ -1402,9 +1213,7 @@ always only define the field @code{opencl_funcs}.
 
 @deftp {Data Type} {struct starpu_codelet}
 The codelet structure describes a kernel that is possibly implemented on various
-targets. For compatibility, make sure to initialize the whole structure to zero,
-either by using explicit memset, or by letting the compiler implicitly do it in
-e.g. static storage case.
+targets. For compatibility, make sure to initialize the whole structure to zero.
 
 @table @asis
 @item @code{uint32_t where} (optional)
@@ -1420,9 +1229,7 @@ unset, its value will be automatically set based on the availability
 of the @code{XXX_funcs} fields defined below.
 
 @item @code{int (*can_execute)(unsigned workerid, struct starpu_task *task, unsigned nimpl)} (optional)
-Defines a function which should return 1 if the worker designated by
-@var{workerid} can execute the @var{nimpl}th implementation of the
-given @var{task}, 0 otherwise.
+Defines a function which should return 1 if the worker designated by @var{workerid} can execute the @var{nimpl}th implementation of the given @var{task}, 0 otherwise.
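 
 For instance, a minimal sketch of such a hook, which here simply restricts
 execution to CUDA workers (the actual condition is application-specific):
 
 @cartouche
 @smallexample
 static int can_execute(unsigned workerid, struct starpu_task *task,
                        unsigned nimpl)
 @{
     /* Run this codelet on CUDA workers only, whatever the implementation. */
     return starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER;
 @}
 @end smallexample
 @end cartouche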
 
 @item @code{enum starpu_codelet_type type} (optional)
 The default is @code{STARPU_SEQ}, i.e. usual sequential implementation. Other
@@ -1504,13 +1311,11 @@ option when configuring StarPU.
 
 @item @code{struct starpu_perfmodel *model} (optional)
 This is a pointer to the task duration performance model associated to this
-codelet. This optional field is ignored when set to @code{NULL} or
-when its @code{symbol} field is not set.
+codelet. This optional field is ignored when set to @code{NULL}.
 
 @item @code{struct starpu_perfmodel *power_model} (optional)
 This is a pointer to the task power consumption performance model associated
-to this codelet. This optional field is ignored when set to
-@code{NULL} or when its @code{symbol} field is not set.
+to this codelet. This optional field is ignored when set to @code{NULL}.
 In the case of parallel codelets, this has to account for all processing units
 involved in the parallel execution.
 
@@ -1609,10 +1414,9 @@ codelets, where the @code{cl_arg} pointer is given as such.
 @item @code{void (*callback_func)(void *)} (optional) (default: @code{NULL})
 This is a function pointer of prototype @code{void (*f)(void *)} which
 specifies a possible callback. If this pointer is non-null, the callback
-function is executed @emph{on the host} after the execution of the task. Tasks
-which depend on it might already be executing. The callback is passed the
-value contained in the @code{callback_arg} field. No callback is executed if the
-field is set to @code{NULL}.
+function is executed @emph{on the host} after the execution of the task. The
+callback is passed the value contained in the @code{callback_arg} field. No
+callback is executed if the field is set to @code{NULL}.
 
 @item @code{void *callback_arg} (optional) (default: @code{NULL})
 This is the pointer passed to the callback function. This field is ignored if
@@ -1716,7 +1520,7 @@ submitted if it has not been properly initialized.
 Initialize @var{task} with default values. This function is implicitly
 called by @code{starpu_task_create}. By default, tasks initialized with
 @code{starpu_task_init} must be deinitialized explicitly with
-@code{starpu_task_clean}. Tasks can also be initialized statically,
+@code{starpu_task_deinit}. Tasks can also be initialized statically,
 using @code{STARPU_TASK_INITIALIZER} defined below.
 @end deftypefun
 
@@ -1737,14 +1541,11 @@ by the task have to be freed by calling
 @code{starpu_task_destroy}.
 @end deftypefun
 
-@deftypefun void starpu_task_clean ({struct starpu_task} *@var{task})
+@deftypefun void starpu_task_deinit ({struct starpu_task} *@var{task})
 Release all the structures automatically allocated to execute @var{task}, but
-not the task structure itself and values set by the user remain unchanged.
-It is thus useful for statically allocated tasks for instance.
-It is also useful when the user wants to execute the same operation several
-times with as least overhead as possible.
-It is called automatically by @code{starpu_task_destroy}.
-It has to be called only after explicitly waiting for the task or after
+not the task structure itself. It is thus useful for statically allocated tasks
+for instance.  It is called automatically by @code{starpu_task_destroy}.  It
+has to be called only after explicitly waiting for the task or after
 @code{starpu_shutdown} (waiting for the callback is not enough, since StarPU
 still manipulates the task after calling the callback).
 @end deftypefun
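 
 A typical pattern with a statically allocated task is thus the following
 sketch (the codelet @code{cl} is assumed to be defined elsewhere):
 
 @cartouche
 @smallexample
 struct starpu_task task;
 starpu_task_init(&task);
 task.cl = &cl;
 task.synchronous = 1;
 starpu_task_submit(&task);
 starpu_task_deinit(&task);
 @end smallexample
 @end cartouche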
@@ -1781,10 +1582,6 @@ function for instance.
 In case of success, this function returns 0; a return value of @code{-ENODEV}
 means that there is no worker able to process this task (e.g. there is no GPU
 available and this task is only implemented for CUDA devices).
-
-starpu_task_submit() can be called from anywhere, including codelet
-functions and callbacks, provided that the @code{synchronous} field of the
-@code{starpu_task} structure is left to 0.
 @end deftypefun
 
 @deftypefun int starpu_task_wait_for_all (void)
@@ -1889,14 +1686,6 @@ This function is similar to @code{starpu_tag_wait} except that it blocks until
 terminated.
 @end deftypefun
 
-@deftypefun void starpu_tag_restart (starpu_tag_t @var{id})
-This function can be used to clear the "already notified" status
-of a tag which is not associated with a task. Before that, calling
-@code{starpu_tag_notify_from_apps} again will not notify the successors. After
-that, the next call to @code{starpu_tag_notify_from_apps} will notify the
-successors.
-@end deftypefun
-
 @deftypefun void starpu_tag_remove (starpu_tag_t @var{id})
 This function releases the resources associated to tag @var{id}. It can be
 called once the corresponding task has been executed and when there is
@@ -1908,10 +1697,7 @@ This function explicitly unlocks tag @var{id}. It may be useful in the
 case of applications which execute part of their computation outside StarPU
 tasks (e.g. third-party libraries).  It is also provided as a
 convenient tool for the programmer, for instance to entirely construct the task
-DAG before actually giving StarPU the opportunity to execute the tasks. When
-called several times on the same tag, notification will be done only on first
-call, thus implementing "OR" dependencies, until the tag is restarted using
-@code{starpu_tag_restart}.
+DAG before actually giving StarPU the opportunity to execute the tasks.
 @end deftypefun
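 
 For instance, the following sketch makes a task depend on a tag which is later
 notified by the application (the tag values are arbitrary):
 
 @cartouche
 @smallexample
 starpu_tag_declare_deps((starpu_tag_t)0x2, 1, (starpu_tag_t)0x1);
 /* ... submit a task with use_tag = 1 and tag_id = 0x2 ... */
 starpu_tag_notify_from_apps((starpu_tag_t)0x1);
 @end smallexample
 @end cartouche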
 
 @node Implicit Data Dependencies
@@ -1976,11 +1762,7 @@ The possible values are:
 @anchor{struct starpu_perfmodel}
 contains all information about a performance model. At least the
 @code{type} and @code{symbol} fields have to be filled when defining a
-performance model for a codelet. For compatibility, make sure to initialize the
-whole structure to zero, either by using explicit memset, or by letting the
-compiler implicitly do it in e.g. static storage case.
-
-If not provided, other fields have to be zero.
+performance model for a codelet. The other fields, if unused, have to be zero.
 
 @table @asis
 @item @code{type}
@@ -1995,8 +1777,7 @@ archs will be determined by multiplying by an arch-specific factor.
 
 @item @code{const char *symbol}
 is the symbol name for the performance model, which will be used as
-file name to store the model. It must be set otherwise the model will
-be ignored.
+file name to store the model.
 
 @item @code{double (*cost_model)(struct starpu_buffer_descr *)}
 This field is deprecated. Use the @code{cost_function} field instead.
@@ -2011,7 +1792,7 @@ Used by @code{STARPU_HISTORY_BASED} and
 implementation number, and returns the size to be used as index for
 history and regression.
 
-@item @code{struct starpu_perfmodel_per_arch per_arch[STARPU_NARCH_VARIATIONS][STARPU_MAXIMPLEMENTATIONS]}
+@item @code{struct starpu_per_arch_perfmodel per_arch[STARPU_NARCH_VARIATIONS][STARPU_MAXIMPLEMENTATIONS]}
 Used by @code{STARPU_PER_ARCH}: array of @code{struct
 starpu_per_arch_perfmodel} structures.
 
@@ -2028,7 +1809,7 @@ Lock to protect concurrency between loading from disk (W), updating the values
 @end table
 @end deftp
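 
 Defining a history-based performance model thus typically boils down to the
 following sketch, to be pointed to by the @code{model} field of a codelet
 (the symbol name is arbitrary):
 
 @cartouche
 @smallexample
 static struct starpu_perfmodel mult_perf_model = @{
     .type = STARPU_HISTORY_BASED,
     .symbol = "mult_perf_model"
 @};
 @end smallexample
 @end cartouche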
 
-@deftp {Data Type} {struct starpu_perfmodel_regression_model}
+@deftp {Data Type} {struct starpu_regression_model}
 @table @asis
 @item @code{double sumlny} sum of ln(measured)
 @item @code{double sumlnx} sum of ln(size)
@@ -2045,7 +1826,7 @@ Lock to protect concurrency between loading from disk (W), updating the values
 @end table
 @end deftp
 
-@deftp {Data Type} {struct starpu_perfmodel_per_arch}
+@deftp {Data Type} {struct starpu_per_arch_perfmodel}
 contains information about the performance model of a given arch.
 
 @table @asis
@@ -2066,11 +1847,11 @@ case it depends on the architecture-specific implementation.
 @item @code{struct starpu_htbl32_node *history}
 The history of performance measurements.
 
-@item @code{struct starpu_perfmodel_history_list *list}
+@item @code{struct starpu_history_list *list}
 Used by @code{STARPU_HISTORY_BASED} and @code{STARPU_NL_REGRESSION_BASED},
 records all execution history measures.
 
-@item @code{struct starpu_perfmodel_regression_model regression}
+@item @code{struct starpu_regression_model regression}
 Used by @code{STARPU_HISTORY_REGRESION_BASED} and
 @code{STARPU_NL_REGRESSION_BASED}, contains the estimated factors of the
 regression.
@@ -2078,7 +1859,7 @@ regression.
 @end table
 @end deftp
 
-@deftypefun int starpu_perfmodel_load_symbol ({const char} *@var{symbol}, {struct starpu_perfmodel} *@var{model})
+@deftypefun int starpu_load_history_debug ({const char} *@var{symbol}, {struct starpu_perfmodel} *@var{model})
 loads a given performance model. The @var{model} structure has to be completely zero, and will be filled with the information saved in @code{~/.starpu}.
 @end deftypefun
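 
 For instance (a sketch; the symbol must match the one declared in the
 codelet's performance model):
 
 @cartouche
 @smallexample
 struct starpu_perfmodel model;
 memset(&model, 0, sizeof(model));
 starpu_load_history_debug("mult_perf_model", &model);
 @end smallexample
 @end cartouche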
 
@@ -2090,42 +1871,22 @@ returns the path to the debugging information for the performance model.
 returns the architecture name for @var{arch}.
 @end deftypefun
 
+@deftypefun void starpu_force_bus_sampling (void)
+forces sampling the bus performance model again.
+@end deftypefun
+
 @deftypefun {enum starpu_perf_archtype} starpu_worker_get_perf_archtype (int @var{workerid})
 returns the architecture type of a given worker.
 @end deftypefun
 
-@deftypefun int starpu_perfmodel_list ({FILE *}@var{output})
+@deftypefun int starpu_list_models ({FILE *}@var{output})
 prints a list of all performance models on @var{output}.
 @end deftypefun
 
-@deftypefun void starpu_perfmodel_print ({struct starpu_perfmodel *}@var{model}, {enum starpu_perf_archtype} @var{arch}, unsigned @var{nimpl}, {char *}@var{parameter}, {uint32_t *}footprint, {FILE *}@var{output})
-todo
-@end deftypefun
-
-@deftypefun int starpu_perfmodel_print_all ({struct starpu_perfmodel *}@var{model}, {char *}@var{arch}, @var{char *}parameter,  {uint32_t *}@var{footprint}, {FILE *}@var{output})
-todo
-@end deftypefun
-
 @deftypefun void starpu_bus_print_bandwidth ({FILE *}@var{f})
 prints a matrix of bus bandwidths on @var{f}.
 @end deftypefun
 
-@deftypefun void starpu_bus_print_affinity ({FILE *}@var{f})
-prints the affinity devices on @var{f}.
-@end deftypefun
-
-@deftypefun void starpu_topology_print ({FILE *}@var{f})
-prints a description of the topology on @var{f}.
-@end deftypefun
-
-@deftypefun void starpu_perfmodel_update_history ({struct starpu_perfmodel *}@var{model}, {struct starpu_task *}@var{task}, {enum starpu_perf_archtype} @var{arch}, unsigned @var{cpuid}, unsigned @var{nimpl}, double @var{measured});
-This feeds the performance model @var{model} with an explicit measurement
-@var{measured}, in addition to measurements done by StarPU itself. This can be
-useful when the application already has an existing set of measurements done
-in good conditions, that StarPU could benefit from instead of doing on-line
-measurements. An example of use can be seen in @ref{Performance model example}.
-@end deftypefun
-
 @node Profiling API
 @section Profiling API
 
@@ -2345,23 +2106,6 @@ Calls starpu_cuda_report_error, passing the current function, file and line
 position.
 @end defmac
 
-@deftypefun int starpu_cuda_copy_async_sync ({void *}@var{src_ptr}, unsigned @var{src_node}, {void *}@var{dst_ptr}, unsigned @var{dst_node}, size_t @var{ssize}, cudaStream_t @var{stream}, {enum cudaMemcpyKind} @var{kind})
-Copy @var{ssize} bytes from the pointer @var{src_ptr} on
-@var{src_node} to the pointer @var{dst_ptr} on @var{dst_node}.
-The function first tries to copy the data asynchronously (unless
-@var{stream} is @code{NULL}). If the asynchronous copy fails or if
-@var{stream} is @code{NULL}, it copies the data synchronously.
-The function returns @code{-EAGAIN} if the asynchronous copy was
-successful. It returns 0 if the synchronous copy was successful, and
-fails otherwise.
-@end deftypefun
-
-@deftypefun void starpu_cuda_set_device (int @var{devid})
-Calls @code{cudaSetDevice(devid)} or @code{cudaGLSetGLDevice(devid)}, according to
-whether @code{devid} is among the @code{cuda_opengl_interoperability} field of
-the @code{starpu_conf} structure.
-@end deftypefun
-
 @deftypefun void starpu_helper_cublas_init (void)
 This function initializes CUBLAS on every CUDA device.
 The CUBLAS library must be initialized prior to any CUBLAS call. Calling
@@ -2431,23 +2175,8 @@ Return the computation kernel command queue of the current worker.
 Sets the arguments of a given kernel. The list of arguments must be given as
 (size_t @var{size_of_the_argument}, cl_mem * @var{pointer_to_the_argument}).
 The last argument must be 0. Returns the number of arguments that were
-successfully set. In case of failure, returns the id of the argument
-that could not be set and @var{err} is set to the error returned by
-OpenCL. Otherwise, returns the number of arguments that were set.
-
-@cartouche
-@smallexample
-int n;
-cl_int err;
-cl_kernel kernel;
-n = starpu_opencl_set_kernel_args(&err, 2, &kernel,
-                                  sizeof(foo), &foo,
-                                  sizeof(bar), &bar,
-                                  0);
-if (n != 2)
-   fprintf(stderr, "Error : %d\n", err);
-@end smallexample
-@end cartouche
+successfully set. In case of failure, @var{err} is set to the error returned by
+OpenCL.
 @end deftypefun
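 
 For instance (a sketch; @code{foo} and @code{bar} stand for actual kernel
 arguments):
 
 @cartouche
 @smallexample
 int n;
 cl_int err;
 cl_kernel kernel;
 n = starpu_opencl_set_kernel_args(&err, 2, &kernel,
                                   sizeof(foo), &foo,
                                   sizeof(bar), &bar,
                                   0);
 if (n != 2)
    fprintf(stderr, "Error : %d\n", err);
 @end smallexample
 @end cartouche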
 
 @node Compiling OpenCL kernels
@@ -2483,43 +2212,6 @@ This function compiles an OpenCL source code stored in a string.
 This function unloads an OpenCL compiled code.
 @end deftypefun
 
-@deftypefun void starpu_opencl_load_program_source ({const char *}@var{source_file_name}, char *@var{located_file_name}, char *@var{located_dir_name}, char *@var{opencl_program_source})
-Store the contents of the file @var{source_file_name} in the buffer
-@var{opencl_program_source}. The file @var{source_file_name} can be
-located in the current directory, or in the directory specified by the
-environment variable @code{STARPU_OPENCL_PROGRAM_DIR}, or in the
-directory @code{share/starpu/opencl} of the installation directory of
-StarPU, or in the source directory of StarPU.
-When the file is found, @code{located_file_name} is the full name of
-the file as it has been located on the system, @code{located_dir_name}
-the directory where it has been located. Otherwise, they are both set
-to the empty string.
-@end deftypefun
-
-@deftypefun int starpu_opencl_compile_opencl_from_file ({const char *}@var{source_file_name}, {const char*} @var{build_options})
-Compile the OpenCL kernel stored in the file @code{source_file_name}
-with the given options @code{build_options} and stores the result in
-the directory @code{$STARPU_HOME/.starpu/opencl} with the same
-filename as @code{source_file_name}. The compilation is done for every
-OpenCL device, and the filename is suffixed with the vendor id and the
-device id of the OpenCL device.
-@end deftypefun
-
-@deftypefun int starpu_opencl_compile_opencl_from_string ({const char *}@var{opencl_program_source}, {const char *}@var{file_name}, {const char* }@var{build_options})
-Compile the OpenCL kernel in the string @code{opencl_program_source}
-with the given options @code{build_options} and stores the result in
-the directory @code{$STARPU_HOME/.starpu/opencl} with the filename
-@code{file_name}. The compilation is done for every
-OpenCL device, and the filename is suffixed with the vendor id and the
-device id of the OpenCL device.
-@end deftypefun
-
-@deftypefun int starpu_opencl_load_binary_opencl ({const char *}@var{kernel_id}, {struct starpu_opencl_program *}@var{opencl_programs})
-Compile the binary OpenCL kernel identified with @var{id}. For every
-OpenCL device, the binary OpenCL kernel will be loaded from the file
-@code{$STARPU_HOME/.starpu/opencl/<kernel_id>.<device_type>.vendor_id_<vendor_id>_device_id_<device_id>}.
-@end deftypefun
-
 @node Loading OpenCL kernels
 @subsection Loading OpenCL kernels
 
@@ -2546,11 +2238,6 @@ collect statistics about the kernel execution (used cycles, consumed power).
 @node OpenCL utilities
 @subsection OpenCL utilities
 
-@deftypefun {const char *} starpu_opencl_error_string (cl_int @var{status})
-Return the error message in English corresponding to @var{status}, an
-OpenCL error code.
-@end deftypefun
-
 @deftypefun void starpu_opencl_display_error ({const char *}@var{func}, {const char *}@var{file}, int @var{line}, {const char *}@var{msg}, cl_int @var{status})
 Given a valid error @var{status}, prints the corresponding error message on
 stdout, along with the given function name @var{func}, the given filename

+ 11 - 6
doc/chapters/basic-examples.texi

@@ -161,9 +161,7 @@ struct starpu_codelet cl =
 
 A codelet is a structure that represents a computational kernel. Such a codelet
 may contain an implementation of the same kernel on different architectures
-(e.g. CUDA, Cell's SPU, x86, ...). For compatibility, make sure that the whole
-structure is initialized to zero, either by using memset, or by letting the
-compiler implicitly do it as examplified above.
+(e.g. CUDA, Cell's SPU, x86, ...).
 
 The @code{nbuffers} field specifies the number of data buffers that are
 manipulated by the codelet: here the codelet does not access or modify any data
@@ -358,8 +356,9 @@ main (void)
 #define FACTOR 3.14
 
   @{
-    float vector[NX]
-       __attribute__ ((heap_allocated, registered));
+    float vector[NX] __attribute__ ((heap_allocated));
+
+#pragma starpu register vector
 
     size_t i;
     for (i = 0; i < NX; i++)
@@ -437,6 +436,9 @@ in our C file like this:
 
 @cartouche
 @smallexample
+/* Include StarPU's OpenCL integration.  */
+#include <starpu_opencl.h>
+
 /* The OpenCL programs, loaded from `main' (see below).  */
 static struct starpu_opencl_program cl_programs;
 
@@ -532,6 +534,7 @@ the CUDA Kernel}).
    with `nvcc'.  */
 
 #include <starpu.h>
+#include <starpu_cuda.h>
 #include <stdlib.h>
 
 static __global__ void
@@ -719,6 +722,7 @@ call.
 @cartouche
 @smallexample
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 static __global__ void vector_mult_cuda(float *val, unsigned n,
                                         float factor)
@@ -772,6 +776,7 @@ important when using partitioning, see @ref{Partitioning Data}.
 @cartouche
 @smallexample
 #include <starpu.h>
+@i{#include <starpu_opencl.h>}
 
 @i{extern struct starpu_opencl_program programs;}
 
@@ -965,7 +970,7 @@ and to execute it, with the default configuration:
 or for example, by disabling CPU devices:
 
 @smallexample
-% STARPU_NCPU=0 ./vector_scal
+% STARPU_NCPUS=0 ./vector_scal
 0.000000 3.000000 6.000000 9.000000 12.000000
 @end smallexample
 

+ 8 - 14
doc/chapters/c-extensions.texi

@@ -295,7 +295,6 @@ The following pragmas are provided:
 @item #pragma starpu register @var{ptr} [@var{size}]
 Register @var{ptr} as a @var{size}-element buffer.  When @var{ptr} has
 an array type whose size is known, @var{size} may be omitted.
-Alternatively, the @code{registered} attribute can be used (see below.)
 
 @item #pragma starpu unregister @var{ptr}
 Unregister the previously-registered memory area pointed to by
@@ -312,25 +311,17 @@ making it available to the tasks.
 
 @end table
 
-Additionally, the following attributes offer a simple way to allocate
-and register storage for arrays:
+Additionally, the @code{heap_allocated} variable attribute offers a
+simple way to allocate storage for arrays on the heap:
 
 @table @code
 
-@item registered
-@cindex @code{registered} attribute
-This attributes applies to local variables with an array type.  Its
-effect is to automatically register the array's storage, as per
-@code{#pragma starpu register}.  The array is automatically unregistered
-when the variable's scope is left.  This attribute is typically used in
-conjunction with the @code{heap_allocated} attribute, described below.
-
 @item heap_allocated
 @cindex @code{heap_allocated} attribute
 This attribute applies to local variables with an array type.  Its
 effect is to automatically allocate the array's storage on
 the heap, using @code{starpu_malloc} under the hood (@pxref{Basic Data
-Management API, starpu_malloc}).  The heap-allocated array is automatically
+Library API, starpu_malloc}).  The heap-allocated array is automatically
 freed when the variable's scope is left, as with
 automatic variables.
 
@@ -360,13 +351,16 @@ main (int argc, char *argv[])
 
   @{
     float matrix[nblocks][nblocks][size]
-      __attribute__ ((heap_allocated, registered));
+      __attribute__ ((heap_allocated));
+
+#pragma starpu register matrix
 
     cholesky (nblocks, size, matrix);
 
 #pragma starpu wait
+#pragma starpu unregister matrix
 
-  @}   /* MATRIX is automatically unregistered & freed here.  */
+  @}   /* MATRIX is automatically freed here.  */
 
 #pragma starpu shutdown
 

+ 98 - 111
doc/chapters/configuration.texi

@@ -19,7 +19,6 @@ The following arguments can be given to the @code{configure} script.
 @menu
 * Common configuration::        
 * Configuring workers::         
-* Extension configuration::     
 * Advanced configuration::      
 @end menu
 
 Allow for at most @var{count} scheduling contexts.
 This information is then available as the
 @code{STARPU_NMAX_SCHED_CTXS} macro.
 
-@item --disable-asynchronous-copy
-Disable asynchronous copies between CPU and GPU devices.
-The AMD implementation of OpenCL is known to
-fail when copying data asynchronously. When using this implementation,
-it is therefore necessary to disable asynchronous data transfers.
-
-@item --disable-asynchronous-cuda-copy
-Disable asynchronous copies between CPU and CUDA devices.
-
-@item --disable-asynchronous-opencl-copy
-Disable asynchronous copies between CPU and OpenCL devices.
-The AMD implementation of OpenCL is known to
-fail when copying data asynchronously. When using this implementation,
-it is therefore necessary to disable asynchronous data transfers.
-@end table
-
-@node Extension configuration
-@subsection Extension configuration
-
-@table @code
-
-@item --disable-socl
-Disable the SOCL extension (@pxref{SOCL OpenCL Extensions}).  By
-default, it is enabled when an OpenCL implementation is found.
-
-@item --disable-starpu-top
-Disable the StarPU-Top interface (@pxref{StarPU-Top}).  By default, it
-is enabled when the required dependencies are found.
-
-@item --disable-gcc-extensions
-Disable the GCC plug-in (@pxref{C Extensions}).  By default, it is
-enabled when the GCC compiler provides a plug-in support.
-
-@item --with-mpicc=@var{path}
-Use the @command{mpicc} compiler at @var{path}, for starpumpi
-(@pxref{StarPU MPI support}).
-
-@item --enable-comm-stats
-Enable communication statistics for starpumpi (@pxref{StarPU MPI
-support}).
-
 @end table
 
 @node Advanced configuration
@@ -215,6 +173,10 @@ notably contain @code{include/fxt/fxt.h}.
 Store performance models under @var{dir}, instead of the current user's
 home.
 
+@item --with-mpicc=@var{path}
+Use the @command{mpicc} compiler at @var{path}, for starpumpi
+(@pxref{StarPU MPI support}).
+
 @item --with-goto-dir=@var{prefix}
 Search for GotoBLAS under @var{prefix}.
 
@@ -231,14 +193,23 @@ that the
 @url{http://software.intel.com/en-us/articles/intel-mkl-link-line-advisor/,
 MKL website} provides a script to determine the linking flags.
 
+@item --disable-gcc-extensions
+Disable the GCC plug-in (@pxref{C Extensions}).  By default, it is
+enabled when the GCC compiler provides a plug-in support.
+
+@item --disable-socl
+Disable the SOCL extension (@pxref{SOCL OpenCL Extensions}).  By
+default, it is enabled when an OpenCL implementation is found.
+
+@item --disable-starpu-top
+Disable the StarPU-Top interface (@pxref{StarPU-Top}).  By default, it
+is enabled when the required dependencies are found.
+
 @item --enable-sched-ctx-hypervisor
 Enables the Scheduling Context Hypervisor plugin (@pxref{Scheduling Context Hypervisor}).
 By default, it is disabled.
 
-@item --disable-build-examples
-Disable the build of examples.
 @end table
-
 @node Execution configuration through environment variables
 @section Execution configuration through environment variables
 
@@ -248,34 +219,59 @@ Disable the build of examples.
 * Misc::                        Miscellaneous and debug
 @end menu
 
+Note: the values given in the @code{starpu_conf} structure passed when
+calling @code{starpu_init} will override the values of the environment
+variables.
+
 @node Workers
 @subsection Configuring workers
 
-@table @code
+@menu
+* STARPU_NCPUS::                Number of CPU workers
+* STARPU_NCUDA::                Number of CUDA workers
+* STARPU_NOPENCL::              Number of OpenCL workers
+* STARPU_NGORDON::              Number of SPU workers (Cell)
+* STARPU_WORKERS_NOBIND::       Do not bind workers
+* STARPU_WORKERS_CPUID::        Bind workers to specific CPUs
+* STARPU_WORKERS_CUDAID::       Select specific CUDA devices
+* STARPU_WORKERS_OPENCLID::     Select specific OpenCL devices
+@end menu
+
+@node STARPU_NCPUS
+@subsubsection @code{STARPU_NCPUS} -- Number of CPU workers
 
-@item @code{STARPU_NCPU}
 Specify the number of CPU workers (thus not including workers dedicated to control accelerators). Note that by default, StarPU will not allocate
 more CPU workers than there are physical CPUs, and that some CPUs are used to control
 the accelerators.
 
-@item @code{STARPU_NCUDA}
+@node STARPU_NCUDA
+@subsubsection @code{STARPU_NCUDA} -- Number of CUDA workers
+
 Specify the number of CUDA devices that StarPU can use. If
 @code{STARPU_NCUDA} is lower than the number of physical devices, it is
 possible to select which CUDA devices should be used by the means of the
 @code{STARPU_WORKERS_CUDAID} environment variable. By default, StarPU will
 create as many CUDA workers as there are CUDA devices.
 
-@item @code{STARPU_NOPENCL}
+@node STARPU_NOPENCL
+@subsubsection @code{STARPU_NOPENCL} -- Number of OpenCL workers
+
 OpenCL equivalent of the @code{STARPU_NCUDA} environment variable.
 
-@item @code{STARPU_NGORDON}
+@node STARPU_NGORDON
+@subsubsection @code{STARPU_NGORDON} -- Number of SPU workers (Cell)
+
 Specify the number of SPUs that StarPU can use.
 
-@item @code{STARPU_WORKERS_NOBIND}
+@node STARPU_WORKERS_NOBIND
+@subsubsection @code{STARPU_WORKERS_NOBIND} -- Do not bind workers to specific CPUs
+
 Setting it to non-zero will prevent StarPU from binding its threads to
 CPUs. This is for instance useful when running the testsuite in parallel.
 
-@item @code{STARPU_WORKERS_CPUID}
+@node STARPU_WORKERS_CPUID
+@subsubsection @code{STARPU_WORKERS_CPUID} -- Bind workers to specific CPUs
+
 Passing an array of integers (starting from 0) in @code{STARPU_WORKERS_CPUID}
 specifies on which logical CPU the different workers should be
 bound. For instance, if @code{STARPU_WORKERS_CPUID = "0 1 4 5"}, the first
@@ -286,7 +282,7 @@ available.
 
 Note that the first workers correspond to the CUDA workers, then come the
 OpenCL and the SPU, and finally the CPU workers. For example if
-we have @code{STARPU_NCUDA=1}, @code{STARPU_NOPENCL=1}, @code{STARPU_NCPU=2}
+we have @code{STARPU_NCUDA=1}, @code{STARPU_NOPENCL=1}, @code{STARPU_NCPUS=2}
 and @code{STARPU_WORKERS_CPUID = "0 2 1 3"}, the CUDA device will be controlled
 by logical CPU #0, the OpenCL device will be controlled by logical CPU #2, and
 the logical CPUs #1 and #3 will be used by the CPU workers.
@@ -299,7 +295,9 @@ third (resp. second and fourth) workers will be put on CPU #0 (resp. CPU #1).
 This variable is ignored if the @code{use_explicit_workers_bindid} flag of the
 @code{starpu_conf} structure passed to @code{starpu_init} is set.
 
-@item @code{STARPU_WORKERS_CUDAID}
+@node STARPU_WORKERS_CUDAID
+@subsubsection @code{STARPU_WORKERS_CUDAID} -- Select specific CUDA devices
+
 Similarly to the @code{STARPU_WORKERS_CPUID} environment variable, it is
 possible to select which CUDA devices should be used by StarPU. On a machine
 equipped with 4 GPUs, setting @code{STARPU_WORKERS_CUDAID = "1 3"} and
@@ -310,61 +308,36 @@ the one reported by CUDA).
 This variable is ignored if the @code{use_explicit_workers_cuda_gpuid} flag of
 the @code{starpu_conf} structure passed to @code{starpu_init} is set.
 
-@item @code{STARPU_WORKERS_OPENCLID}
+@node STARPU_WORKERS_OPENCLID
+@subsubsection @code{STARPU_WORKERS_OPENCLID} -- Select specific OpenCL devices
+
 OpenCL equivalent of the @code{STARPU_WORKERS_CUDAID} environment variable.
 
 This variable is ignored if the @code{use_explicit_workers_opencl_gpuid} flag of
 the @code{starpu_conf} structure passed to @code{starpu_init} is set.
 
-@item @code{STARPU_SINGLE_COMBINED_WORKER}
-If set, StarPU will create several workers which won't be able to work
-concurrently. It will create combined workers which size goes from 1 to the
-total number of CPU workers in the system.
-
-@item @code{SYNTHESIZE_ARITY_COMBINED_WORKER}
-
-@item @code{STARPU_MIN_WORKERSIZE}
-Let the user give a hint to StarPU about how many workers
-(minimum boundary) the combined workers should contain.
-
-@item @code{STARPU_MAX_WORKERSIZE}
-Let the user give a hint to StarPU about how many workers
-(maximum boundary) the combined workers should contain.
-
-@item @code{STARPU_DISABLE_ASYNCHRONOUS_COPY}
-Disable asynchronous copies between CPU and GPU devices.
-The AMD implementation of OpenCL is known to
-fail when copying data asynchronously. When using this implementation,
-it is therefore necessary to disable asynchronous data transfers.
-
-@item @code{STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY}
-Disable asynchronous copies between CPU and CUDA devices.
-
-@item @code{STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY}
-Disable asynchronous copies between CPU and OpenCL devices.
-The AMD implementation of OpenCL is known to
-fail when copying data asynchronously. When using this implementation,
-it is therefore necessary to disable asynchronous data transfers.
-
-@item @code{STARPU_DISABLE_CUDA_GPU_GPU_DIRECT}
-Disable direct CUDA transfers from GPU to GPU, and let CUDA copy through RAM
-instead. This permits to test the performance effect of GPU-Direct.
-
-@end table
-
->>>>>>> .merge-right.r7182
 @node Scheduling
 @subsection Configuring the Scheduling engine
 
-@table @code
+@menu
+* STARPU_SCHED::                Scheduling policy
+* STARPU_CALIBRATE::            Calibrate performance models
+* STARPU_PREFETCH::             Use data prefetch
+* STARPU_SCHED_ALPHA::          Computation factor
+* STARPU_SCHED_BETA::           Communication factor
+@end menu
+
+@node STARPU_SCHED
+@subsubsection @code{STARPU_SCHED} -- Scheduling policy
 
-@item @code{STARPU_SCHED}
 Choose between the different scheduling policies proposed by StarPU: random,
 work stealing, greedy, with performance models, etc.
 
 Use @code{STARPU_SCHED=help} to get the list of available schedulers.
 
-@item @code{STARPU_CALIBRATE}
+@node STARPU_CALIBRATE
+@subsubsection @code{STARPU_CALIBRATE} -- Calibrate performance models
+
 If this variable is set to 1, the performance models are calibrated during
 the execution. If it is set to 2, the previous values are dropped to restart
 calibration from scratch. Setting this variable to 0 disables calibration; this
@@ -372,11 +345,9 @@ is the default behaviour.
 
 Note: this currently only applies to @code{dm}, @code{dmda} and @code{heft} scheduling policies.
 
-@item @code{STARPU_BUS_CALIBRATE}
-If this variable is set to 1, the bus is recalibrated during intialization.
+@node STARPU_PREFETCH
+@subsubsection @code{STARPU_PREFETCH} -- Use data prefetch
 
-@item @code{STARPU_PREFETCH}
-@anchor{STARPU_PREFETCH}
 This variable indicates whether data prefetching should be enabled (0 means
 that it is disabled). If prefetching is enabled, when a task is scheduled to be
 executed e.g. on a GPU, StarPU will request an asynchronous transfer in
@@ -384,42 +355,58 @@ advance, so that data is already present on the GPU when the task starts. As a
 result, computation and data transfers are overlapped.
 Note that prefetching is enabled by default in StarPU.
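 
 For instance, the effect of prefetching can be measured by disabling it
 explicitly (reusing the @code{vector_scal} example program):
 
 @smallexample
 % STARPU_PREFETCH=0 ./vector_scal
 @end smallexample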
 
-@item @code{STARPU_SCHED_ALPHA}
+@node STARPU_SCHED_ALPHA
+@subsubsection @code{STARPU_SCHED_ALPHA} -- Computation factor
+
 To estimate the cost of a task, StarPU takes into account the estimated
 computation time (obtained thanks to performance models). The alpha factor is
 the coefficient to be applied to it before adding it to the communication part.
 
-@item @code{STARPU_SCHED_BETA}
+@node STARPU_SCHED_BETA
+@subsubsection @code{STARPU_SCHED_BETA} -- Communication factor
+
 To estimate the cost of a task, StarPU takes into account the estimated
 data transfer time (obtained thanks to performance models). The beta factor is
 the coefficient to be applied to it before adding it to the computation part.
 
-@end table
-
 @node Misc
 @subsection Miscellaneous and debug
 
-@table @code
+@menu
+* STARPU_SILENT::               Disable verbose mode
+* STARPU_LOGFILENAME::          Select debug file name
+* STARPU_FXT_PREFIX::           FxT trace location
+* STARPU_LIMIT_GPU_MEM::        Restrict memory size on the GPUs
+* STARPU_GENERATE_TRACE::       Generate a Paje trace when StarPU is shut down
+@end menu
+
+@node STARPU_SILENT
+@subsubsection @code{STARPU_SILENT} -- Disable verbose mode
 
-@item @code{STARPU_SILENT}
 This variable allows disabling verbose mode at runtime when StarPU
 has been configured with the option @code{--enable-verbose}.
 
-@item @code{STARPU_LOGFILENAME}
+@node STARPU_LOGFILENAME
+@subsubsection @code{STARPU_LOGFILENAME} -- Select debug file name
+
 This variable specifies the file in which the debugging output should be saved.
 
-@item @code{STARPU_FXT_PREFIX}
+@node STARPU_FXT_PREFIX
+@subsubsection @code{STARPU_FXT_PREFIX} -- FxT trace location
+
 This variable specifies in which directory to save the trace generated if FxT is enabled. It needs to have a trailing '/' character.
 
-@item @code{STARPU_LIMIT_GPU_MEM}
+@node STARPU_LIMIT_GPU_MEM
+@subsubsection @code{STARPU_LIMIT_GPU_MEM} -- Restrict memory size on the GPUs
+
 This variable specifies the maximum number of megabytes that should be
 available to the application on each GPU. In case this value is smaller than
 the size of the memory of a GPU, StarPU pre-allocates a buffer to waste memory
 on the device. This variable is intended to be used for experimental purposes
 as it emulates devices that have a limited amount of memory.
 
-@item @code{STARPU_GENERATE_TRACE}
+@node STARPU_GENERATE_TRACE
+@subsubsection @code{STARPU_GENERATE_TRACE} -- Generate a Paje trace when StarPU is shut down
+
 When set to 1, this variable indicates that StarPU should automatically
 generate a Paje trace when starpu_shutdown is called.
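 
 For instance (this requires StarPU to have been configured with FxT support):
 
 @smallexample
 % STARPU_GENERATE_TRACE=1 ./vector_scal
 @end smallexample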
-
-@end table

+ 16 - 83
doc/chapters/mpi-support.texi

@@ -20,11 +20,10 @@ distributed application, by automatically issuing all required data transfers
 according to the task graph and an application-provided distribution.
 
 @menu
-* The API::                     
-* Simple Example::              
-* Exchanging User Defined Data Interface::  
-* MPI Insert Task Utility::     
-* MPI Collective Operations::   
+* The API::
+* Simple Example::
+* MPI Insert Task Utility::
+* MPI Collective Operations::
 @end menu
 
 @node The API
@@ -141,20 +140,17 @@ communicator @var{comm}. On completion, @var{tag} is unlocked.
 @end deftypefun
 
 @deftypefun int starpu_mpi_isend_array_detached_unlock_tag (unsigned @var{array_size}, starpu_data_handle_t *@var{data_handle}, int *@var{dest}, int *@var{mpi_tag}, MPI_Comm *@var{comm}, starpu_tag_t @var{tag})
-Posts @var{array_size} standard-mode, non blocking send. Each post
-sends the n-th data of the array @var{data_handle} to the n-th node of
-the array @var{dest}
-using the n-th message tag of the array @code{mpi_tag} within the n-th
-communicator of the array
-@var{comm}. On completion of the all the requests, @var{tag} is unlocked.
+Posts @var{array_size} standard-mode, non-blocking sends of the data
+@var{data_handle[x]} to the node @var{dest[x]} using the message
+tag @code{mpi_tag[x]} within the communicator @var{comm[x]}. On
+completion of all the requests, @var{tag} is unlocked.
 @end deftypefun
 
 @deftypefun int starpu_mpi_irecv_array_detached_unlock_tag (unsigned @var{array_size}, starpu_data_handle_t *@var{data_handle}, int *@var{source}, int *@var{mpi_tag}, MPI_Comm *@var{comm}, starpu_tag_t @var{tag})
-Posts @var{array_size} nonblocking receive. Each post receives in the
-n-th data of the array @var{data_handle} from the n-th
-node of the array @var{source} using the n-th message tag of the array
-@code{mpi_tag} within the n-th communicator of the array @var{comm}.
-On completion of the all the requests, @var{tag} is unlocked.
+Posts @var{array_size} non-blocking receives in @var{data_handle[x]} from the
+node @var{source[x]} using the message tag @code{mpi_tag[x]} within the
+communicator @var{comm[x]}. On completion of all the requests,
+@var{tag} is unlocked.
 @end deftypefun
 
 @page
@@ -243,79 +239,16 @@ int main(int argc, char **argv)
 @end cartouche
 
 @page
-@node Exchanging User Defined Data Interface
-@section Exchanging User Defined Data Interface
-
-New data interfaces defined as explained in @ref{An example
-of data interface} can also be used within StarPU-MPI and exchanged
-between nodes. Two functions needs to be defined through
-the type @code{struct starpu_data_interface_ops} (@pxref{Data
-Interface API}). The pack function takes a handle and returns a
-contiguous memory buffer where data to be conveyed to another node
-should be copied. The reversed operation is implemented in the unpack
-function which takes a contiguous memory buffer and recreates the data
-handle.
-
-@cartouche
-@smallexample
-static int complex_pack_data(starpu_data_handle_t handle, uint32_t node, void **ptr)
-@{
-  STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
-
-  struct starpu_complex_interface *complex_interface =
-    (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, node);
-
-  *ptr = malloc(complex_get_size(handle));
-  memcpy(*ptr, complex_interface->real, complex_interface->nx*sizeof(double));
-  memcpy(*ptr+complex_interface->nx*sizeof(double), complex_interface->imaginary,
-         complex_interface->nx*sizeof(double));
-
-  return 0;
-@}
-@end smallexample
-@end cartouche
-
-@cartouche
-@smallexample
-static int complex_unpack_data(starpu_data_handle_t handle, uint32_t node, void *ptr)
-@{
-  STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
-
-  struct starpu_complex_interface *complex_interface =
-    (struct starpu_complex_interface *)	starpu_data_get_interface_on_node(handle, node);
-
-  memcpy(complex_interface->real, ptr, complex_interface->nx*sizeof(double));
-  memcpy(complex_interface->imaginary, ptr+complex_interface->nx*sizeof(double),
-         complex_interface->nx*sizeof(double));
-
-  return 0;
-@}
-@end smallexample
-@end cartouche
-
-@cartouche
-@smallexample
-static struct starpu_data_interface_ops interface_complex_ops =
-@{
-  ...
-  .pack_data = complex_pack_data,
-  .unpack_data = complex_unpack_data
-@};
-@end smallexample
-@end cartouche
-
-@page
 @node MPI Insert Task Utility
 @section MPI Insert Task Utility
 
 To save the programmer from having to make all communications explicit, StarPU
 provides an "MPI Insert Task Utility". The principle is that the application
 decides a distribution of the data over the MPI nodes by allocating it and
-notifying StarPU of that decision, i.e. tell StarPU which MPI node "owns"
-which data. It also decides, for each handle, an MPI tag which will be used to
-exchange the content of the handle. All MPI nodes then process the whole task
-graph, and StarPU automatically determines which node actually execute which
-task, and trigger the required MPI transfers.
+notifying StarPU of that decision, i.e. telling StarPU which MPI node "owns" which
+data. All MPI nodes then process the whole task graph, and StarPU automatically
+determines which node actually executes which task, as well as the required MPI
+transfers.
 
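+As a minimal sketch (assuming the @code{starpu_data_set_rank} and
+@code{starpu_mpi_insert_task} helpers of this release, with a hypothetical
+codelet @code{cl}), every node declares the same data distribution and then
+submits the same tasks:
+
+@cartouche
+@smallexample
+/* declare the owner and the MPI tag of the handle, on every node */
+starpu_data_set_rank(handle, owner_rank);
+starpu_data_set_tag(handle, 42);
+
+/* submit the task on every node; StarPU runs it on the relevant one */
+starpu_mpi_insert_task(MPI_COMM_WORLD, &cl, STARPU_RW, handle, 0);
+@end smallexample
+@end cartouche
+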
 @deftypefun int starpu_data_set_tag (starpu_data_handle_t @var{handle}, int @var{tag})
 Tell StarPU-MPI which MPI tag to use when exchanging the data.

+ 3 - 3
doc/chapters/perf-feedback.texi

@@ -64,7 +64,7 @@ function.
 It is worth noting that the application may directly access this structure from
 the callback executed at the end of the task. The @code{starpu_task} structure
 associated with the callback currently being executed is indeed accessible with
-the @code{starpu_task_get_current()} function.
+the @code{starpu_get_current_task()} function.
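+
+For instance, a brief sketch (the field name @code{profiling_info} is assumed
+from this release's @code{starpu_task} structure):
+
+@cartouche
+@smallexample
+void my_callback(void *arg)
+@{
+  struct starpu_task *task = starpu_get_current_task();
+  struct starpu_task_profiling_info *info = task->profiling_info;
+  /* e.g. compare info->start_time and info->end_time here */
+@}
+@end smallexample
+@end cartouche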
 
 @node Codelet feedback
 @subsection Per-codelet feedback
@@ -383,7 +383,7 @@ performance models. It also writes a @code{.gp} file in the current directory,
 to be run in the @code{gnuplot} tool, which shows the corresponding curve.
 
 The same can also be achieved by using StarPU's library API, see
-@ref{Performance Model API} and notably the @code{starpu_perfmodel_load_symbol}
+@ref{Performance Model API} and notably the @code{starpu_load_history_debug}
 function. The source code of the @code{starpu_perfmodel_display} tool can be a
 useful example.
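+
+A hedged sketch (assuming @code{starpu_load_history_debug} takes the model
+symbol and a @code{struct starpu_perfmodel} to fill, as the source of
+@code{starpu_perfmodel_display} suggests):
+
+@cartouche
+@smallexample
+struct starpu_perfmodel model;
+int ret = starpu_load_history_debug("my_perfmodel", &model);
+if (ret == 0)
+  @{ /* the per-architecture measurement history is now loaded */ @}
+@end smallexample
+@end cartouche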
 
@@ -415,7 +415,7 @@ Print the DAG that was recorded
 @end deftypefun
 
 @deftypefun void starpu_bound_compute ({double *}@var{res}, {double *}@var{integer_res}, int @var{integer})
-Get theoretical upper bound (in ms) (needs glpk support detected by @code{configure} script). It returns 0 if some performance models are not calibrated.
+Get the theoretical upper bound (in ms); this needs glpk support, detected by the @code{configure} script.
 @end deftypefun
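+
+A minimal usage sketch (assuming the companion @code{starpu_bound_start} and
+@code{starpu_bound_stop} calls of this API):
+
+@cartouche
+@smallexample
+starpu_bound_start(0, 0);  /* start recording, without deps nor prio */
+/* ... submit tasks and wait for their completion ... */
+starpu_bound_stop();
+
+double res, integer_res;
+starpu_bound_compute(&res, &integer_res, 1);
+@end smallexample
+@end cartouche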
 
 @deftypefun void starpu_bound_print_lp ({FILE *}@var{output})

+ 3 - 90
doc/chapters/perf-optimization.texi

@@ -80,42 +80,6 @@ In the same vein, accumulation of results in the same data can become a
 bottleneck. The use of the @code{STARPU_REDUX} mode makes it possible to optimize such
 accumulation (@pxref{Data reduction}).
 
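+As a brief hedged sketch (with hypothetical @code{redux_cl} and
+@code{init_cl} codelets), a handle is made reducible and can then be accessed
+in @code{STARPU_REDUX} mode by concurrent tasks:
+
+@cartouche
+@smallexample
+starpu_data_set_reduction_methods(handle, &redux_cl, &init_cl);
+@end smallexample
+@end cartouche
+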
-Applications often need a data just for temporary results.  In such a case,
-registration can be made without an initial value, for instance this produces a vector data:
-
-@cartouche
-@smallexample
-starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
-@end smallexample
-@end cartouche
-
-StarPU will then allocate the actual buffer only when it is actually needed,
-e.g. directly on the GPU without allocating in main memory.
-
-In the same vein, once the temporary results are not useful any more, the
-data should be thrown away. If the handle is not to be reused, it can be
-unregistered:
-
-@cartouche
-@smallexample
-starpu_unregister_submit(handle);
-@end smallexample
-@end cartouche
-
-actual unregistration will be done after all tasks working on the handle
-terminate.
-
-If the handle is to be reused, instead of unregistering it, it can simply be invalidated:
-
-@cartouche
-@smallexample
-starpu_invalidate_submit(handle);
-@end smallexample
-@end cartouche
-
-the buffers containing the current value will then be freed, and reallocated
-only when another task writes some value to the handle.
-
 @node Task granularity
 @section Task granularity
 
@@ -196,11 +160,11 @@ supports parallel tasks (still experimental).
 
 @node Task scheduling contexts
 @section Task scheduling contexts
-Task scheduling contexts represent abstracts sets of workers that allow programmers to control the distribution of computational resources (i.e. CPUs and
+Task scheduling contexts represent abstract sets of workers that allow programmers to control the distribution of computational resources (i.e. CPUs and
 GPUs) to concurrent parallel kernels. The main goal is to minimize interference between the execution of multiple parallel kernels, by partitioning the underlying pool of workers using contexts.
 
-By default, the application submits tasks to an initial context, which uses the computation ressources available to StarPU (all the workers). 
-If the application programmer plans to launch several parallel kernels simultaneously, by default these kernels will be executed within this initial context, using a single scheduler policy(@pxref{Task scheduling policy}).
+By default, the application submits tasks to an initial context, which disposes of all the computation resources available to StarPU (all the workers).
+If the application programmer plans to launch several parallel kernels simultaneously, by default these kernels will be executed within this initial context, using a single scheduler policy (@pxref{Task scheduling policy}).
 Meanwhile, if the application programmer is aware of the demands of these kernels and of the specificity of the machine used to execute them, the workers can be divided between several contexts. 
 These scheduling contexts will isolate the execution of each kernel and permit the use of a scheduling policy specific to each of them.
 In order to create the contexts, you have to know the identifiers of the workers running within StarPU.
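+
+A hedged sketch (the creation call is assumed to be
+@code{starpu_create_sched_ctx}, as in this release):
+
+@cartouche
+@smallexample
+int workers[2] = @{0, 1@};  /* identifiers of two workers */
+unsigned ctx = starpu_create_sched_ctx("dmda", workers, 2, "my_ctx");
+/* tasks submitted to ctx will only run on these two workers */
+@end smallexample
+@end cartouche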
@@ -280,46 +244,6 @@ disables data transfer / computation overlapping, and should thus not be used
 for benchmarks. Note 2: history-based performance models get calibrated
 only if a performance-model-based scheduler is chosen.
 
-The history-based performance models can also be explicitly filled by the
-application without execution, if e.g. the application already has a series of
-measurements. This can be done by using @code{starpu_perfmodel_update_history},
-for instance:
-
-@example
-static struct starpu_perfmodel perf_model = @{
-    .type = STARPU_HISTORY_BASED,
-    .symbol = "my_perfmodel",
-@};
-
-struct starpu_codelet cl = @{
-    .where = STARPU_CUDA,
-    .cuda_funcs = @{ cuda_func1, cuda_func2, NULL @},
-    .nbuffers = 1,
-    .modes = @{STARPU_W@},
-    .model = &perf_model
-@};
-
-void feed(void) @{
-    struct my_measure *measure;
-    struct starpu_task task;
-    starpu_task_init(&task);
-
-    task.cl = &cl;
-
-    for (measure = &measures[0]; measure < measures[last]; measure++) @{
-        starpu_data_handle_t handle;
-	starpu_vector_data_register(&handle, -1, 0, measure->size, sizeof(float));
-	task.handles[0] = handle;
-	starpu_perfmodel_update_history(&perf_model, &task, STARPU_CUDA_DEFAULT + measure->cudadev, 0, measure->implementation, measure->time);
-	starpu_task_clean(&task);
-	starpu_data_unregister(handle);
-    @}
-@}
-@end example
-
-Measurement has to be provided in milliseconds for the completion time models,
-and in Joules for the energy consumption models.
-
 @node Task distribution vs Data transfer
 @section Task distribution vs Data transfer
 
@@ -378,17 +302,6 @@ be obtained from the machine power supplier.
 The power actually consumed by the total execution can be displayed by setting
 @code{export STARPU_PROFILING=1 STARPU_WORKER_STATS=1}.
 
-On-line task consumption measurement is currently only supported through the
-@code{CL_PROFILING_POWER_CONSUMED} OpenCL extension, implemented in the MoviSim
-simulator. Applications can however provide explicit measurements by using the
-@code{starpu_perfmodel_update_history} function (examplified in @ref{Performance
-model example} with the @code{power_model} performance model. Fine-grain
-measurement is often not feasible with the feedback provided by the hardware, so
-the user can for instance run a given task a thousand times, measure the global
-consumption for that series of tasks, divide it by a thousand, repeat for
-varying kinds of tasks and task sizes, and eventually feed StarPU
-with these manual measurements through @code{starpu_perfmodel_update_history}.
-
 @node Profiling
 @section Profiling
 

+ 5 - 5
doc/chapters/sched_ctx_hypervisor.texi

@@ -66,7 +66,7 @@ Allow resizing of a context
 The user can then provide information to the hypervisor concerning the conditions of resizing.
 
 @deftypefun void sched_ctx_hypervisor_ioctl (unsigned @var{sched_ctx}, ...)
-Inputs conditions to the context @var{sched_ctx} with the following arguments.  The argument list must be zero-terminated.
+Provides resizing conditions for the context @code{sched_ctx} through the following arguments.  The argument list must be zero-terminated.
 
 @defmac HYPERVISOR_MAX_IDLE
 This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 3 arguments: 
@@ -257,11 +257,11 @@ Indicates the name of the policy, if there is not a custom policy, the policy co
 @item @code{unsigned custom}
 Indicates whether the policy is custom or not
 @item @code{void (*handle_idle_cycle)(unsigned sched_ctx, int worker)}
-It is called whenever the indicated worker executes another idle cycle in @var{sched_ctx}
+It is called whenever the indicated worker executes another idle cycle in @code{sched_ctx}
 @item @code{void (*handle_pushed_task)(unsigned sched_ctx, int worker)}
-It is called whenever a task is pushed on the worker's queue corresponding to the context @var{sched_ctx}
+It is called whenever a task is pushed on the worker's queue corresponding to the context @code{sched_ctx}
 @item @code{void (*handle_poped_task)(unsigned sched_ctx, int worker)}
-It is called whenever a task is poped from the worker's queue corresponding to the context @var{sched_ctx}
+It is called whenever a task is popped from the worker's queue corresponding to the context @code{sched_ctx}
 @item @code{void (*handle_idle_end)(unsigned sched_ctx, int worker)}
 It is called whenever a task is executed on the indicated worker and context after a long period of idle time
 @item @code{void (*handle_post_exec_hook)(unsigned sched_ctx, struct starpu_htbl32_node* resize_requests, int task_tag)}
@@ -358,7 +358,7 @@ Gets the number of contexts managed by the hypervisor
 @end deftypefun
 
 @deftypefun {struct sched_ctx_wrapper*} sched_ctx_hypervisor_get_wrapper (unsigned @var{sched_ctx});
-Returns the wrapper corresponding the context @var{sched_ctx}
+Returns the wrapper corresponding to the context @code{sched_ctx}
 @end deftypefun
 
 @deftypefun double sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx ({struct sched_ctx_wrapper*} @var{sc_w});

+ 1 - 0
doc/chapters/vector_scal_c.texi

@@ -14,6 +14,7 @@
  *  3- how a kernel can manipulate the data (buffers[0].vector.ptr)
  */
 #include <starpu.h>
+#include <starpu_opencl.h>
 
 #define    NX    2048
 

+ 2 - 1
doc/chapters/vector_scal_cuda.texi

@@ -2,11 +2,12 @@
 
 @c This file is part of the StarPU Handbook.
 @c Copyright (C) 2009-2011  Université de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 @c See the file starpu.texi for copying conditions.
 
 @smallexample
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 static __global__ void vector_mult_cuda(float *val, unsigned n,
                                         float factor)

+ 2 - 1
doc/chapters/vector_scal_opencl.texi

@@ -2,11 +2,12 @@
 
 @c This file is part of the StarPU Handbook.
 @c Copyright (C) 2009-2011  Université de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 @c See the file starpu.texi for copying conditions.
 
 @smallexample
 #include <starpu.h>
+#include <starpu_opencl.h>
 
 extern struct starpu_opencl_program programs;
 

+ 1 - 1
doc/starpu.texi

@@ -5,7 +5,7 @@
 @settitle StarPU Handbook
 @c %**end of header
 
-@include chapters/version.texi
+@include version.texi
 
 @copying
 Copyright @copyright{} 2009--2011  Universit@'e de Bordeaux 1

+ 2 - 2
doc/tutorial/README

@@ -41,6 +41,6 @@ Instructions on how to compile and run StarPU examples
 % make vector_scal
 % ./vector_scal
 
-% STARPU_NCPU=0 ./vector_scal
-% STARPU_NCPU=0 STARPU_NCUDA=0 ./vector_scal
+% STARPU_NCPUS=0 ./vector_scal
+% STARPU_NCPUS=0 STARPU_NCUDA=0 ./vector_scal
 

+ 1 - 0
doc/tutorial/vector_scal.c

@@ -36,6 +36,7 @@
  *  3- how a kernel can manipulate the data (buffers[0].vector.ptr)
  */
 #include <starpu.h>
+#include <starpu_opencl.h>
 
 #define    NX    2048
 

+ 2 - 1
doc/tutorial/vector_scal_cuda.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  *
  * Redistribution  and  use  in  source and binary forms, with or without
@@ -29,6 +29,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 static __global__ void vector_mult_cuda(float *val, unsigned n, float factor)
 {

+ 2 - 1
doc/tutorial/vector_scal_opencl.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  *
  * Redistribution  and  use  in  source and binary forms, with or without
@@ -29,6 +29,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_opencl.h>
 
 extern struct starpu_opencl_program programs;
 

+ 13 - 67
examples/Makefile.am

@@ -115,8 +115,8 @@ noinst_HEADERS = 				\
 	heat/dw_factolu.h			\
 	lu/xlu.h				\
 	lu/xlu_kernels.h			\
-	lu/lu-float.h				\
-	lu/lu-double.h				\
+	lu/float.h				\
+	lu/double.h				\
 	lu/complex_float.h			\
 	lu/complex_double.h			\
 	lu/blas_complex.h			\
@@ -138,14 +138,12 @@ noinst_HEADERS = 				\
 	filters/custom_mf/custom_interface.h    \
 	filters/custom_mf/custom_types.h	\
 	interface/complex_interface.h		\
-	interface/complex_codelet.h		\
 	pi/pi.h					\
 	pi/SobolQRNG/sobol.h			\
 	pi/SobolQRNG/sobol_gold.h		\
 	pi/SobolQRNG/sobol_gpu.h		\
 	pi/SobolQRNG/sobol_primitives.h         \
-	reductions/dot_product.h                \
-	basic_examples/vector_scal_cpu_template.h
+	reductions/dot_product.h
 
 #####################################
 # What to install and what to check #
@@ -168,16 +166,9 @@ LOADER			=	loader
 loader_CPPFLAGS =  $(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
 LOADER_BIN		=	$(abs_top_builddir)/examples/$(LOADER)
 loader_SOURCES		=	../tests/loader.c
-
-if STARPU_HAVE_AM111
-TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
-LOG_COMPILER		=	$(LOADER_BIN)
-else
 TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(LOADER_BIN)
 endif
 
-endif
-
 examplebin_PROGRAMS +=				\
 	basic_examples/hello_world		\
 	basic_examples/vector_scal		\
@@ -190,9 +181,6 @@ examplebin_PROGRAMS +=				\
 	filters/fvector				\
 	filters/fblock				\
 	filters/fmatrix				\
-	filters/shadow				\
-	filters/shadow2d			\
-	filters/shadow3d			\
 	tag_example/tag_example			\
 	tag_example/tag_example2		\
 	tag_example/tag_example3		\
@@ -202,13 +190,13 @@ examplebin_PROGRAMS +=				\
 	spmv/spmv				\
 	callback/callback			\
 	incrementer/incrementer			\
-	binary/binary				\
 	interface/complex			\
 	matvecmult/matvecmult			\
 	profiling/profiling			\
 	scheduler/dummy_sched			\
 	reductions/dot_product			\
 	reductions/minmax_reduction		\
+	mandelbrot/mandelbrot			\
 	ppm_downscaler/ppm_downscaler		\
 	ppm_downscaler/yuv_downscaler
 
@@ -231,8 +219,7 @@ examplebin_PROGRAMS +=				\
 	lu/lu_implicit_example_float		\
 	lu/lu_implicit_example_double		\
 	heat/heat				\
-	cg/cg					\
-	pipeline/pipeline
+	cg/cg
 endif
 
 if MKL_BLAS_LIB
@@ -268,7 +255,6 @@ STARPU_EXAMPLES +=				\
 	spmv/spmv				\
 	callback/callback			\
 	incrementer/incrementer			\
-	binary/binary				\
 	interface/complex			\
 	matvecmult/matvecmult			\
 	profiling/profiling			\
@@ -295,8 +281,7 @@ STARPU_EXAMPLES +=				\
 	lu/lu_implicit_example_float		\
 	lu/lu_implicit_example_double		\
 	heat/heat				\
-	cg/cg					\
-	pipeline/pipeline
+	cg/cg
 endif
 
 if MKL_BLAS_LIB
@@ -323,6 +308,7 @@ basic_examples_vector_scal_SOURCES =		\
 if STARPU_HAVE_ICC
 basic_examples_vector_scal_SOURCES +=		\
 	basic_examples/vector_scal_cpu_icc.icc
+basic_examples/vector_scal_cpu_icc.o: CFLAGS += -Dscal_cpu_func=scal_cpu_func_icc -Dscal_sse_func=scal_sse_func_icc
 endif
 
 if STARPU_USE_CUDA
@@ -747,17 +733,6 @@ nobase_STARPU_OPENCL_DATA_DATA += \
 	incrementer/incrementer_kernels_opencl_kernel.cl
 endif
 
-##################
-# Binary example #
-##################
-
-binary_binary_SOURCES =	\
-	binary/binary.c
-if STARPU_USE_OPENCL
-binary_binary_SOURCES +=	\
-	incrementer/incrementer_kernels_opencl.c
-endif
-
 #####################
 # interface example #
 #####################
@@ -805,6 +780,12 @@ endif
 # Mandelbrot Set #
 ##################
 
+mandelbrot_mandelbrot_CPPFLAGS = $(AM_CFLAGS) $(AM_CPPFLAGS)
+if HAVE_X11
+mandelbrot_mandelbrot_CPPFLAGS += $(X_CFLAGS)
+mandelbrot_mandelbrot_LDADD = $(X_PRE_LIBS) $(X_LIBS) $(X_EXTRA_LIBS) -lX11
+endif
+
 ################
 # Top Examples #
 ################
@@ -848,41 +829,6 @@ pi_pi_redux_LDADD =				\
 	$(STARPU_CURAND_LDFLAGS)
 endif
 
-###########################
-# OpenGL interoperability #
-###########################
-
-if HAVE_OPENGL
-examplebin_PROGRAMS +=				\
-	gl_interop/gl_interop			\
-	gl_interop/gl_interop_idle
-
-gl_interop_gl_interop_SOURCES =			\
-	gl_interop/gl_interop.c
-
-gl_interop_gl_interop_LDADD =			\
-	$(STARPU_OPENGL_RENDER_LDFLAGS)
-
-gl_interop_gl_interop_idle_SOURCES =		\
-	gl_interop/gl_interop_idle.c
-
-gl_interop_gl_interop_idle_LDADD =		\
-	$(STARPU_OPENGL_RENDER_LDFLAGS)
-endif
-
-####################
-# pipeline example #
-####################
-
-if !NO_BLAS_LIB
-pipeline_pipeline_SOURCES	=	\
-	pipeline/pipeline.c		\
-	common/blas.c
-
-pipeline_pipeline_LDADD =		\
-	$(STARPU_BLAS_LDFLAGS)
-endif
-
 showcheck:
 	-cat $(TEST_LOGS) /dev/null
 	for i in $(SUBDIRS) ; do \

+ 3 - 10
examples/audio/starpu_audio_processing.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
@@ -179,11 +179,9 @@ static void band_filter_kernel_gpu(void *descr[], __attribute__((unused)) void *
 	{
 		cures = cufftPlan1d(&plans[workerid].plan, nsamples, CUFFT_R2C, 1);
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
-		cufftSetStream(plans[workerid].plan, starpu_cuda_get_local_stream());
 
 		cures = cufftPlan1d(&plans[workerid].inv_plan, nsamples, CUFFT_C2R, 1);
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
-		cufftSetStream(plans[workerid].inv_plan, starpu_cuda_get_local_stream());
 
 		cudaMalloc((void **)&plans[workerid].localout,
 					nsamples*sizeof(cufftComplex));
@@ -200,11 +198,11 @@ static void band_filter_kernel_gpu(void *descr[], __attribute__((unused)) void *
 	
 	/* filter low freqs */
 	unsigned lowfreq_index = (LOWFREQ*nsamples)/SAMPLERATE;
-	cudaMemsetAsync(&localout[0], 0, lowfreq_index*sizeof(fftwf_complex), starpu_cuda_get_local_stream());
+	cudaMemset(&localout[0], 0, lowfreq_index*sizeof(fftwf_complex));
 
 	/* filter high freqs */
 	unsigned hifreq_index = (HIFREQ*nsamples)/SAMPLERATE;
-	cudaMemsetAsync(&localout[hifreq_index], nsamples/2, (nsamples/2 - hifreq_index)*sizeof(fftwf_complex), starpu_cuda_get_local_stream());
+	cudaMemset(&localout[hifreq_index], nsamples/2, (nsamples/2 - hifreq_index)*sizeof(fftwf_complex));
 
 	/* inverse FFT */
 	cures = cufftExecC2R(plans[workerid].inv_plan, localout, localA);
@@ -212,7 +210,6 @@ static void band_filter_kernel_gpu(void *descr[], __attribute__((unused)) void *
 
 	/* FFTW does not normalize its output ! */
 	cublasSscal (nsamples, 1.0f/nsamples, localA, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -413,8 +410,6 @@ int main(int argc, char **argv)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	starpu_helper_cublas_init();
-
 	starpu_vector_data_register(&A_handle, 0, (uintptr_t)A, niter*nsamples, sizeof(float));
 
 	struct starpu_data_filter f =
@@ -463,8 +458,6 @@ int main(int argc, char **argv)
 	starpu_data_unpartition(A_handle, 0);
 	starpu_data_unregister(A_handle);
 
-	starpu_helper_cublas_shutdown();
-
 	/* we are done ! */
 	starpu_shutdown();
 

+ 4 - 1
examples/axpy/axpy.c

@@ -28,6 +28,9 @@
 #ifdef STARPU_USE_CUDA
 #include <cublas.h>
 #endif
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#endif
 
 #include "axpy.h"
 
@@ -71,7 +74,7 @@ void axpy_gpu(void *descr[], __attribute__((unused)) void *arg)
 	TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 
 	CUBLASAXPY((int)n, alpha, block_x, 1, block_y, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+	cudaThreadSynchronize();
 }
 #endif
 

+ 1 - 0
examples/axpy/axpy_opencl.c

@@ -15,6 +15,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_opencl.h>
 #include "axpy.h"
 
 extern struct starpu_opencl_program opencl_program;

+ 1 - 0
examples/basic_examples/block.c

@@ -16,6 +16,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_opencl.h>
 #include <pthread.h>
 #include <math.h>
 

+ 1 - 0
examples/basic_examples/block_cuda.cu

@@ -15,6 +15,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 static __global__ void cuda_block(float *block, int nx, int ny, int nz, unsigned ldy, unsigned ldz, float multiplier)
 {

+ 1 - 0
examples/basic_examples/block_opencl.c

@@ -16,6 +16,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_opencl.h>
 
 #define CHECK_CL_SET_KERNEL_ARG(kernel, n, size, ptr)       \
 do						    	    \

+ 3 - 0
examples/basic_examples/multiformat.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#endif
 #include "multiformat_types.h"
 
 static int ncpu = 0;

+ 1 - 0
examples/basic_examples/multiformat_conversion_codelets_cuda.cu

@@ -15,6 +15,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_cuda.h>
 #include "multiformat_types.h"
 
 static __global__ void cpu_to_cuda_cuda(struct point *src,

+ 1 - 0
examples/basic_examples/multiformat_conversion_codelets_opencl.c

@@ -15,6 +15,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_opencl.h>
 
 extern struct starpu_opencl_program opencl_conversion_program;
 

+ 1 - 0
examples/basic_examples/multiformat_cuda.cu

@@ -15,6 +15,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_cuda.h>
 #include "multiformat_types.h"
 
 static __global__ void multiformat_cuda(struct struct_of_arrays *soa, unsigned n)

+ 1 - 0
examples/basic_examples/multiformat_opencl.c

@@ -15,6 +15,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_opencl.h>
 
 extern struct starpu_opencl_program opencl_program;
 

+ 1 - 0
examples/basic_examples/variable.c

@@ -29,6 +29,7 @@ extern void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args);
 #endif
 
 #ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
 extern void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args);
 struct starpu_opencl_program opencl_program;
 #endif

+ 2 - 1
examples/basic_examples/variable_kernels.cu

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,6 +16,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 static __global__ void cuda_variable(float * tab)
 {

+ 2 - 1
examples/basic_examples/variable_kernels_opencl.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -16,6 +16,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_opencl.h>
 
 extern struct starpu_opencl_program opencl_program;
 void opencl_codelet(void *descr[], void *_args)

+ 2 - 10
examples/basic_examples/vector_scal.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,8 +23,8 @@
  *  3- how a kernel can manipulate the data (buffers[0].vector.ptr)
  */
 
-#include <config.h>
 #include <starpu.h>
+#include <starpu_opencl.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
@@ -66,7 +66,6 @@ static struct starpu_codelet cl =
 		, scal_sse_func_icc
 #endif
 #endif
-		, NULL
 	},
 #ifdef STARPU_USE_CUDA
 	/* CUDA implementation of the codelet */
@@ -88,15 +87,8 @@ struct starpu_opencl_program opencl_program;
 
 static int approximately_equal(float a, float b)
 {
-#ifdef STARPU_HAVE_NEARBYINTF
 	int ai = (int) nearbyintf(a * 1000.0);
 	int bi = (int) nearbyintf(b * 1000.0);
-#elif defined(STARPU_HAVE_RINTF)
-	int ai = (int) rintf(a * 1000.0);
-	int bi = (int) rintf(b * 1000.0);
-#else
-#error "Please define either nearbyintf or rintf."
-#endif
 	return ai == bi;
 }
 

+ 1 - 0
examples/basic_examples/vector_scal_c.c

@@ -26,6 +26,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_opencl.h>
 #include <stdio.h>
 
 

+ 62 - 3
examples/basic_examples/vector_scal_cpu.c

@@ -18,8 +18,67 @@
  * This example complements vector_scale.c: here we implement a CPU version.
  */
 
-#include "vector_scal_cpu_template.h"
+#include <starpu.h>
+#ifdef __SSE__
+#include <xmmintrin.h>
+#endif
 
-VECTOR_SCAL_CPU_FUNC(scal_cpu_func)
-VECTOR_SCAL_SSE_FUNC(scal_sse_func)
+/* This kernel takes a buffer and scales it by a constant factor */
+void scal_cpu_func(void *buffers[], void *cl_arg)
+{
+	unsigned i;
+	float *factor = (float *) cl_arg;
 
+	/*
+	 * The "buffers" array matches the task->handles array: for instance
+	 * task->handles[0] is a handle that corresponds to a data with
+	 * vector "interface", so that the first entry of the array in the
+	 * codelet is a pointer to a structure describing such a vector (i.e.
+	 * struct starpu_vector_interface *). Here, we therefore manipulate
+	 * the buffers[0] element as a vector: nx gives the number of elements
+	 * in the array, ptr gives the location of the array (that was possibly
+	 * migrated/replicated), and elemsize gives the size of each element.
+	 */
+
+	struct starpu_vector_interface *vector = (struct starpu_vector_interface *) buffers[0];
+
+	/* length of the vector */
+	unsigned n = STARPU_VECTOR_GET_NX(vector);
+
+	/* get a pointer to the local copy of the vector: note that we have to
+	 * cast it to (float *) since a vector could contain any type of
+	 * elements so that the .ptr field is actually a uintptr_t */
+	float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
+
+	/* scale the vector */
+	for (i = 0; i < n; i++)
+		val[i] *= *factor;
+}
+
+#ifdef __SSE__
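+/* SSE-vectorized variant: multiplies four packed floats per iteration */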
+void scal_sse_func(void *buffers[], void *cl_arg)
+{
+	float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
+	unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
+	unsigned int n_iterations = n/4;
+
+	__m128 *VECTOR = (__m128*) vector;
+	__m128 FACTOR __attribute__((aligned(16)));
+	float factor = *(float *) cl_arg;
+	FACTOR = _mm_set1_ps(factor);
+
+	unsigned int i;	
+	for (i = 0; i < n_iterations; i++)
+		VECTOR[i] = _mm_mul_ps(FACTOR, VECTOR[i]);
+
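+	/* handle trailing elements that do not fill a whole 4-float SSE vector */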
+	unsigned int remainder = n%4;
+	if (remainder != 0)
+	{
+		unsigned int start = 4 * n_iterations;
+		for (i = start; i < start+remainder; ++i)
+		{
+			vector[i] = factor * vector[i];
+		}
+	}
+}
+#endif

+ 0 - 26
examples/basic_examples/vector_scal_cpu_icc.icc

@@ -1,26 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*
- * This example complements vector_scale.c: here we implement a CPU version,
- * meant to be compiled by icc.
- */
-
-#include "vector_scal_cpu_template.h"
-
-VECTOR_SCAL_CPU_FUNC(scal_cpu_func_icc)
-VECTOR_SCAL_SSE_FUNC(scal_sse_func_icc)
-

+ 1 - 0
examples/basic_examples/vector_scal_cpu_icc.icc

@@ -0,0 +1 @@
+vector_scal_cpu.c

+ 0 - 93
examples/basic_examples/vector_scal_cpu_template.h

@@ -1,93 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*
- * This example complements vector_scale.c: here we implement a CPU version.
- */
-
-#ifndef __VECTOR_SCAL_CPU_TEMPLATE_H__
-#define __VECTOR_SCAL_CPU_TEMPLATE_H__
-
-#include <starpu.h>
-#ifdef __SSE__
-#include <xmmintrin.h>
-#endif
-
-/* This kernel takes a buffer and scales it by a constant factor */
-#define VECTOR_SCAL_CPU_FUNC(func_name)                                        \
-void func_name(void *buffers[], void *cl_arg)                                  \
-{                                                                              \
-	unsigned i;                                                            \
-	float *factor = (float *) cl_arg;                                      \
-                                                                               \
-	/*                                                                     \
-	 * The "buffers" array matches the task->handles array: for instance   \
-	 * task->handles[0] is a handle that corresponds to a data with        \
-	 * vector "interface", so that the first entry of the array in the     \
-	 * codelet  is a pointer to a structure describing such a vector (ie.  \
-	 * struct starpu_vector_interface *). Here, we therefore manipulate    \
-	 * the buffers[0] element as a vector: nx gives the number of elements \
-	 * in the array, ptr gives the location of the array (that was possibly \
-	 * migrated/replicated), and elemsize gives the size of each elements.  \
-	 */                                                                    \
-                                                                               \
-	struct starpu_vector_interface *vector = (struct starpu_vector_interface *) buffers[0]; \
-                                                                               \
-	/* length of the vector */                                             \
-	unsigned n = STARPU_VECTOR_GET_NX(vector);                             \
-                                                                               \
-	/* get a pointer to the local copy of the vector : note that we have to \
-	 * cast it in (float *) since a vector could contain any type of       \
-	 * elements so that the .ptr field is actually a uintptr_t */          \
-	float *val = (float *)STARPU_VECTOR_GET_PTR(vector);                   \
-                                                                               \
-	/* scale the vector */                                                 \
-	for (i = 0; i < n; i++)                                                \
-		val[i] *= *factor;                                             \
-}
-
-#ifdef __SSE__
-#define VECTOR_SCAL_SSE_FUNC(func_name)                                        \
-void func_name(void *buffers[], void *cl_arg)                                  \
-{                                                                              \
-	float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);           \
-	unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);                     \
-	unsigned int n_iterations = n/4;                                       \
-                                                                               \
-	__m128 *VECTOR = (__m128*) vector;                                     \
-	__m128 FACTOR __attribute__((aligned(16)));                            \
-	float factor = *(float *) cl_arg;                                      \
-	FACTOR = _mm_set1_ps(factor);                                          \
-                                                                               \
-	unsigned int i;	                                                       \
-	for (i = 0; i < n_iterations; i++)                                     \
-		VECTOR[i] = _mm_mul_ps(FACTOR, VECTOR[i]);                     \
-                                                                               \
-	unsigned int remainder = n%4;                                          \
-	if (remainder != 0)                                                    \
-	{                                                                      \
-		unsigned int start = 4 * n_iterations;                         \
-		for (i = start; i < start+remainder; ++i)                      \
-		{                                                              \
-			vector[i] = factor * vector[i];                        \
-		}                                                              \
-	}                                                                      \
-}
-#else /* !__SSE__ */
-#define VECTOR_SCAL_SSE_FUNC(func_name)
-#endif /* !__SSE__ */
-
-#endif /* !__VECTOR_SCAL_CPU_TEMPLATE_H__ */

+ 2 - 1
examples/basic_examples/vector_scal_cuda.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -20,6 +20,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 static __global__ void vector_mult_cuda(float *val, unsigned n,
                                         float factor)

+ 2 - 1
examples/basic_examples/vector_scal_opencl.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
  * Copyright (C) 2010  Institut National de Recherche en Informatique et Automatique
  * Copyright (C) 2011  Université de Bordeaux 1
  *
@@ -21,6 +21,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_opencl.h>
 
 extern struct starpu_opencl_program opencl_program;
 

+ 0 - 118
examples/binary/binary.c

@@ -1,118 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include <starpu.h>
-#include <pthread.h>
-#include <sys/time.h>
-
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
-
-#ifdef STARPU_USE_OPENCL
-extern void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args);
-struct starpu_opencl_program opencl_program;
-#endif
-
-struct starpu_codelet cl =
-{
-#ifdef STARPU_USE_OPENCL
-	.opencl_funcs = {opencl_codelet, NULL},
-#endif
-	.nbuffers = 1,
-	.modes = {STARPU_RW}
-};
-
-int compute(char *file_name, int load_as_file)
-{
-	float float_array[4] __attribute__ ((aligned (16))) = { 0.0f, 0.0f, 0.0f, 0.0f};
-	starpu_data_handle_t float_array_handle;
-	unsigned i;
-	int ret = 0;
-	unsigned niter = 500;
-
-	starpu_vector_data_register(&float_array_handle, 0, (uintptr_t)&float_array, 4, sizeof(float));
-
-#ifdef STARPU_USE_OPENCL
-	if (load_as_file)
-	{
-		ret = starpu_opencl_compile_opencl_from_file(file_name, NULL);
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_compile_opencl_from_file");
-		ret = starpu_opencl_load_binary_opencl(file_name, &opencl_program);
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_binary_opencl");
-	}
-	else
-	{
-		char located_file_name[1024];
-		char located_dir_name[1024];
-		char opencl_program_source[16384];
-		starpu_opencl_load_program_source(file_name, located_file_name, located_dir_name, opencl_program_source);
-		ret = starpu_opencl_compile_opencl_from_string(opencl_program_source, "incrementer", NULL);
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_compile_opencl_from_file");
-		ret = starpu_opencl_load_binary_opencl("incrementer", &opencl_program);
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_binary_opencl");
-	}
-#endif
-
-	for (i = 0; i < niter; i++)
-	{
-		ret = starpu_insert_task(&cl, STARPU_RW, float_array_handle, 0);
-		if (STARPU_UNLIKELY(ret == -ENODEV))
-		{
-			FPRINTF(stderr, "No worker may execute this task\n");
-			exit(0);
-		}
-	}
-
-	starpu_task_wait_for_all();
-
-	/* update the array in RAM */
-	starpu_data_unregister(float_array_handle);
-
-	FPRINTF(stderr, "array -> %f, %f, %f, %f\n", float_array[0], float_array[1], float_array[2], float_array[3]);
-
-	if (float_array[0] != niter || float_array[0] != float_array[1] + float_array[2] + float_array[3])
-	{
-		FPRINTF(stderr, "Incorrect result\n");
-		ret = 1;
-	}
-	return ret;
-}
-
-int main(int argc, char **argv)
-{
-	int ret = 0;
-	struct starpu_conf conf;
-
-	starpu_conf_init(&conf);
-	conf.ncpus = 0;
-	conf.ncuda = 0;
-
-        ret = starpu_init(&conf);
-	if (STARPU_UNLIKELY(ret == -ENODEV))
-	{
-                FPRINTF(stderr, "This application requires an OpenCL worker.\n");
-		starpu_shutdown();
-		return 77;
-	}
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-
-	ret = compute("examples/incrementer/incrementer_kernels_opencl_kernel.cl", 1);
-	if (ret == 0)
-		ret = compute("examples/incrementer/incrementer_kernels_opencl_kernel.cl", 0);
-
-	starpu_shutdown();
-	return ret;
-}

+ 3 - 0
examples/cg/cg.h

@@ -24,8 +24,11 @@
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cublas.h>
+#include <starpu_cuda.h>
 #endif
 
+#include <starpu.h>
+
 #define DOUBLE
 
 #ifdef DOUBLE

+ 2 - 1
examples/cg/cg_dot_kernel.cu

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,6 +16,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 #include "cg.h"
 

+ 4 - 3
examples/cholesky/cholesky.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -30,6 +30,7 @@
 
 #include <common/blas.h>
 #include <starpu.h>
+#include <starpu_bound.h>
 
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 #define NMAXBLOCKS	32
@@ -147,7 +148,7 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 			check = 1;
 		}
 
-		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i],"--help") == 0)
+		if (strcmp(argv[i], "-h") == 0)
 		{
 			printf("usage : %s [-pin] [-size size] [-nblocks nblocks] [-check]\n", argv[0]);
 		}

+ 8 - 5
examples/cholesky/cholesky_kernels.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011-2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,13 +15,16 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include <starpu.h>
+#include <starpu_config.h>
 #include "cholesky.h"
 #include "../common/blas.h"
-#if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_MAGMA)
+#ifdef STARPU_USE_CUDA
+#include <starpu_cuda.h>
+#ifdef STARPU_HAVE_MAGMA
 #include "magma.h"
 #include "magma_lapack.h"
 #endif
+#endif
 
 /*
  *   U22 
@@ -193,7 +196,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 				fprintf(stderr, "Error in Magma: %d\n", ret);
 				STARPU_ABORT();
 			}
-			cudaError_t cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaError_t cures = cudaThreadSynchronize();
 			STARPU_ASSERT(!cures);
 			}
 #else

+ 2 - 2
examples/common/blas.c

@@ -30,7 +30,7 @@
 #ifdef STARPU_ATLAS
 
 inline void SGEMM(char *transa, char *transb, int M, int N, int K, 
-			float alpha, const float *A, int lda, const float *B, int ldb, 
+			float alpha, float *A, int lda, float *B, int ldb, 
 			float beta, float *C, int ldc)
 {
 	enum CBLAS_TRANSPOSE ta = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
@@ -241,7 +241,7 @@ void DSWAP(const int n, double *x, const int incx, double *y, const int incy)
 #elif defined(STARPU_GOTO) || defined(STARPU_SYSTEM_BLAS) || defined(STARPU_MKL)
 
 inline void SGEMM(char *transa, char *transb, int M, int N, int K, 
-			float alpha, const float *A, int lda, const float *B, int ldb, 
+			float alpha, float *A, int lda, float *B, int ldb, 
 			float beta, float *C, int ldc)
 {
 	sgemm_(transa, transb, &M, &N, &K, &alpha,

+ 2 - 2
examples/common/blas.h

@@ -24,8 +24,8 @@
 #include <cblas.h>
 #endif
 
-void SGEMM(char *transa, char *transb, int M, int N, int K, float alpha, const float *A, int lda, 
-		const float *B, int ldb, float beta, float *C, int ldc);
+void SGEMM(char *transa, char *transb, int M, int N, int K, float alpha, float *A, int lda, 
+		float *B, int ldb, float beta, float *C, int ldc);
 void DGEMM(char *transa, char *transb, int M, int N, int K, double alpha, double *A, int lda, 
 		double *B, int ldb, double beta, double *C, int ldc);
 void SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,

+ 1 - 0
examples/cpp/incrementer_cpp.cpp

@@ -25,6 +25,7 @@ extern "C" void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args
 #endif
 
 #ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
 extern "C" void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args);
 struct starpu_opencl_program opencl_program;
 #endif

+ 1 - 0
examples/filters/custom_mf/conversion.cu

@@ -15,6 +15,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_cuda.h>
 #include "custom_types.h"
 #include "custom_interface.h"
 

+ 1 - 0
examples/filters/custom_mf/conversion_opencl.c

@@ -15,6 +15,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_opencl.h>
 #include "custom_types.h"
 #include "custom_interface.h"
 

+ 1 - 0
examples/filters/custom_mf/cuda.cu

@@ -15,6 +15,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_cuda.h>
 #include "custom_types.h"
 #include "custom_interface.h"
 

+ 47 - 11
examples/filters/custom_mf/custom_interface.c

@@ -14,10 +14,20 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 #include <starpu.h>
+#include <starpu_hash.h>
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#endif
 #include "custom_interface.h"
 #include "custom_types.h"
 
+static int copy_ram_to_ram(void *src_interface, unsigned src_node,
+			   void *dst_interface, unsigned dst_node);
 #ifdef STARPU_USE_CUDA
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node,
+			    void *dst_interface, unsigned dst_node);
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node,
+			    void *dst_interface, unsigned dst_node);
 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node,
 				  void *dst_interface, unsigned dst_node,
 				  cudaStream_t stream);
@@ -40,19 +50,19 @@ static int copy_opencl_to_opencl(void *src_interface, unsigned src_node,
 				 void *dst_interface, unsigned dst_node);
 static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
 				    void *dst_interface, unsigned dst_node,
-				    cl_event *event);
+				    void *event);
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
 				    void *dst_interface, unsigned dst_node,
-				    cl_event *event);
+				    void *event);
 #endif /* !STARPU_USE_OPENCL */
 
 static struct starpu_data_copy_methods custom_copy_data_methods_s =
 {
-	.ram_to_ram = NULL,
+	.ram_to_ram = copy_ram_to_ram,
 	.ram_to_spu = NULL,
 #ifdef STARPU_USE_CUDA
-	.ram_to_cuda        = NULL,
-	.cuda_to_ram        = NULL,
+	.ram_to_cuda        = copy_ram_to_cuda,
+	.cuda_to_ram        = copy_cuda_to_ram,
 	.ram_to_cuda_async  = copy_ram_to_cuda_async,
 	.cuda_to_ram_async  = copy_cuda_to_ram_async,
 	.cuda_to_cuda       = copy_cuda_to_cuda,
@@ -81,6 +91,7 @@ static void*    custom_handle_to_pointer(starpu_data_handle_t data_handle,
 static void     free_custom_buffer_on_node(void *data_interface, uint32_t node);
 static size_t   custom_interface_get_size(starpu_data_handle_t handle);
 static uint32_t footprint_custom_interface_crc32(starpu_data_handle_t handle);
+static int      custom_compare(void *data_interface_a, void *data_interface_b);
 static void     display_custom_interface(starpu_data_handle_t handle, FILE *f);
 static uint32_t custom_get_nx(starpu_data_handle_t handle);
 
@@ -102,7 +113,7 @@ static struct starpu_data_interface_ops interface_custom_ops =
 	.copy_methods          = &custom_copy_data_methods_s,
 	.get_size              = custom_interface_get_size,
 	.footprint             = footprint_custom_interface_crc32,
-	.compare               = NULL,
+	.compare               = custom_compare,
 #ifdef STARPU_USE_GORDON
 	.convert_to_gordon     = NULL,
 #endif
@@ -318,11 +329,16 @@ static uint32_t footprint_custom_interface_crc32(starpu_data_handle_t handle)
 	return starpu_crc32_be(custom_get_nx(handle), 0);
 }
 
+static int custom_compare(void *data_interface_a, void *data_interface_b)
+{
+	/* TODO */
+	assert(0);
+}
+
 static void display_custom_interface(starpu_data_handle_t handle, FILE *f)
 {
-	struct custom_data_interface *ci = (struct custom_data_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
-	fprintf(f, "Custom interface of size %d", ci->nx);
+	/* TODO */
+	assert(0);
 }
 
 static uint32_t
@@ -341,6 +357,7 @@ void custom_data_register(starpu_data_handle_t *handle,
 				 uint32_t nx,
 				 struct starpu_multiformat_data_interface_ops *format_ops)
 {
+	/* XXX Deprecated fields ? */
 	struct custom_data_interface custom =
 	{
 		.cpu_ptr = ptr,
@@ -360,7 +377,26 @@ void custom_data_register(starpu_data_handle_t *handle,
 	starpu_data_register(handle, home_node, &custom, &interface_custom_ops);
 }
 
+static int copy_ram_to_ram(void *src_interface, unsigned src_node,
+			   void *dst_interface, unsigned dst_node)
+{
+	/* TODO */
+	assert(0);
+}
 #ifdef STARPU_USE_CUDA
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node,
+			    void *dst_interface, unsigned dst_node)
+{
+	/* TODO */
+	assert(0);
+}
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node,
+			    void *dst_interface, unsigned dst_node)
+{
+	/* TODO */
+	assert(0);
+}
+
 static int
 copy_cuda_common_async(void *src_interface, unsigned src_node,
 		       void *dst_interface, unsigned dst_node,
@@ -477,7 +513,7 @@ static int copy_opencl_to_opencl(void *src_interface, unsigned src_node,
 
 static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
 				    void *dst_interface, unsigned dst_node,
-				    cl_event *event)
+				    void *event)
 {
 	ssize_t size;
 	struct custom_data_interface *src_custom, *dst_custom;
@@ -520,7 +556,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
 
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
 				    void *dst_interface, unsigned dst_node,
-				    cl_event *event)
+				    void *event)
 {
 	ssize_t size;
 	struct custom_data_interface *src_custom, *dst_custom;

+ 3 - 0
examples/filters/custom_mf/custom_mf_filter.c

@@ -16,6 +16,9 @@
 #include <starpu.h>
 #include "custom_interface.h"
 #include "custom_types.h"
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#endif /* !STARPU_USE_OPENCL */
 
 #define N 12
 

+ 1 - 0
examples/filters/custom_mf/custom_opencl.c

@@ -15,6 +15,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_opencl.h>
 #include "custom_types.h"
 #include "custom_interface.h"
 

+ 1 - 0
examples/filters/fblock.c

@@ -16,6 +16,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_opencl.h>
 
 #define NX    5
 #define NY    4

+ 2 - 1
examples/filters/fblock_cuda.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,6 +15,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 static __global__ void fblock_cuda(int *block, int nx, int ny, int nz, unsigned ldy, unsigned ldz, float factor)
 {

+ 2 - 1
examples/filters/fblock_opencl.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -16,6 +16,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_opencl.h>
 
 #define CHECK_CL_SET_KERNEL_ARG(kernel, n, size, ptr)       \
 do                                                          \

+ 0 - 1
examples/filters/fvector.c

@@ -101,7 +101,6 @@ int main(int argc, char **argv)
 	return 0;
 
 enodev:
-	FPRINTF(stderr, "WARNING: No one can execute this task\n");
 	starpu_shutdown();
 	return 77;
 }

+ 0 - 189
examples/filters/shadow.c

@@ -1,189 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*
- * This examplifies the use of the shadow filter: a source "vector" of NX
- * elements (plus 2*SHADOW wrap-around elements) is partitioned into vectors
- * with some shadowing, and these are copied into a destination "vector2" of
- * NRPARTS*(NX/NPARTS+2*SHADOW) elements, partitioned in the traditionnal way,
- * thus showing how shadowing shows up.
- *
- * For instance, with NX=8, SHADOW=1, and NPARTS=4:
- *
- * vector
- * x0 x1 x2 x3 x4 x5 x6 x7 x8 x9
- *
- * is partitioned into 4 pieces:
- *
- * x0 x1 x2 x3
- *       x2 x3 x4 x5
- *             x4 x5 x6 x7
- *                   x6 x7 x8 x9
- *
- * which are copied into the 4 destination subparts of vector2, thus getting in
- * the end:
- *
- * x0 x1 x2 x3 x2 x3 x4 x5 x4 x5 x6 x7 x6 x7 x8 x9
- */
-
-#include <starpu.h>
-
-/* Shadow width */
-#define SHADOW 2
-#define NX    30
-#define PARTS 3
-
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
-
-void cpu_func(void *buffers[], void *cl_arg)
-{
-        unsigned i;
-
-        /* length of the shadowed source vector */
-        unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
-        /* local copy of the shadowed source vector pointer */
-        int *val = (int *)STARPU_VECTOR_GET_PTR(buffers[0]);
-
-        /* length of the destination vector */
-        unsigned n2 = STARPU_VECTOR_GET_NX(buffers[1]);
-        /* local copy of the destination vector pointer */
-        int *val2 = (int *)STARPU_VECTOR_GET_PTR(buffers[1]);
-
-	/* If things go right, sizes should match */
-	STARPU_ASSERT(n == n2);
-	for (i = 0; i < n; i++)
-		val2[i] = val[i];
-}
-
-#ifdef STARPU_USE_CUDA
-void cuda_func(void *buffers[], void *cl_arg)
-{
-        /* length of the shadowed source vector */
-        unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
-        /* local copy of the shadowed source vector pointer */
-        int *val = (int *)STARPU_VECTOR_GET_PTR(buffers[0]);
-
-        /* length of the destination vector */
-        unsigned n2 = STARPU_VECTOR_GET_NX(buffers[1]);
-        /* local copy of the destination vector pointer */
-        int *val2 = (int *)STARPU_VECTOR_GET_PTR(buffers[1]);
-
-	/* If things go right, sizes should match */
-	STARPU_ASSERT(n == n2);
-	cudaMemcpyAsync(val2, val, n*sizeof(*val), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
-}
-#endif
-
-int main(int argc, char **argv)
-{
-	unsigned i, j;
-        int vector[NX + 2*SHADOW];
-        int vector2[NX + PARTS*2*SHADOW];
-	starpu_data_handle_t handle, handle2;
-	int ret;
-
-        struct starpu_codelet cl =
-	{
-                .where = STARPU_CPU
-#ifdef STARPU_USE_CUDA
-			|STARPU_CUDA
-#endif
-			,
-                .cpu_funcs = {cpu_func, NULL},
-#ifdef STARPU_USE_CUDA
-                .cuda_funcs = {cuda_func, NULL},
-#endif
-                .nbuffers = 2,
-		.modes = {STARPU_R, STARPU_W}
-        };
-
-        for(i=0 ; i<NX ; i++) vector[SHADOW+i] = i;
-	for(i=0 ; i<SHADOW ; i++) vector[i] = vector[i+NX];
-	for(i=0 ; i<SHADOW ; i++) vector[SHADOW+NX+i] = vector[SHADOW+i];
-        FPRINTF(stderr,"IN  Vector: ");
-        for(i=0 ; i<NX + 2*SHADOW ; i++) FPRINTF(stderr, "%5d ", vector[i]);
-        FPRINTF(stderr,"\n");
-
-	ret = starpu_init(NULL);
-	if (ret == -ENODEV)
-		exit(77);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-
-	/* Declare source vector to StarPU */
-	starpu_vector_data_register(&handle, 0, (uintptr_t)vector, NX + 2*SHADOW, sizeof(vector[0]));
-
-	/* Declare destination vector to StarPU */
-	starpu_vector_data_register(&handle2, 0, (uintptr_t)vector2, NX + PARTS*2*SHADOW, sizeof(vector[0]));
-
-        /* Partition the source vector in PARTS sub-vectors with shadows */
-	/* NOTE: the resulting handles should only be used in read-only mode,
-	 * as StarPU will not know how the overlapping parts would have to be
-	 * combined. */
-	struct starpu_data_filter f =
-	{
-		.filter_func = starpu_block_shadow_filter_func_vector,
-		.nchildren = PARTS,
-		.filter_arg_ptr = (void*)(uintptr_t) SHADOW /* Shadow width */
-	};
-	starpu_data_partition(handle, &f);
-
-        /* Partition the destination vector in PARTS sub-vectors */
-	struct starpu_data_filter f2 =
-	{
-		.filter_func = starpu_block_filter_func_vector,
-		.nchildren = PARTS,
-	};
-	starpu_data_partition(handle2, &f2);
-
-        /* Submit a task on each sub-vector */
-	for (i=0; i<starpu_data_get_nb_children(handle); i++)
-	{
-                starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 1, i);
-                starpu_data_handle_t sub_handle2 = starpu_data_get_sub_data(handle2, 1, i);
-                struct starpu_task *task = starpu_task_create();
-
-		task->handles[0] = sub_handle;
-		task->handles[1] = sub_handle2;
-                task->cl = &cl;
-                task->synchronous = 1;
-
-		ret = starpu_task_submit(task);
-		if (ret == -ENODEV) goto enodev;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-	}
-
-	starpu_data_unpartition(handle, 0);
-	starpu_data_unpartition(handle2, 0);
-        starpu_data_unregister(handle);
-        starpu_data_unregister(handle2);
-	starpu_shutdown();
-
-        FPRINTF(stderr,"OUT Vector: ");
-        for(i=0 ; i<NX + PARTS*2*SHADOW ; i++) FPRINTF(stderr, "%5d ", vector2[i]);
-        FPRINTF(stderr,"\n");
-	for(i=0 ; i<PARTS ; i++)
-		for (j=0 ; j<NX/PARTS ; j++)
-			STARPU_ASSERT(vector2[i*(NX/PARTS+2*SHADOW)+j] == vector[i*(NX/PARTS)+j]);
-
-	return 0;
-
-enodev:
-	FPRINTF(stderr, "WARNING: No one can execute this task\n");
-	starpu_shutdown();
-	return 77;
-}
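
For reference, the shadow filter used above slices the padded source so that child i starts at offset i*(NX/PARTS) and spans NX/PARTS + 2*SHADOW elements; neighbouring children therefore overlap by 2*SHADOW entries, which is why the note in the file restricts them to read-only use. A minimal, standalone sketch of that arithmetic (plain C, no StarPU needed), reusing the NX/PARTS/SHADOW constants of the removed file:

#include <stdio.h>

#define SHADOW 2
#define NX    30
#define PARTS 3

int main(void)
{
	unsigned i;
	for (i = 0; i < PARTS; i++)
	{
		unsigned offset = i * (NX / PARTS);      /* start in the padded parent */
		unsigned len = NX / PARTS + 2 * SHADOW;  /* shadowed child length */
		printf("child %u: parent elements [%u..%u)\n", i, offset, offset + len);
	}
	return 0;
}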

+ 0 - 291
examples/filters/shadow2d.c

@@ -1,291 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*
- * This exemplifies the use of the matrix shadow filters: a source "matrix" of
- * NX*NY elements (plus 2*NX*SHADOWX+2*NY*SHADOWY+4*SHADOWX*SHADOWY wrap-around
- * elements) is partitioned into matrices with some shadowing, and these are
- * copied into a destination "matrix2" of
- * NPARTSX*NPARTSY*((NX/NPARTSX+2*SHADOWX)*(NY/NPARTSY+2*SHADOWY)) elements,
- * partitioned in the traditional way, thus showing how shadowing shows up.
- *
- * For instance, with NX=NY=8, SHADOWX=SHADOWY=1, and NPARTSX=NPARTSY=4:
- *
- * matrix
- * 0123456789
- * 1234567890
- * 2345678901
- * 3456789012
- * 4567890123
- * 5678901234
- * 6789012345
- * 7890123456
- * 8901234567
- * 9012345678
- *
- * is partitioned into 4*4 pieces:
- *
- * 0123 2345 4567 6789
- * 1234 3456 5678 7890
- * 2345 4567 6789 8901
- * 3456 5678 7890 9012
- *
- * 2345 4567 6789 8901
- * 3456 5678 7890 9012
- * 4567 6789 8901 0123
- * 5678 7890 9012 1234
- *
- * 4567 6789 8901 0123
- * 5678 7890 9012 1234
- * 6789 8901 0123 2345
- * 7890 9012 1234 3456
- *
- * 6789 8901 0123 2345
- * 7890 9012 1234 3456
- * 8901 0123 2345 4567
- * 9012 1234 3456 5678
- *
- * which are copied into the 4*4 destination subparts of matrix2, thus getting in
- * the end:
- *
- * 0123234545676789
- * 1234345656787890
- * 2345456767898901
- * 3456567878909012
- * 2345456767898901
- * 3456567878909012
- * 4567678989010123
- * 5678789090121234
- * 4567678989010123
- * 5678789090121234
- * 6789890101232345
- * 7890901212343456
- * 6789890101232345
- * 7890901212343456
- * 8901012323454567
- * 9012123434565678
- */
-
-#include <starpu.h>
-
-/* Shadow width */
-#define SHADOWX 3
-#define SHADOWY 2
-#define NX    20
-#define NY    30
-#define PARTSX 2
-#define PARTSY 3
-
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
-
-void cpu_func(void *buffers[], void *cl_arg)
-{
-        /* length of the shadowed source matrix */
-        unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
-        unsigned n = STARPU_MATRIX_GET_NX(buffers[0]);
-        unsigned m = STARPU_MATRIX_GET_NY(buffers[0]);
-        /* local copy of the shadowed source matrix pointer */
-        int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
-
-        /* length of the destination matrix */
-        unsigned ld2 = STARPU_MATRIX_GET_LD(buffers[1]);
-        unsigned n2 = STARPU_MATRIX_GET_NX(buffers[1]);
-        unsigned m2 = STARPU_MATRIX_GET_NY(buffers[1]);
-        /* local copy of the destination matrix pointer */
-        int *val2 = (int *)STARPU_MATRIX_GET_PTR(buffers[1]);
-
-	unsigned i, j;
-
-	/* If things go right, sizes should match */
-	STARPU_ASSERT(n == n2);
-	STARPU_ASSERT(m == m2);
-	for (j = 0; j < m; j++)
-		for (i = 0; i < n; i++)
-			val2[j*ld2+i] = val[j*ld+i];
-}
-
-#ifdef STARPU_USE_CUDA
-void cuda_func(void *buffers[], void *cl_arg)
-{
-        /* length of the shadowed source matrix */
-        unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
-        unsigned n = STARPU_MATRIX_GET_NX(buffers[0]);
-        unsigned m = STARPU_MATRIX_GET_NY(buffers[0]);
-        /* local copy of the shadowed source matrix pointer */
-        int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
-
-        /* length of the destination matrix */
-        unsigned ld2 = STARPU_MATRIX_GET_LD(buffers[1]);
-        unsigned n2 = STARPU_MATRIX_GET_NX(buffers[1]);
-        unsigned m2 = STARPU_MATRIX_GET_NY(buffers[1]);
-        /* local copy of the destination matrix pointer */
-        int *val2 = (int *)STARPU_MATRIX_GET_PTR(buffers[1]);
-
-	/* If things go right, sizes should match */
-	STARPU_ASSERT(n == n2);
-	STARPU_ASSERT(m == m2);
-	cudaMemcpy2DAsync(val2, ld2*sizeof(*val2), val, ld*sizeof(*val), n*sizeof(*val), m, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
-}
-#endif
-
-int main(int argc, char **argv)
-{
-	unsigned i, j, k, l;
-        int matrix[NY + 2*SHADOWY][NX + 2*SHADOWX];
-        int matrix2[NY + PARTSY*2*SHADOWY][NX + PARTSX*2*SHADOWX];
-	starpu_data_handle_t handle, handle2;
-	int ret;
-
-        struct starpu_codelet cl =
-	{
-                .where = STARPU_CPU
-#ifdef STARPU_USE_CUDA
-			|STARPU_CUDA
-#endif
-			,
-                .cpu_funcs = {cpu_func, NULL},
-#ifdef STARPU_USE_CUDA
-                .cuda_funcs = {cuda_func, NULL},
-#endif
-                .nbuffers = 2,
-		.modes = {STARPU_R, STARPU_W}
-        };
-
-	memset(matrix, -1, sizeof(matrix));
-	for(j=1 ; j<=NY ; j++)
-		for(i=1 ; i<=NX ; i++)
-			matrix[SHADOWY+j-1][SHADOWX+i-1] = i+j;
-
-	/* Copy borders */
-	for (j = SHADOWY ; j<SHADOWY+NY ; j++)
-		for(i=0 ; i<SHADOWX ; i++) {
-			matrix[j][i] = matrix[j][i+NX];
-			matrix[j][SHADOWX+NX+i] = matrix[j][SHADOWX+i];
-		}
-	for(j=0 ; j<SHADOWY ; j++)
-		for(i=SHADOWX ; i<SHADOWX+NX ; i++) {
-			matrix[j][i] = matrix[j+NY][i];
-			matrix[SHADOWY+NY+j][i] = matrix[SHADOWY+j][i];
-		}
-	/* Copy corners */
-	for(j=0 ; j<SHADOWY ; j++)
-		for(i=0 ; i<SHADOWX ; i++) {
-			matrix[j][i] = matrix[j+NY][i+NX];
-			matrix[j][SHADOWX+NX+i] = matrix[j+NY][SHADOWX+i];
-			matrix[SHADOWY+NY+j][i] = matrix[SHADOWY+j][i+NX];
-			matrix[SHADOWY+NY+j][SHADOWX+NX+i] = matrix[SHADOWY+j][SHADOWX+i];
-		}
-
-        FPRINTF(stderr,"IN  Matrix:\n");
-	for(j=0 ; j<NY + 2*SHADOWY ; j++)
-	{
-		for(i=0 ; i<NX + 2*SHADOWX ; i++)
-			FPRINTF(stderr, "%5d ", matrix[j][i]);
-		FPRINTF(stderr,"\n");
-	}
-        FPRINTF(stderr,"\n");
-
-	ret = starpu_init(NULL);
-	if (ret == -ENODEV)
-		exit(77);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-
-	/* Declare source matrix to StarPU */
-	starpu_matrix_data_register(&handle, 0, (uintptr_t)matrix, NX + 2*SHADOWX, NX + 2*SHADOWX, NY + 2*SHADOWY, sizeof(matrix[0][0]));
-
-	/* Declare destination matrix to StarPU */
-	starpu_matrix_data_register(&handle2, 0, (uintptr_t)matrix2, NX + PARTSX*2*SHADOWX, NX + PARTSX*2*SHADOWX, NY + PARTSY*2*SHADOWY, sizeof(matrix2[0][0]));
-
-        /* Partition the source matrix in PARTSY*PARTSX sub-matrices with shadows */
-	/* NOTE: the resulting handles should only be used in read-only mode,
-	 * as StarPU will not know how the overlapping parts would have to be
-	 * combined. */
-	struct starpu_data_filter fy =
-	{
-		.filter_func = starpu_vertical_block_shadow_filter_func,
-		.nchildren = PARTSY,
-		.filter_arg_ptr = (void*)(uintptr_t) SHADOWY /* Shadow width */
-	};
-	struct starpu_data_filter fx =
-	{
-		.filter_func = starpu_block_shadow_filter_func,
-		.nchildren = PARTSX,
-		.filter_arg_ptr = (void*)(uintptr_t) SHADOWX /* Shadow width */
-	};
-	starpu_data_map_filters(handle, 2, &fy, &fx);
-
-        /* Partition the destination matrix in PARTSY*PARTSX sub-matrices */
-	struct starpu_data_filter fy2 =
-	{
-		.filter_func = starpu_vertical_block_filter_func,
-		.nchildren = PARTSY,
-	};
-	struct starpu_data_filter fx2 =
-	{
-		.filter_func = starpu_block_filter_func,
-		.nchildren = PARTSX,
-	};
-	starpu_data_map_filters(handle2, 2, &fy2, &fx2);
-
-        /* Submit a task on each sub-matrix */
-	for (j=0; j<PARTSY; j++)
-	{
-		for (i=0; i<PARTSX; i++)
-		{
-			starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 2, j, i);
-			starpu_data_handle_t sub_handle2 = starpu_data_get_sub_data(handle2, 2, j, i);
-			struct starpu_task *task = starpu_task_create();
-
-			task->handles[0] = sub_handle;
-			task->handles[1] = sub_handle2;
-			task->cl = &cl;
-			task->synchronous = 1;
-
-			ret = starpu_task_submit(task);
-			if (ret == -ENODEV) goto enodev;
-			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-		}
-	}
-
-	starpu_data_unpartition(handle, 0);
-	starpu_data_unpartition(handle2, 0);
-        starpu_data_unregister(handle);
-        starpu_data_unregister(handle2);
-	starpu_shutdown();
-
-        FPRINTF(stderr,"OUT Matrix:\n");
-	for(j=0 ; j<NY + PARTSY*2*SHADOWY ; j++)
-	{
-		for(i=0 ; i<NX + PARTSX*2*SHADOWX ; i++)
-			FPRINTF(stderr, "%5d ", matrix2[j][i]);
-		FPRINTF(stderr,"\n");
-	}
-        FPRINTF(stderr,"\n");
-	for(j=0 ; j<PARTSY ; j++)
-		for(i=0 ; i<PARTSX ; i++)
-			for (l=0 ; l<NY/PARTSY + 2*SHADOWY ; l++)
-				for (k=0 ; k<NX/PARTSX + 2*SHADOWX ; k++)
-					STARPU_ASSERT(matrix2[j*(NY/PARTSY+2*SHADOWY)+l][i*(NX/PARTSX+2*SHADOWX)+k] == matrix[j*(NY/PARTSY)+l][i*(NX/PARTSX)+k]);
-
-	return 0;
-
-enodev:
-	FPRINTF(stderr, "WARNING: No one can execute this task\n");
-	starpu_shutdown();
-	return 77;
-}
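
The two starpu_data_map_filters() calls above apply their filters recursively. A sketch of the equivalent explicit calls, assuming the StarPU 1.0 partitioning API used throughout this diff (handle, fy, fx and PARTSY are the names from the removed file):

unsigned j;

/* partition along Y first, then each of the PARTSY children along X */
starpu_data_partition(handle, &fy);
for (j = 0; j < PARTSY; j++)
	starpu_data_partition(starpu_data_get_sub_data(handle, 1, j), &fx);
/* starpu_data_get_sub_data(handle, 2, j, i) then addresses leaf (j,i),
 * as in the task-submission loop above */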

+ 0 - 331
examples/filters/shadow3d.c

@@ -1,331 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*
- * This exemplifies the use of the 3D matrix shadow filters: a source "matrix" of
- * NX*NY*NZ elements (plus SHADOW wrap-around elements) is partitioned into
- * matrices with some shadowing, and these are copied into a destination
- * "matrix2" of
- * NPARTSX*NPARTSY*NPARTSZ*((NX/NPARTSX+2*SHADOWX)*(NY/NPARTSY+2*SHADOWY)*(NZ/NPARTSZ+2*SHADOWZ))
- * elements, partitioned in the traditional way, thus showing how shadowing
- * shows up.
- */
-
-#include <starpu.h>
-
-/* Shadow width */
-#define SHADOWX 2
-#define SHADOWY 3
-#define SHADOWZ 4
-#define NX    12
-#define NY    9
-#define NZ    6
-#define PARTSX 4
-#define PARTSY 3
-#define PARTSZ 2
-
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
-
-void cpu_func(void *buffers[], void *cl_arg)
-{
-        /* length of the shadowed source matrix */
-        unsigned ldy = STARPU_BLOCK_GET_LDY(buffers[0]);
-        unsigned ldz = STARPU_BLOCK_GET_LDZ(buffers[0]);
-        unsigned x = STARPU_BLOCK_GET_NX(buffers[0]);
-        unsigned y = STARPU_BLOCK_GET_NY(buffers[0]);
-        unsigned z = STARPU_BLOCK_GET_NZ(buffers[0]);
-        /* local copy of the shadowed source matrix pointer */
-        int *val = (int *)STARPU_BLOCK_GET_PTR(buffers[0]);
-
-        /* length of the destination matrix */
-        unsigned ldy2 = STARPU_BLOCK_GET_LDY(buffers[1]);
-        unsigned ldz2 = STARPU_BLOCK_GET_LDZ(buffers[1]);
-        unsigned x2 = STARPU_BLOCK_GET_NX(buffers[1]);
-        unsigned y2 = STARPU_BLOCK_GET_NY(buffers[1]);
-        unsigned z2 = STARPU_BLOCK_GET_NZ(buffers[1]);
-        /* local copy of the destination matrix pointer */
-        int *val2 = (int *)STARPU_BLOCK_GET_PTR(buffers[1]);
-
-	unsigned i, j, k;
-
-	/* If things go right, sizes should match */
-	STARPU_ASSERT(x == x2);
-	STARPU_ASSERT(y == y2);
-	STARPU_ASSERT(z == z2);
-	for (k = 0; k < z; k++)
-		for (j = 0; j < y; j++)
-			for (i = 0; i < x; i++)
-				val2[k*ldz2+j*ldy2+i] = val[k*ldz+j*ldy+i];
-}
-
-#ifdef STARPU_USE_CUDA
-void cuda_func(void *buffers[], void *cl_arg)
-{
-        /* length of the shadowed source matrix */
-        unsigned ldy = STARPU_BLOCK_GET_LDY(buffers[0]);
-        unsigned ldz = STARPU_BLOCK_GET_LDZ(buffers[0]);
-        unsigned x = STARPU_BLOCK_GET_NX(buffers[0]);
-        unsigned y = STARPU_BLOCK_GET_NY(buffers[0]);
-        unsigned z = STARPU_BLOCK_GET_NZ(buffers[0]);
-        /* local copy of the shadowed source matrix pointer */
-        int *val = (int *)STARPU_BLOCK_GET_PTR(buffers[0]);
-
-        /* length of the destination matrix */
-        unsigned ldy2 = STARPU_BLOCK_GET_LDY(buffers[1]);
-        unsigned ldz2 = STARPU_BLOCK_GET_LDZ(buffers[1]);
-        unsigned x2 = STARPU_BLOCK_GET_NX(buffers[1]);
-        unsigned y2 = STARPU_BLOCK_GET_NY(buffers[1]);
-        unsigned z2 = STARPU_BLOCK_GET_NZ(buffers[1]);
-        /* local copy of the destination matrix pointer */
-        int *val2 = (int *)STARPU_BLOCK_GET_PTR(buffers[1]);
-
-	unsigned k;
-	cudaError_t cures;
-
-	/* If things go right, sizes should match */
-	STARPU_ASSERT(x == x2);
-	STARPU_ASSERT(y == y2);
-	STARPU_ASSERT(z == z2);
-	for (k = 0; k < z; k++) {
-		cures = cudaMemcpy2DAsync(val2+k*ldz2, ldy2*sizeof(*val2), val+k*ldz, ldy*sizeof(*val),
-				x*sizeof(*val), y, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
-		STARPU_ASSERT(!cures);
-	}
-	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
-	STARPU_ASSERT(!cures);
-}
-#endif
-
-int main(int argc, char **argv)
-{
-	unsigned i, j, k, l, m, n;
-        int matrix[NZ + 2*SHADOWZ][NY + 2*SHADOWY][NX + 2*SHADOWX];
-        int matrix2[NZ + PARTSZ*2*SHADOWZ][NY + PARTSY*2*SHADOWY][NX + PARTSX*2*SHADOWX];
-	starpu_data_handle_t handle, handle2;
-	int ret;
-
-        struct starpu_codelet cl =
-	{
-                .where = STARPU_CPU
-#ifdef STARPU_USE_CUDA
-			|STARPU_CUDA
-#endif
-			,
-                .cpu_funcs = {cpu_func, NULL},
-#ifdef STARPU_USE_CUDA
-                .cuda_funcs = {cuda_func, NULL},
-#endif
-                .nbuffers = 2,
-		.modes = {STARPU_R, STARPU_W}
-        };
-
-	memset(matrix, -1, sizeof(matrix));
-	for(k=1 ; k<=NZ ; k++)
-		for(j=1 ; j<=NY ; j++)
-			for(i=1 ; i<=NX ; i++)
-				matrix[SHADOWZ+k-1][SHADOWY+j-1][SHADOWX+i-1] = i+j+k;
-
-	/* Copy planes */
-	for (k = SHADOWZ ; k<SHADOWZ+NZ ; k++)
-		for (j = SHADOWY ; j<SHADOWY+NY ; j++)
-			for(i=0 ; i<SHADOWX ; i++) {
-				matrix[k][j][i] = matrix[k][j][i+NX];
-				matrix[k][j][SHADOWX+NX+i] = matrix[k][j][SHADOWX+i];
-			}
-	for(k=SHADOWZ ; k<SHADOWZ+NZ ; k++)
-		for(j=0 ; j<SHADOWY ; j++)
-			for(i=SHADOWX ; i<SHADOWX+NX ; i++) {
-				matrix[k][j][i] = matrix[k][j+NY][i];
-				matrix[k][SHADOWY+NY+j][i] = matrix[k][SHADOWY+j][i];
-			}
-	for(k=0 ; k<SHADOWZ ; k++)
-		for(j=SHADOWY ; j<SHADOWY+NY ; j++)
-			for(i=SHADOWX ; i<SHADOWX+NX ; i++) {
-				matrix[k][j][i] = matrix[k+NZ][j][i];
-				matrix[SHADOWZ+NZ+k][j][i] = matrix[SHADOWZ+k][j][i];
-			}
-
-	/* Copy borders */
-	for (k = SHADOWZ ; k<SHADOWZ+NZ ; k++)
-		for(j=0 ; j<SHADOWY ; j++)
-			for(i=0 ; i<SHADOWX ; i++) {
-				matrix[k][j][i] = matrix[k][j+NY][i+NX];
-				matrix[k][SHADOWY+NY+j][i] = matrix[k][SHADOWY+j][i+NX];
-				matrix[k][SHADOWY+NY+j][SHADOWX+NX+i] = matrix[k][SHADOWY+j][SHADOWX+i];
-				matrix[k][j][SHADOWX+NX+i] = matrix[k][j+NY][SHADOWX+i];
-			}
-	for(k=0 ; k<SHADOWZ ; k++)
-		for (j = SHADOWY ; j<SHADOWY+NY ; j++)
-			for(i=0 ; i<SHADOWX ; i++) {
-				matrix[k][j][i] = matrix[k+NZ][j][i+NX];
-				matrix[SHADOWZ+NZ+k][j][i] = matrix[SHADOWZ+k][j][i+NX];
-				matrix[SHADOWZ+NZ+k][j][SHADOWX+NX+i] = matrix[SHADOWZ+k][j][SHADOWX+i];
-				matrix[k][j][SHADOWX+NX+i] = matrix[k+NZ][j][SHADOWX+i];
-			}
-	for(k=0 ; k<SHADOWZ ; k++)
-		for(j=0 ; j<SHADOWY ; j++)
-			for(i=SHADOWX ; i<SHADOWX+NX ; i++) {
-				matrix[k][j][i] = matrix[k+NZ][j+NY][i];
-				matrix[SHADOWZ+NZ+k][j][i] = matrix[SHADOWZ+k][j+NY][i];
-				matrix[SHADOWZ+NZ+k][SHADOWY+NY+j][i] = matrix[SHADOWZ+k][SHADOWY+j][i];
-				matrix[k][SHADOWY+NY+j][i] = matrix[k+NZ][SHADOWY+j][i];
-			}
-
-	/* Copy corners */
-	for(k=0 ; k<SHADOWZ ; k++)
-		for(j=0 ; j<SHADOWY ; j++)
-			for(i=0 ; i<SHADOWX ; i++) {
-				matrix[k][j][i] = matrix[k+NZ][j+NY][i+NX];
-				matrix[k][j][SHADOWX+NX+i] = matrix[k+NZ][j+NY][SHADOWX+i];
-				matrix[k][SHADOWY+NY+j][i] = matrix[k+NZ][SHADOWY+j][i+NX];
-				matrix[k][SHADOWY+NY+j][SHADOWX+NX+i] = matrix[k+NZ][SHADOWY+j][SHADOWX+i];
-				matrix[SHADOWZ+NZ+k][j][i] = matrix[SHADOWZ+k][j+NY][i+NX];
-				matrix[SHADOWZ+NZ+k][j][SHADOWX+NX+i] = matrix[SHADOWZ+k][j+NY][SHADOWX+i];
-				matrix[SHADOWZ+NZ+k][SHADOWY+NY+j][i] = matrix[SHADOWZ+k][SHADOWY+j][i+NX];
-				matrix[SHADOWZ+NZ+k][SHADOWY+NY+j][SHADOWX+NX+i] = matrix[SHADOWZ+k][SHADOWY+j][SHADOWX+i];
-			}
-
-        FPRINTF(stderr,"IN  Matrix:\n");
-	for(k=0 ; k<NZ + 2*SHADOWZ ; k++)
-	{
-		for(j=0 ; j<NY + 2*SHADOWY ; j++)
-		{
-			for(i=0 ; i<NX + 2*SHADOWX ; i++)
-				FPRINTF(stderr, "%5d ", matrix[k][j][i]);
-			FPRINTF(stderr,"\n");
-		}
-		FPRINTF(stderr,"\n\n");
-	}
-        FPRINTF(stderr,"\n");
-
-	ret = starpu_init(NULL);
-	if (ret == -ENODEV)
-		exit(77);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-
-	/* Declare source matrix to StarPU */
-	starpu_block_data_register(&handle, 0, (uintptr_t)matrix,
-			NX + 2*SHADOWX, (NX + 2*SHADOWX) * (NY + 2*SHADOWY),
-			NX + 2*SHADOWX, NY + 2*SHADOWY, NZ + 2*SHADOWZ,
-			sizeof(matrix[0][0][0]));
-
-	/* Declare destination matrix to StarPU */
-	starpu_block_data_register(&handle2, 0, (uintptr_t)matrix2,
-			NX + PARTSX*2*SHADOWX, (NX + PARTSX*2*SHADOWX) * (NY + PARTSY*2*SHADOWY),
-			NX + PARTSX*2*SHADOWX, NY + PARTSY*2*SHADOWY, NZ + PARTSZ*2*SHADOWZ,
-			sizeof(matrix2[0][0][0]));
-
-        /* Partition the source matrix in PARTSZ*PARTSY*PARTSX sub-matrices with shadows */
-	/* NOTE: the resulting handles should only be used in read-only mode,
-	 * as StarPU will not know how the overlapping parts would have to be
-	 * combined. */
-	struct starpu_data_filter fz =
-	{
-		.filter_func = starpu_depth_block_shadow_filter_func_block,
-		.nchildren = PARTSZ,
-		.filter_arg_ptr = (void*)(uintptr_t) SHADOWZ /* Shadow width */
-	};
-	struct starpu_data_filter fy =
-	{
-		.filter_func = starpu_vertical_block_shadow_filter_func_block,
-		.nchildren = PARTSY,
-		.filter_arg_ptr = (void*)(uintptr_t) SHADOWY /* Shadow width */
-	};
-	struct starpu_data_filter fx =
-	{
-		.filter_func = starpu_block_shadow_filter_func_block,
-		.nchildren = PARTSX,
-		.filter_arg_ptr = (void*)(uintptr_t) SHADOWX /* Shadow width */
-	};
-	starpu_data_map_filters(handle, 3, &fz, &fy, &fx);
-
-        /* Partition the destination matrix in PARTSZ*PARTSY*PARTSX sub-matrices */
-	struct starpu_data_filter fz2 =
-	{
-		.filter_func = starpu_depth_block_filter_func_block,
-		.nchildren = PARTSZ,
-	};
-	struct starpu_data_filter fy2 =
-	{
-		.filter_func = starpu_vertical_block_filter_func_block,
-		.nchildren = PARTSY,
-	};
-	struct starpu_data_filter fx2 =
-	{
-		.filter_func = starpu_block_filter_func_block,
-		.nchildren = PARTSX,
-	};
-	starpu_data_map_filters(handle2, 3, &fz2, &fy2, &fx2);
-
-        /* Submit a task on each sub-matrix */
-	for (k=0; k<PARTSZ; k++)
-	{
-		for (j=0; j<PARTSY; j++)
-		{
-			for (i=0; i<PARTSX; i++)
-			{
-				starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 3, k, j, i);
-				starpu_data_handle_t sub_handle2 = starpu_data_get_sub_data(handle2, 3, k, j, i);
-				struct starpu_task *task = starpu_task_create();
-
-				task->handles[0] = sub_handle;
-				task->handles[1] = sub_handle2;
-				task->cl = &cl;
-				task->synchronous = 1;
-
-				ret = starpu_task_submit(task);
-				if (ret == -ENODEV) goto enodev;
-				STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-			}
-		}
-	}
-
-	starpu_data_unpartition(handle, 0);
-	starpu_data_unpartition(handle2, 0);
-        starpu_data_unregister(handle);
-        starpu_data_unregister(handle2);
-	starpu_shutdown();
-
-        FPRINTF(stderr,"OUT Matrix:\n");
-	for(k=0 ; k<NZ + PARTSZ*2*SHADOWZ ; k++)
-	{
-		for(j=0 ; j<NY + PARTSY*2*SHADOWY ; j++)
-		{
-			for(i=0 ; i<NX + PARTSX*2*SHADOWX ; i++) {
-				FPRINTF(stderr, "%5d ", matrix2[k][j][i]);
-			}
-			FPRINTF(stderr,"\n");
-		}
-		FPRINTF(stderr,"\n\n");
-	}
-        FPRINTF(stderr,"\n");
-	for(k=0 ; k<PARTSZ ; k++)
-		for(j=0 ; j<PARTSY ; j++)
-			for(i=0 ; i<PARTSX ; i++)
-				for (n=0 ; n<NZ/PARTSZ + 2*SHADOWZ ; n++)
-					for (m=0 ; m<NY/PARTSY + 2*SHADOWY ; m++)
-						for (l=0 ; l<NX/PARTSX + 2*SHADOWX ; l++)
-							STARPU_ASSERT(matrix2[k*(NZ/PARTSZ+2*SHADOWZ)+n][j*(NY/PARTSY+2*SHADOWY)+m][i*(NX/PARTSX+2*SHADOWX)+l] ==
-									matrix[k*(NZ/PARTSZ)+n][j*(NY/PARTSY)+m][i*(NX/PARTSX)+l]);
-
-	return 0;
-
-enodev:
-	FPRINTF(stderr, "WARNING: No one can execute this task\n");
-	starpu_shutdown();
-	return 77;
-}
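
The starpu_block_data_register() calls above pass the strides ldy and ldz explicitly, and the kernels then address element (k,j,i) of a buffer as val[k*ldz + j*ldy + i]. For a contiguous C array the strides follow directly from the padded extents; a standalone sketch of the correspondence, with hypothetical names NXP/NYP/NZP standing for the padded sizes NX+2*SHADOWX etc.:

#include <assert.h>

#define NXP 16	/* stands for NX + 2*SHADOWX */
#define NYP 15	/* stands for NY + 2*SHADOWY */
#define NZP 14	/* stands for NZ + 2*SHADOWZ */

static int matrix[NZP][NYP][NXP];

int main(void)
{
	unsigned i = 3, j = 4, k = 5;
	int *base = &matrix[0][0][0];
	/* ldy = NXP and ldz = NXP*NYP for a contiguous C array, which is
	 * what the starpu_block_data_register() calls above pass */
	assert(base + k*(NXP*NYP) + j*NXP + i == &matrix[k][j][i]);
	return 0;
}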

+ 0 - 131
examples/gl_interop/gl_interop.c

@@ -1,131 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2012 Université de Bordeaux 1
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*
- * This example demonstrates how to use StarPU combined with OpenGL rendering,
- * which needs:
- *
- * - initializing GLUT first,
- * - enabling it at initialization,
- * - running the corresponding CUDA worker in the GLUT thread (here, the main
- *   thread).
- */
-
-#include <starpu.h>
-#include <unistd.h>
-#include <GL/glut.h>
-
-void dummy(void *buffers[], void *cl_arg)
-{
-	float *v = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
-
-	printf("Codelet running\n");
-	cudaMemsetAsync(v, 0, STARPU_VECTOR_GET_NX(buffers[0]) * sizeof(float), starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
-	printf("Codelet done\n");
-}
-
-struct starpu_codelet cl = {
-	.where = STARPU_CUDA,
-	.cuda_funcs = { dummy, NULL },
-	.nbuffers = 1,
-	.modes = { STARPU_W },
-};
-
-void foo(void) {
-}
-
-void display(float i) {
-	glClear(GL_COLOR_BUFFER_BIT);
-	glColor3f(1, 1, 1);
-	glBegin(GL_LINES);
-	glVertex2f(-i, -i);
-	glVertex2f(i, i);
-	glEnd();
-	glFinish();
-	glutPostRedisplay();
-	glutMainLoopEvent();
-}
-
-void callback_func(void *foo) {
-	printf("Callback running, rendering\n");
-	float i = 1.;
-	while (i > 0) {
-		usleep(100000);
-		display(i);
-		i -= 0.1;
-	}
-	printf("rendering done\n");
-
-	/* Tell StarPU that the last task has already been submitted */
-	starpu_drivers_request_termination();
-}
-
-int main(int argc, char **argv)
-{
-#if !(defined(STARPU_USE_CUDA) && defined(STARPU_OPENGL_RENDER))
-	return 77;
-#else
-	struct starpu_conf conf;
-	int cuda_device = 0;
-	int cuda_devices[] = { cuda_device };
-	struct starpu_driver drivers[] = {
-		{ .type = STARPU_CUDA_WORKER, .id.cuda_id = cuda_device }
-	};
-	int ret;
-	struct starpu_task *task;
-	starpu_data_handle_t handle;
-
-	glutInit(&argc, argv);
-	glutInitDisplayMode (GLUT_SINGLE | GLUT_RGB);
-	glutInitWindowPosition(0, 0);
-	glutInitWindowSize(300,200);
-	glutCreateWindow("StarPU OpenGL interoperability test");
-	glClearColor (0.5, 0.5, 0.5, 0.0);
-
-	/* Enable OpenGL interoperability */
-	starpu_conf_init(&conf);
-	conf.ncuda = 1;
-	conf.ncpus = 0;
-	conf.nopencl = 0;
-	conf.cuda_opengl_interoperability = cuda_devices;
-	conf.n_cuda_opengl_interoperability = sizeof(cuda_devices) / sizeof(*cuda_devices);
-	conf.not_launched_drivers = drivers;
-	conf.n_not_launched_drivers = sizeof(drivers) / sizeof(*drivers);
-	ret = starpu_init(&conf);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-
-	starpu_vector_data_register(&handle, -1, 0, 10, sizeof(float));
-
-	/* Submit just one dumb task */
-	task = starpu_task_create();
-	task->cl = &cl;
-	task->handles[0] = handle;
-	task->callback_func = callback_func;
-	task->callback_arg = NULL;
-	ret = starpu_task_submit(task);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-
-	/* And run the driver inside main, which will run the task */
-	printf("running the driver\n");
-	starpu_driver_run(&drivers[0]);
-	printf("finished running the driver\n");
-
-	starpu_shutdown();
-
-	return 0;
-#endif
-}
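
Since the CUDA worker above is listed in conf.not_launched_drivers, starpu_init() registers it without spawning a thread for it, and the application has to drive it itself. A sketch of the resulting control flow in main(), as suggested by the removed file (the task callback is what ends the loop):

/* main thread, after submitting the task: */
starpu_driver_run(&drivers[0]);
/* runs the CUDA worker loop right here: executes the task, whose
 * callback renders and finally calls
 * starpu_drivers_request_termination(), which makes
 * starpu_driver_run() return so main can shut StarPU down */
starpu_shutdown();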

+ 0 - 154
examples/gl_interop/gl_interop_idle.c

@@ -1,154 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2012 Université de Bordeaux 1
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*
- * This example demonstrates how to use StarPU combined with OpenGL rendering,
- * which needs:
- *
- * - initializing GLUT first,
- * - enabling it at initialization,
- * - running the corresponding CUDA worker in the GLUT thread (here, the main
- *   thread).
- *
- * The difference with gl_interop.c is that this version runs StarPU tasks in
- * the GLUT idle handler.
- */
-
-#include <starpu.h>
-#include <unistd.h>
-#include <GL/glut.h>
-
-void dummy(void *buffers[], void *cl_arg)
-{
-	float *v = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
-
-	printf("Codelet running\n");
-	cudaMemsetAsync(v, 0, STARPU_VECTOR_GET_NX(buffers[0]) * sizeof(float), starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
-	printf("Codelet done\n");
-}
-
-struct starpu_codelet cl = {
-	.where = STARPU_CUDA,
-	.cuda_funcs = { dummy, NULL },
-	.nbuffers = 1,
-	.modes = { STARPU_W },
-};
-
-void foo(void) {
-}
-
-void display(float i) {
-	glClear(GL_COLOR_BUFFER_BIT);
-	glColor3f(1, 1, 1);
-	glBegin(GL_LINES);
-	glVertex2f(-i, -i);
-	glVertex2f(i, i);
-	glEnd();
-	glFinish();
-	glutPostRedisplay();
-}
-
-static int cuda_devices[] = { 0 };
-static struct starpu_driver drivers[] = {
-	{ .type = STARPU_CUDA_WORKER }
-};
-
-void callback_func(void *foo) {
-	printf("Callback running, rendering\n");
-	float i = 1.;
-	while (i > 0) {
-		usleep(100000);
-		display(i);
-		i -= 0.1;
-	}
-	printf("rendering done\n");
-
-	/* Tell StarPU that the last task has already been submitted */
-	starpu_drivers_request_termination();
-
-	/* And terminate StarPU */
-	starpu_driver_deinit(&drivers[0]);
-	starpu_shutdown();
-	exit(0);
-}
-
-static void idle(void)
-{
-	starpu_driver_run_once(&drivers[0]);
-}
-
-int main(int argc, char **argv)
-{
-#if !(defined(STARPU_USE_CUDA) && defined(STARPU_OPENGL_RENDER))
-	return 77;
-#else
-	struct starpu_conf conf;
-	int ret;
-	struct starpu_task *task;
-	starpu_data_handle_t handle;
-	int cuda_device = 0;
-
-	cuda_devices[0] = cuda_device;
-	drivers[0].id.cuda_id = cuda_device;
-
-	glutInit(&argc, argv);
-	glutInitDisplayMode (GLUT_SINGLE | GLUT_RGB);
-	glutInitWindowPosition(0, 0);
-	glutInitWindowSize(300,200);
-	glutCreateWindow("StarPU OpenGL interoperability test");
-	glClearColor (0.5, 0.5, 0.5, 0.0);
-
-	/* Enable OpenGL interoperability */
-	starpu_conf_init(&conf);
-	conf.ncuda = 1;
-	conf.ncpus = 0;
-	conf.nopencl = 0;
-	conf.cuda_opengl_interoperability = cuda_devices;
-	conf.n_cuda_opengl_interoperability = sizeof(cuda_devices) / sizeof(*cuda_devices);
-	conf.not_launched_drivers = drivers;
-	conf.n_not_launched_drivers = sizeof(drivers) / sizeof(*drivers);
-	ret = starpu_init(&conf);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-
-	starpu_vector_data_register(&handle, -1, 0, 10, sizeof(float));
-
-	/* Submit just one dumb task */
-	task = starpu_task_create();
-	task->cl = &cl;
-	task->handles[0] = handle;
-	task->callback_func = callback_func;
-	task->callback_arg = NULL;
-	ret = starpu_task_submit(task);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-
-	/* And run the driver inside main, which will run the task */
-	printf("running the driver\n");
-	/* Initialize it */
-	starpu_driver_init(&drivers[0]);
-	/* Register driver loop content as idle handler */
-	glutIdleFunc(idle);
-	/* Now run the glut loop */
-	glutMainLoop();
-	/* And deinitialize driver */
-	starpu_driver_deinit(&drivers[0]);
-	printf("finished running the driver\n");
-
-	starpu_shutdown();
-
-	return 0;
-#endif
-}
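
Compared with gl_interop.c, which blocks inside starpu_driver_run(), this variant splits the driver loop into its init / run-once / deinit phases so that GLUT keeps ownership of the thread, executing one driver iteration per idle callback. Condensed from the file above:

starpu_driver_init(&drivers[0]);   /* set the worker up once            */
glutIdleFunc(idle);                /* idle() calls starpu_driver_run_once() */
glutMainLoop();                    /* never returns; the task callback
                                    * deinits the driver and exit(0)s   */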

+ 6 - 3
examples/heat/dw_factolu.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,7 +22,8 @@
 #include <string.h>
 #include <math.h>
 #include <sys/time.h>
-#include <starpu.h>
+/* for STARPU_USE_CUDA */
+#include <starpu_config.h>
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
@@ -31,6 +32,8 @@
 
 #include "../common/blas.h"
 
+#include <starpu.h>
+
 #include "lu_kernels_model.h"
 
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
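
The include shuffle above exists only so that STARPU_USE_CUDA is defined before it is tested: starpu_config.h carries the feature macros, while the full starpu.h is deferred until after the BLAS header. The guard pattern in isolation:

#include <starpu_config.h>  /* defines STARPU_USE_CUDA and friends */
#ifdef STARPU_USE_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#endif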

+ 10 - 10
examples/heat/dw_factolu_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -135,9 +135,9 @@ static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, __attr
 					right, ld12, 1.0f, center, ld22);
 			status = cublasGetError();
 			if (status != CUBLAS_STATUS_SUCCESS)
-				STARPU_CUBLAS_REPORT_ERROR(status);
+				STARPU_ABORT();
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaThreadSynchronize();
 
 			break;
 #endif
@@ -200,9 +200,9 @@ static inline void dw_common_codelet_update_u12(void *descr[], int s, __attribut
 					1.0f, sub11, ld11, sub12, ld12);
 			status = cublasGetError();
 			if (status != CUBLAS_STATUS_SUCCESS)
-				STARPU_CUBLAS_REPORT_ERROR(status);
+				STARPU_ABORT();
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaThreadSynchronize();
 
 			break;
 #endif
@@ -262,9 +262,9 @@ static inline void dw_common_codelet_update_u21(void *descr[], int s, __attribut
 			cublasStrsm('R', 'U', 'N', 'U', ny21, nx21, 1.0f, sub11, ld11, sub21, ld21);
 			status = cublasGetError();
 			if (status != CUBLAS_STATUS_SUCCESS)
-				STARPU_CUBLAS_REPORT_ERROR(status);
+				STARPU_ABORT();
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaThreadSynchronize();
 
 			break;
 #endif
@@ -344,8 +344,8 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, __attribut
 			for (z = 0; z < nx; z++)
 			{
 				float pivot;
-				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
-				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+				cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost);
+				cudaStreamSynchronize(0);
 
 				STARPU_ASSERT(pivot != 0.0f);
 				
@@ -357,7 +357,7 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, __attribut
 								&sub11[(z+1) + (z+1)*ld],ld);
 			}
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaThreadSynchronize();
 
 			break;
 #endif
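
The kernel hunks above swap StarPU's per-worker stream synchronization for whole-device waits: cudaStreamSynchronize(starpu_cuda_get_local_stream()) only waits for work queued on the calling worker's stream, whereas the restored cudaThreadSynchronize() (deprecated in favour of cudaDeviceSynchronize() since CUDA 4.0) blocks until every stream on the device is idle. Side by side:

/* current style: wait only for this worker's stream */
cudaStreamSynchronize(starpu_cuda_get_local_stream());

/* r7157 style: wait for the whole device (deprecated API) */
cudaThreadSynchronize();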

+ 2 - 1
examples/heat/dw_sparse_cg.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -28,6 +28,7 @@
 #include <pthread.h>
 #include <signal.h>
 
+#include <starpu_config.h>
 #include <starpu.h>
 
 #ifdef STARPU_USE_CUDA

+ 0 - 1
examples/heat/heat.c

@@ -788,7 +788,6 @@ int main(int argc, char **argv)
 		if (check)
 			solve_system(DIM, newsize, result, RefArray, Bformer, A, B);
 
-		starpu_helper_cublas_shutdown();
 		starpu_shutdown();
 		free_system(A, B, newsize, pinned);
 	}

+ 2 - 1
examples/heat/heat.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010, 2011-2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,6 +25,7 @@
 #include <math.h>
 
 /* needed for STARPU_OPENGL_RENDER */
+#include <starpu_config.h>
 #include <starpu.h>
 
 #include <common/blas.h>

+ 2 - 1
examples/incrementer/incrementer.c

@@ -27,6 +27,7 @@ extern void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args);
 #endif
 
 #ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
 extern void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args);
 struct starpu_opencl_program opencl_program;
 #endif
@@ -94,7 +95,7 @@ int main(int argc, char **argv)
 
 		task->handles[0] = float_array_handle;
 
-		ret = starpu_task_submit(task);
+		int ret = starpu_task_submit(task);
 		if (STARPU_UNLIKELY(ret == -ENODEV))
 		{
 			FPRINTF(stderr, "No worker may execute this task\n");

+ 3 - 2
examples/incrementer/incrementer_kernels.cu

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,12 +16,13 @@
  */
 
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 static __global__ void cuda_incrementer(float * tab)
 {
 	tab[0] = tab[0] + 1.0f;
 	tab[2] = tab[2] + 1.0f;
-
+	
 	return;
 }
 

+ 2 - 1
examples/incrementer/incrementer_kernels_opencl.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -16,6 +16,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_opencl.h>
 
 extern struct starpu_opencl_program opencl_program;
 void opencl_codelet(void *descr[], void *_args)

+ 55 - 1
examples/interface/complex.c

@@ -16,7 +16,9 @@
 
 #include <starpu.h>
 #include "complex_interface.h"
-#include "complex_codelet.h"
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#endif
 
 #ifdef STARPU_USE_CUDA
 extern void copy_complex_codelet_cuda(void *descr[], __attribute__ ((unused)) void *_args);
@@ -25,6 +27,52 @@ extern void copy_complex_codelet_cuda(void *descr[], __attribute__ ((unused)) vo
 extern void copy_complex_codelet_opencl(void *buffers[], void *args);
 #endif
 
+void compare_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	int nx1 = STARPU_COMPLEX_GET_NX(descr[0]);
+	double *real1 = STARPU_COMPLEX_GET_REAL(descr[0]);
+	double *imaginary1 = STARPU_COMPLEX_GET_IMAGINARY(descr[0]);
+
+	int nx2 = STARPU_COMPLEX_GET_NX(descr[1]);
+	double *real2 = STARPU_COMPLEX_GET_REAL(descr[1]);
+	double *imaginary2 = STARPU_COMPLEX_GET_IMAGINARY(descr[1]);
+
+	int compare = (nx1 == nx2);
+	if (nx1 == nx2)
+	{
+		int i;
+		for(i=0 ; i<nx1 ; i++)
+		{
+			if (real1[i] != real2[i] || imaginary1[i] != imaginary2[i])
+			{
+				compare = 0;
+				break;
+			}
+		}
+	}
+	fprintf(stderr, "Complex numbers are%s similar\n", compare==0 ? " NOT" : "");
+}
+
+void display_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	int nx = STARPU_COMPLEX_GET_NX(descr[0]);
+	double *real = STARPU_COMPLEX_GET_REAL(descr[0]);
+	double *imaginary = STARPU_COMPLEX_GET_IMAGINARY(descr[0]);
+	int i;
+
+	for(i=0 ; i<nx ; i++)
+	{
+		fprintf(stderr, "Complex[%d] = %3.2f + %3.2f i\n", i, real[i], imaginary[i]);
+	}
+}
+
+struct starpu_codelet cl_display =
+{
+	.cpu_funcs = {display_complex_codelet, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_R}
+};
+
 struct starpu_codelet cl_copy =
 {
 #ifdef STARPU_USE_CUDA
@@ -37,6 +85,12 @@ struct starpu_codelet cl_copy =
 	.modes = {STARPU_R, STARPU_W}
 };
 
+struct starpu_codelet cl_compare =
+{
+	.cpu_funcs = {compare_complex_codelet, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_R}
+};
 
 #ifdef STARPU_USE_OPENCL
 struct starpu_opencl_program opencl_program;
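
cl_compare and cl_display, now defined inline here, are CPU-only codelets over the complex interface. A hypothetical submission sketch, reusing the task pattern of the examples removed earlier in this diff (handle and handle2 stand for two registered complex handles):

struct starpu_task *task = starpu_task_create();
int ret;

task->cl = &cl_compare;       /* or &cl_display with a single handle */
task->handles[0] = handle;    /* STARPU_R */
task->handles[1] = handle2;   /* STARPU_R */
task->synchronous = 1;

ret = starpu_task_submit(task);
STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");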

+ 0 - 76
examples/interface/complex_codelet.h

@@ -1,76 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2012  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include <starpu.h>
-#include "complex_interface.h"
-
-#ifndef __COMPLEX_CODELET_H
-#define __COMPLEX_CODELET_H
-
-void compare_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args)
-{
-	int nx1 = STARPU_COMPLEX_GET_NX(descr[0]);
-	double *real1 = STARPU_COMPLEX_GET_REAL(descr[0]);
-	double *imaginary1 = STARPU_COMPLEX_GET_IMAGINARY(descr[0]);
-
-	int nx2 = STARPU_COMPLEX_GET_NX(descr[1]);
-	double *real2 = STARPU_COMPLEX_GET_REAL(descr[1]);
-	double *imaginary2 = STARPU_COMPLEX_GET_IMAGINARY(descr[1]);
-
-	int compare = (nx1 == nx2);
-	if (nx1 == nx2)
-	{
-		int i;
-		for(i=0 ; i<nx1 ; i++)
-		{
-			if (real1[i] != real2[i] || imaginary1[i] != imaginary2[i])
-			{
-				compare = 0;
-				break;
-			}
-		}
-	}
-	fprintf(stderr, "Complex numbers are%s similar\n", compare==0 ? " NOT" : "");
-}
-
-struct starpu_codelet cl_compare =
-{
-	.cpu_funcs = {compare_complex_codelet, NULL},
-	.nbuffers = 2,
-	.modes = {STARPU_R, STARPU_R}
-};
-
-void display_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args)
-{
-	int nx = STARPU_COMPLEX_GET_NX(descr[0]);
-	double *real = STARPU_COMPLEX_GET_REAL(descr[0]);
-	double *imaginary = STARPU_COMPLEX_GET_IMAGINARY(descr[0]);
-	int i;
-
-	for(i=0 ; i<nx ; i++)
-	{
-		fprintf(stderr, "Complex[%d] = %3.2f + %3.2f i\n", i, real[i], imaginary[i]);
-	}
-}
-
-struct starpu_codelet cl_display =
-{
-	.cpu_funcs = {display_complex_codelet, NULL},
-	.nbuffers = 1,
-	.modes = {STARPU_R}
-};
-
-#endif /* __COMPLEX_CODELET_H */

+ 33 - 96
examples/interface/complex_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#include <starpu_cuda.h>
+#include <starpu_opencl.h>
+#include <starpu_hash.h>
 
 #include "complex_interface.h"
 
@@ -134,7 +137,7 @@ static starpu_ssize_t complex_allocate_data_on_node(void *data_interface, uint32
 		}
 #endif
 		default:
-			STARPU_ABORT();
+			STARPU_ASSERT(0);
 	}
 
 	if (fail)
@@ -161,89 +164,45 @@ static uint32_t complex_footprint(starpu_data_handle_t handle)
 	return starpu_crc32_be(starpu_complex_get_nx(handle), 0);
 }
 
-static void *complex_handle_to_pointer(starpu_data_handle_t handle, uint32_t node)
-{
-	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
-
-	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *)
-		starpu_data_get_interface_on_node(handle, node);
-
-	return (void*) complex_interface->real;
-}
-
-static int complex_pack_data(starpu_data_handle_t handle, uint32_t node, void **ptr)
-{
-	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
-
-	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *)
-		starpu_data_get_interface_on_node(handle, node);
-
-	*ptr = malloc(complex_get_size(handle));
-	memcpy(*ptr, complex_interface->real, complex_interface->nx*sizeof(double));
-	memcpy(*ptr+complex_interface->nx*sizeof(double), complex_interface->imaginary, complex_interface->nx*sizeof(double));
-
-	return 0;
-}
-
-static int complex_unpack_data(starpu_data_handle_t handle, uint32_t node, void *ptr)
-{
-	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
-
-	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *)
-		starpu_data_get_interface_on_node(handle, node);
-
-	memcpy(complex_interface->real, ptr, complex_interface->nx*sizeof(double));
-	memcpy(complex_interface->imaginary, ptr+complex_interface->nx*sizeof(double), complex_interface->nx*sizeof(double));
-
-	return 0;
-}
-
 #ifdef STARPU_USE_CUDA
-static int copy_cuda_async_sync(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, enum cudaMemcpyKind kind, cudaStream_t stream)
+static int copy_cuda_common(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, enum cudaMemcpyKind kind)
 {
 	struct starpu_complex_interface *src_complex = src_interface;
 	struct starpu_complex_interface *dst_complex = dst_interface;
 
-	cudaStream_t sstream = stream;
-	int ret;
+	cudaError_t cures;
 
-	ret = starpu_cuda_copy_async_sync((void *)src_complex->real, src_node, (void *)dst_complex->real, dst_node,
-					  src_complex->nx*sizeof(src_complex->real[0]), sstream, kind);
-	if (ret == 0) sstream = NULL;
+	cures = cudaMemcpy((void *)dst_complex->real, (void *)src_complex->real, src_complex->nx*sizeof(src_complex->real[0]), kind);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
 
-	ret = starpu_cuda_copy_async_sync((char *)src_complex->imaginary, src_node, (char *)dst_complex->imaginary, dst_node,
-					  src_complex->nx*sizeof(src_complex->imaginary[0]), sstream, kind);
-	return ret;
-}
+	cures = cudaMemcpy((char *)dst_complex->imaginary, (char *)src_complex->imaginary, src_complex->nx*sizeof(src_complex->imaginary[0]), kind);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
 
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-     return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, NULL);
+	return 0;
 }
 
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
 {
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, stream);
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
 }
 
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
 {
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, NULL);
-}
-
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, stream);
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
 }
 #endif
 
+
 #ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
+static int copy_ram_to_opencl(void *src_interface, unsigned src_node,
+                              void *dst_interface, unsigned dst_node)
 {
 	struct starpu_complex_interface *src_complex = src_interface;
 	struct starpu_complex_interface *dst_complex = dst_interface;
+
 	cl_int err;
-	int ret;
 
 	err = starpu_opencl_copy_ram_to_opencl(src_complex->real,
 					       src_node,
@@ -251,12 +210,10 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void
 					       dst_node,
 					       src_complex->nx * sizeof(src_complex->real[0]),
 					       0,
-					       event,
-					       &ret);
+					       NULL,
+					       NULL);
 	if (STARPU_UNLIKELY(err != CL_SUCCESS))
 		STARPU_OPENCL_REPORT_ERROR(err);
-	if (ret == 0)
-		event = NULL;
 
 	err = starpu_opencl_copy_ram_to_opencl(src_complex->imaginary,
 					       src_node,
@@ -264,38 +221,31 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void
 					       dst_node,
 					       src_complex->nx * sizeof(src_complex->imaginary[0]),
 					       0,
-					       event,
-					       &ret);
+					       NULL,
+					       NULL);
 	if (STARPU_UNLIKELY(err != CL_SUCCESS))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	return ret;
-}
-
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-        return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
+	return 0;
 }
 
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
+static int copy_opencl_to_ram(void *src_interface, unsigned src_node,
+			      void *dst_interface, unsigned dst_node)
 {
 	struct starpu_complex_interface *src_complex = src_interface;
 	struct starpu_complex_interface *dst_complex = dst_interface;
-	cl_int err;
-	int ret;
 
+	cl_int err;
 	err = starpu_opencl_copy_opencl_to_ram((cl_mem) src_complex->real,
 					       src_node,
 					       dst_complex->real,
 					       dst_node,
 					       src_complex->nx * sizeof(src_complex->real[0]),
 					       0,
-					       event,
-					       &ret);
+					       NULL,
+					       NULL);
 	if (STARPU_UNLIKELY(err != CL_SUCCESS))
 		STARPU_OPENCL_REPORT_ERROR(err);
-	if (ret == 0)
-		event = NULL;
 
 	err = starpu_opencl_copy_opencl_to_ram((cl_mem) src_complex->imaginary,
 					       src_node,
@@ -303,33 +253,23 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void
 					       dst_node,
 					       src_complex->nx * sizeof(src_complex->imaginary[0]),
 					       0,
-					       event,
-					       &ret);
+					       NULL,
+					       NULL);
 	if (STARPU_UNLIKELY(err != CL_SUCCESS))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	return ret;
-}
-
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-        return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
+	return 0;
 }
 #endif
-
 static struct starpu_data_copy_methods complex_copy_methods =
 {
 #ifdef STARPU_USE_CUDA
 	.ram_to_cuda = copy_ram_to_cuda,
 	.cuda_to_ram = copy_cuda_to_ram,
-	.ram_to_cuda_async = copy_ram_to_cuda_async,
-	.cuda_to_ram_async = copy_cuda_to_ram_async,
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
 	.opencl_to_ram = copy_opencl_to_ram,
-	.ram_to_opencl_async = copy_ram_to_opencl_async,
-	.opencl_to_ram_async = copy_opencl_to_ram_async,
 #endif
 };
 
@@ -342,9 +282,6 @@ static struct starpu_data_interface_ops interface_complex_ops =
 	.footprint = complex_footprint,
 	.interfaceid = -1,
 	.interface_size = sizeof(struct starpu_complex_interface),
-	.handle_to_pointer = complex_handle_to_pointer,
-	.pack_data = complex_pack_data,
-	.unpack_data = complex_unpack_data
 };
 
 void starpu_complex_data_register(starpu_data_handle_t *handleptr, uint32_t home_node, double *real, double *imaginary, int nx)
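
The rollback gives up the asynchronous entries of complex_copy_methods: in the removed version each *_async method also receives a stream (CUDA) or event (OpenCL) on which completion is recorded, so StarPU can overlap the interface's two memcpys with computation. A sketch of the fuller table, the four *_async fields being exactly the ones deleted above:

static struct starpu_data_copy_methods complex_copy_methods =
{
#ifdef STARPU_USE_CUDA
	.ram_to_cuda       = copy_ram_to_cuda,        /* blocking */
	.cuda_to_ram       = copy_cuda_to_ram,
	.ram_to_cuda_async = copy_ram_to_cuda_async,  /* takes a cudaStream_t */
	.cuda_to_ram_async = copy_cuda_to_ram_async,
#endif
#ifdef STARPU_USE_OPENCL
	.ram_to_opencl       = copy_ram_to_opencl,
	.opencl_to_ram       = copy_opencl_to_ram,
	.ram_to_opencl_async = copy_ram_to_opencl_async, /* takes a cl_event* */
	.opencl_to_ram_async = copy_opencl_to_ram_async,
#endif
};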

+ 0 - 4
examples/interface/complex_interface.h

@@ -16,9 +16,6 @@
 
 #include <starpu.h>
 
-#ifndef __COMPLEX_INTERFACE_H
-#define __COMPLEX_INTERFACE_H
-
 /* interface for complex numbers */
 struct starpu_complex_interface
 {
@@ -37,4 +34,3 @@ int starpu_complex_get_nx(starpu_data_handle_t handle);
 #define STARPU_COMPLEX_GET_IMAGINARY(interface)	(((struct starpu_complex_interface *)(interface))->imaginary)
 #define STARPU_COMPLEX_GET_NX(interface)	(((struct starpu_complex_interface *)(interface))->nx)
 
-#endif /* __COMPLEX_INTERFACE_H */
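
Note that complex.c used to include both complex_interface.h and complex_codelet.h (which itself includes complex_interface.h), so the guard removed here was what made that double inclusion harmless. The deleted pattern:

#ifndef __COMPLEX_INTERFACE_H
#define __COMPLEX_INTERFACE_H

/* ... interface declarations ... */

#endif /* __COMPLEX_INTERFACE_H */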

+ 1 - 0
examples/interface/complex_kernels.cu

@@ -15,6 +15,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_cuda.h>
 #include "complex_interface.h"
 
 static __global__ void complex_copy_cuda(double *o_real, double *o_imaginary, double *i_real, double *i_imaginary, unsigned n)

+ 1 - 0
examples/interface/complex_kernels_opencl.c

@@ -15,6 +15,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_opencl.h>
 #include "complex_interface.h"
 
 extern struct starpu_opencl_program opencl_program;

+ 1 - 1
examples/lu/clu.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
examples/lu/clu_implicit.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
examples/lu/clu_implicit_pivot.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
examples/lu/clu_kernels.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
examples/lu/clu_pivot.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
examples/lu/dlu.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "lu-double.h"
+#include "double.h"
 #include "xlu.c"

+ 1 - 1
examples/lu/dlu_implicit.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "lu-double.h"
+#include "double.h"
 #include "xlu_implicit.c"

+ 1 - 1
examples/lu/dlu_implicit_pivot.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "lu-double.h"
+#include "double.h"
 #include "xlu_implicit_pivot.c"

+ 1 - 1
examples/lu/dlu_kernels.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "lu-double.h"
+#include "double.h"
 #include "xlu_kernels.c"

+ 1 - 1
examples/lu/dlu_pivot.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "lu-double.h"
+#include "double.h"
 #include "xlu_pivot.c"

examples/lu/lu-double.h → examples/lu/double.h


examples/lu/lu-float.h → examples/lu/float.h


+ 2 - 0
examples/lu/lu_example.c

@@ -21,6 +21,8 @@
 #include <time.h>
 #include <math.h>
 #include <starpu.h>
+#include <starpu_profiling.h>
+#include <starpu_bound.h>
 #include "xlu.h"
 #include "xlu_kernels.h"
 

+ 1 - 1
examples/lu/lu_example_complex_double.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 0 - 0
examples/lu/lu_example_complex_float.c


Some files were not shown because too many files changed in this diff