Andra Hugo, 13 years ago
Commit 38e00e9d11
100 changed files with 7350 additions and 1006 deletions
  1. .gitignore (+63 -1)
  2. AUTHORS (+2 -0)
  3. ChangeLog (+85 -1)
  4. Makefile.am (+22 -8)
  5. README (+17 -1)
  6. README.dev (+43 -159)
  7. STARPU-VERSION (+21 -0)
  8. TODO (+9 -0)
  9. acinclude.m4 (+0 -95)
  10. configure.ac (+326 -107)
  11. libstarpu.pc.in (+2 -2)
  12. starpu-1.0.pc.in (+35 -0)
  13. starpu-top/StarPU-Top-common.pri (+1 -1)
  14. starpu-top/StarPU-Top-qwt-system.pri (+0 -2)
  15. starpu-top/StarPU-Top-qwt-system.pri.in (+2 -0)
  16. starpu-top/aboutdialog.ui (+1 -1)
  17. starpu-top/communicationmanager.cpp (+3 -3)
  18. starpu-top/communicationmanager.h (+3 -3)
  19. starpu-top/communicationthread.cpp (+3 -3)
  20. starpu-top/configurationmanager.h (+1 -1)
  21. starpu-top/dataaggregatorwidget.h (+1 -1)
  22. starpu-top/datawidget.h (+1 -1)
  23. starpu-top/extradist (+2 -4)
  24. starpu-top/ganttwidget.cpp (+9 -9)
  25. starpu-top/ganttwidget.h (+6 -6)
  26. starpu-top/images/starpu_top.png (+0 -0)
  27. starpu-top/interactivewidget.h (+1 -1)
  28. starpu-top/mainwindow.cpp (+11 -11)
  29. starpu-top/mainwindow.h (+4 -4)
  30. starpu-top/mainwindow.ui (+3 -3)
  31. starpu-top/preferencesdialog.h (+1 -1)
  32. starpu-top/resources.qrc (+1 -1)
  33. starpu-top/sessionsetupmanager.h (+1 -1)
  34. starpu-top/starputoptypes.h (+7 -7)
  35. starpu-top/taskmanager.cpp (+7 -7)
  36. starpu-top/taskmanager.h (+3 -3)
  37. starpufft/.gitignore (+1 -0)
  38. starpufft/Makefile.am (+97 -0)
  39. starpufft/cuda_kernels.cu (+19 -0)
  40. starpufft/cudaf_kernels.cu (+19 -0)
  41. starpufft/cudax_kernels.cu (+156 -0)
  42. starpufft/cudax_kernels.h (+23 -0)
  43. starpufft/double.h (+51 -0)
  44. starpufft/examples/test.c (+19 -0)
  45. starpufft/examples/test_threads.c (+19 -0)
  46. starpufft/examples/testf.c (+19 -0)
  47. starpufft/examples/testf_threads.c (+19 -0)
  48. starpufft/examples/testx.c (+283 -0)
  49. starpufft/examples/testx_threads.c (+113 -0)
  50. starpufft/float.h (+51 -0)
  51. starpufft/libstarpufft.pc.in (+27 -0)
  52. starpufft/starpufft-1.0.pc.in (+27 -0)
  53. starpufft/starpufft.c (+19 -0)
  54. starpufft/starpufft.h (+60 -0)
  55. starpufft/starpufft_common.c (+21 -0)
  56. starpufft/starpufftf.c (+19 -0)
  57. starpufft/starpufftx.c (+454 -0)
  58. starpufft/starpufftx1d.c (+847 -0)
  59. starpufft/starpufftx2d.c (+850 -0)
  60. tests/Makefile.am (+272 -32)
  61. tests/cholesky/prio.r (+23 -3)
  62. tests/cholesky/sched.r (+23 -3)
  63. tests/core/multithreaded_init.c (+0 -65)
  64. tests/core/task_wait_api.c (+0 -121)
  65. tests/datawizard/acquire_cb.c (+8 -5)
  66. tests/datawizard/acquire_cb_insert.c (+43 -25)
  67. tests/datawizard/acquire_release.c (+46 -18)
  68. tests/datawizard/acquire_release2.c (+42 -16)
  69. tests/datawizard/copy.c (+34 -30)
  70. tests/datawizard/critical_section_with_void_interface.c (+24 -15)
  71. tests/datawizard/data_implicit_deps.c (+66 -32)
  72. tests/datawizard/data_invalidation.c (+46 -30)
  73. tests/datawizard/data_lookup.c (+48 -39)
  74. tests/datawizard/dining_philosophers.c (+35 -18)
  75. tests/datawizard/double_parameter.c (+174 -0)
  76. tests/datawizard/dsm_stress.c (+142 -28)
  77. tests/datawizard/gpu_register.c (+139 -0)
  78. tests/datawizard/handle_to_pointer.c (+23 -17)
  79. tests/datawizard/in_place_partition.c (+102 -0)
  80. tests/datawizard/increment_redux.c (+78 -31)
  81. tests/datawizard/increment_redux_lazy.c (+255 -0)
  82. tests/datawizard/increment_redux_v2.c (+102 -30)
  83. tests/datawizard/interfaces/bcsr/bcsr_cuda.cu (+70 -0)
  84. tests/datawizard/interfaces/bcsr/bcsr_interface.c (+198 -0)
  85. tests/datawizard/interfaces/bcsr/bcsr_opencl.c (+130 -0)
  86. tests/datawizard/interfaces/bcsr/bcsr_opencl_kernel.cl (+29 -0)
  87. tests/datawizard/interfaces/block/block_cuda.cu (+80 -0)
  88. tests/datawizard/interfaces/block/block_interface.c (+163 -0)
  89. tests/datawizard/interfaces/block/block_opencl.c (+120 -0)
  90. tests/datawizard/interfaces/block/block_opencl_kernel.cl (+46 -0)
  91. tests/datawizard/interfaces/copy_interfaces.c (+106 -0)
  92. tests/datawizard/interfaces/csr/csr_cuda.cu (+68 -0)
  93. tests/datawizard/interfaces/csr/csr_interface.c (+170 -0)
  94. tests/datawizard/interfaces/csr/csr_opencl.c (+130 -0)
  95. tests/datawizard/interfaces/csr/csr_opencl_kernel.cl (+29 -0)
  96. tests/datawizard/interfaces/matrix/matrix_cuda.cu (+71 -0)
  97. tests/datawizard/interfaces/matrix/matrix_interface.c (+145 -0)
  98. tests/datawizard/interfaces/matrix/matrix_opencl.c (+129 -0)
  99. tests/datawizard/interfaces/matrix/matrix_opencl_kernel.cl (+31 -0)
  100. tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets.c (+0 -0)

+ 63 - 1
.gitignore

@@ -3,7 +3,6 @@
 /config.status
 /autom4te.cache
 /libtool
-/libstarpu.pc
 /aclocal.m4
 /build-aux
 /GPATH
@@ -186,3 +185,66 @@ starpu.log
 /gcc-plugin/tests/lib-user
 /gcc-plugin/examples/matrix-mult
 /gcc-plugin/src/c-expr.c
+/gcc-plugin/tests/heap-allocated
+/gcc-plugin/tests/output-pointer
+/gcc-plugin/examples/vector_scal/vector_scal
+/doc/starpu.info-1
+/doc/starpu.info-2
+/examples/axpy/axpy
+/examples/basic_examples/mult_impl
+/examples/basic_examples/multiformat
+/examples/cg/cg
+/examples/cholesky/cholesky_grain_tag
+/examples/cholesky/cholesky_implicit
+/examples/cholesky/cholesky_tag
+/examples/cholesky/cholesky_tile_tag
+/examples/cpp/incrementer_cpp
+/examples/filters/custom_mf/custom_mf_filter
+/examples/filters/multiformat/multiformat_filter
+/examples/heat/heat
+/examples/lu/lu_example_double
+/examples/lu/lu_example_float
+/examples/lu/lu_implicit_example_double
+/examples/lu/lu_implicit_example_float
+/examples/mult/dgemm
+/examples/mult/sgemm
+/mpi/starpumpi-1.0.pc
+/socl/socl-1.0.pc
+/starpufft/starpufft-1.0.pc
+/tests/core/deprecated
+/tests/core/deprecated_buffer
+/tests/core/deprecated_func
+/tests/core/multiformat_data_release
+/tests/core/multiformat_handle_conversion
+/tests/core/starpu_init
+/tests/core/starpu_task_bundle
+/tests/core/starpu_worker_exists
+/tests/datawizard/copy
+/tests/datawizard/double_parameter
+/tests/datawizard/gpu_register
+/tests/datawizard/in_place_partition
+/tests/datawizard/increment_redux_lazy
+/tests/datawizard/interfaces/bcsr/bcsr_interface
+/tests/datawizard/interfaces/block/block_interface
+/tests/datawizard/interfaces/csr/csr_interface
+/tests/datawizard/interfaces/matrix/matrix_interface
+/tests/datawizard/interfaces/multiformat/advanced/multiformat_cuda_opencl
+/tests/datawizard/interfaces/multiformat/advanced/multiformat_data_release
+/tests/datawizard/interfaces/multiformat/advanced/multiformat_handle_conversion
+/tests/datawizard/interfaces/multiformat/advanced/multiformat_worker
+/tests/datawizard/interfaces/multiformat/advanced/same_handle
+/tests/datawizard/interfaces/multiformat/multiformat_interface
+/tests/datawizard/interfaces/test_interfaces
+/tests/datawizard/interfaces/test_vector_interface
+/tests/datawizard/interfaces/variable/variable_interface
+/tests/datawizard/interfaces/vector/test_vector_interface
+/tests/datawizard/interfaces/void/void_interface
+/tests/datawizard/partition_lazy
+/tests/loader
+/tests/starpu_machine_display
+/tools/starpu_calibrate_bus.1
+/tools/starpu_machine_display.1
+/tools/starpu_perfmodel_display.1
+/tools/starpu_perfmodel_plot.1
+/starpu-1.0.pc
+/gcc-plugin/examples/cholesky/cholesky

+ 2 - 0
AUTHORS

@@ -12,3 +12,5 @@ Jean-Marie Couteyen <jm.couteyen@gmail.com>
 Anthony Roy <theanthony33@gmail.com>
 David Gómez <david_gomez1380@yahoo.com.mx>
 Nguyen Quôc Dinh <nguyen.quocdinh@gmail.com>
+Antoine Lucas <antoine.lucas.33@gmail.com>
+Pierre André Wacrenier <wacrenier@labri.fr>

+ 85 - 1
ChangeLog

@@ -1,3 +1,87 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+StarPU 1.0 (svn revision xxxx)
+==============================================
+The extensions-again release
+
+  * struct starpu_data_interface_ops --- operations on a data
+        interface --- define a new function pointer allocate_new_data
+        which creates a new data interface of the given type based on
+        an existing handle
+  * Make environment variables take precedence over the configuration
+        passed to starpu_init()
+  * Add man pages for some of the tools
+  * Add reduction mode to starpu_mpi_insert_task
+  * Add C++ application example in examples/cpp/
+  * Increase default value for STARPU_MAXCPUS -- Maximum number of
+        CPUs supported -- to 64.
+  * Libtool interface versioning has been included in libraries names
+        (libstarpu-1.0.so, libstarpumpi-1.0.so,
+        libstarpufft-1.0.so, libsocl-1.0.so)
+  * Enable by default the SOCL extension.
+  * Enable by default the GCC plug-in extension.
+  * Add a field named magic to struct starpu_task which is set when
+        initialising the task. starpu_task_submit will fail if the
+        field does not have the right value. This will hence avoid
+        submitting tasks which have not been properly initialised.
+  * Make where field for struct starpu_codelet optional. When unset, its
+	value will be automatically set based on the availability of the
+	different XXX_funcs fields of the codelet.
+  * Add a hook function pre_exec_hook in struct starpu_sched_policy.
+        The function is meant to be called in drivers. Schedulers
+        can use it to be notified when a task is about being computed.
+  * Define access modes for data handles into starpu_codelet and no longer
+	in starpu_task. Hence mark (struct starpu_task).buffers as
+	deprecated, and add (struct starpu_task).handles and (struct
+	starpu_codelet).modes
+  * Install headers under $includedir/starpu/1.0.
+  * Deprecate cost_model, and introduce cost_function, which is provided
+	with the whole task structure, the target arch and implementation
+	number
+  * Permit the application to provide its own size base for performance
+	models
+  * Fields xxx_func of struct starpu_codelet are made deprecated. One
+	should use instead fields xxx_funcs.
+  * Applications can provide several implementations of a codelet for the
+	same architecture.
+  * A new multi-format interface permits to use different binary formats
+	on CPUs & GPUs, the conversion functions being provided by the
+	application and called by StarPU as needed (and as less as
+	possible).
+  * Add a gcc plugin to extend the C interface with pragmas which allows to
+	easily define codelets and issue tasks.
+  * Add codelet execution time statistics plot.
+  * Add bus speed in starpu_machine_display.
+  * Add a StarPU-Top feedback and steering interface.
+  * Documentation improvement.
+  * Add a STARPU_DATA_ACQUIRE_CB which permits to inline the code to be
+	done.
+  * Permit to specify MPI tags for more efficient starpu_mpi_insert_task
+  * Add SOCL, an OpenCL interface on top of StarPU.
+  * Add gdb functions.
+  * Add complex support to LU example.
+  * Add an OpenMP fork-join example.
+  * Permit to use the same data several times in write mode in the
+	parameters of the same task.
+  * Some types were renamed for consistency. The tools/dev/rename.sh
+	script can be used to port code using former names. You can also
+	choose to include starpu_deprecated_api.h (after starpu.h) to keep
+	using the old types.
+
 StarPU 0.9 (svn revision 3721)
 ==============================================
 The extensions release
@@ -58,7 +142,7 @@ The asynchronous heterogeneous multi-accelerator release
     - Implement starpu_worker_get_count
     - Implement starpu_display_codelet_stats
     - Implement starpu_data_prefetch_on_node
-    - Expose the starpu_data_set_wb_mask function
+    - Expose the starpu_data_set_wt_mask function
   * Support nvidia (heterogeneous) multi-GPU
   * Add the data request mechanism
     - All data transfers use data requests now
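Note: the ChangeLog entries above describe the 1.0 task API change (access modes declared in struct starpu_codelet, task->buffers deprecated in favour of task->handles, and the xxx_funcs arrays replacing xxx_func). The following minimal sketch is not part of this commit; it assumes the 1.0 API as described and shows what a ported codelet and task submission look like:

#include <starpu.h>

/* CPU kernel with the 1.0 prototype: the data buffers plus an optional cl_arg. */
static void scal_cpu(void *buffers[], void *cl_arg)
{
	struct starpu_vector_interface *vector = buffers[0];
	float factor = *(float *) cl_arg;
	float *val = (float *) STARPU_VECTOR_GET_PTR(vector);
	unsigned n = STARPU_VECTOR_GET_NX(vector);
	unsigned i;
	for (i = 0; i < n; i++)
		val[i] *= factor;
}

/* 1.0-style codelet: cpu_funcs[] instead of the deprecated cpu_func, and the
 * access mode declared here rather than in the task.  The where field is left
 * unset; per the ChangeLog it is deduced from the filled-in *_funcs fields. */
static struct starpu_codelet scal_cl =
{
	.cpu_funcs = { scal_cpu, NULL },
	.nbuffers = 1,
	.modes = { STARPU_RW },
};

static int submit_scal(starpu_data_handle_t vector_handle, float *factor)
{
	struct starpu_task *task = starpu_task_create();
	task->cl = &scal_cl;
	task->handles[0] = vector_handle; /* replaces the deprecated task->buffers[0].handle */
	task->cl_arg = factor;
	task->cl_arg_size = sizeof(*factor);
	return starpu_task_submit(task);
}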

+ 22 - 8
Makefile.am

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2011  Université de Bordeaux 1
-# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -34,7 +34,6 @@ if COND_OPT
 SUBDIRS += tests/opt examples/opt
 endif
 
-
 if BUILD_GCC_PLUGIN
 SUBDIRS += gcc-plugin
 endif
@@ -43,12 +42,16 @@ if BUILD_SCHED_CTX_HYPERVISOR
 SUBDIRS += sched_ctx_hypervisor
 endif
 
+if BUILD_STARPUFFT
+SUBDIRS += starpufft
+endif
+
 pkgconfigdir = $(libdir)/pkgconfig
-pkgconfig_DATA = libstarpu.pc
+pkgconfig_DATA = libstarpu.pc starpu-1.0.pc
 
-include_HEADERS = 				\
+versincludedir = $(includedir)/starpu/$(STARPU_EFFECTIVE_VERSION)
+versinclude_HEADERS = 				\
 	include/starpu.h			\
-	include/starpu_config.h			\
 	include/starpu_data_filters.h		\
 	include/starpu_data_interfaces.h	\
 	include/starpu_task.h			\
@@ -57,13 +60,19 @@ include_HEADERS = 				\
 	include/starpu_data.h			\
 	include/starpu_perfmodel.h		\
 	include/starpu_util.h			\
+	include/starpu_fxt.h			\
 	include/starpu_cuda.h			\
 	include/starpu_opencl.h			\
 	include/starpu_expert.h			\
 	include/starpu_profiling.h		\
 	include/starpu_bound.h			\
 	include/starpu_scheduler.h		\
-	include/starpu_top.h
+	include/starpu_top.h			\
+	include/starpu_deprecated_api.h         \
+	include/starpu_hash.h
+
+nodist_versinclude_HEADERS = 			\
+	include/starpu_config.h
 
 if BUILD_STARPU_TOP
 all-local:
@@ -86,6 +95,11 @@ else
 txtdir = ${docdir}
 endif
 txt_DATA = AUTHORS COPYING.LGPL README
-EXTRA_DIST = AUTHORS COPYING.LGPL README
+EXTRA_DIST = AUTHORS COPYING.LGPL README STARPU-VERSION
 
 include starpu-top/extradist
+
+showcheck:
+	for i in $(SUBDIRS) ; do \
+		make -C $$i showcheck ; \
+	done

+ 17 - 1
README

@@ -1,3 +1,19 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
 ++=================++
 || I. Introduction ||
 ++=================++
@@ -134,7 +150,7 @@ Some examples ready to run are installed into $prefix/lib/starpu/{examples,mpi}
 ++==============++
 
 To upgrade your source code from older version (there were quite a few
-renamings), use the tools/rename.sh script
+renamings), use the tools/dev/rename.sh script
 
 ++===============++
 || VIII. Contact ||

+ 43 - 159
README.dev

@@ -1,169 +1,53 @@
-Installing StarPU on windows
-----------------------------
-
-If you are building from a tarball downloaded from the website, you can skip the
-cygwin part.
-
-1. Install cygwin
-
-http://cygwin.com/install.html
-
-Make sure the following packages are available:
-- (Devel)/subversion
-- (Devel)/libtool
-- (Devel)/gcc
-- (Devel)/make
-- your favorite editor (vi, emacs, ...)
-- (Devel)/gdb
-- (Archive)/zip
-- (Devel)/pkg-config
-
-2. Install mingw
-
-http://sourceforge.net/projects/mingw/
-
-3. Install hwloc (not mandatory)
-
-http://www.open-mpi.org/projects/hwloc
-
-4. Install Microsoft Visual C++ Studio Express
-
-   http://www.microsoft.com/express/Downloads
-
-   Add in your path the following directories.
-   (adjusting where necessary for the Installation location according to VC
-    version and on 64 and 32bit Windows versions)
-
-   On cygwin, with Visual C++ 2010 e.g.;
-
-   export PATH="/cygdrive/c/Program Files (x86)/Microsoft Visual Studio 10.0/Common7/IDE":$PATH
-   export PATH="/cygdrive/c/Program Files (x86)/Microsoft Visual Studio 10.0/VC/bin":$PATH
-
-   On MingW, with Visual C++ 2010, e.g.;
-
-   export PATH="/c/Program Files (x86)/Microsoft Visual Studio 10.0/Common7/IDE":$PATH
-   export PATH="/c/Program Files (x86)/Microsoft Visual Studio 10.0/VC/bin":$PATH
-
-   Try to call <lib.exe> and <link.exe> without any option to make sure these
-   dump their help output, else no .def or .lib file will be produced.
-
-5. Install GPU Drivers (not mandatory)
-
-  5.1 Install Cuda
-
-      http://developer.nvidia.com/object/cuda_3_2_downloads.html
-
-      You need to install at least the CUDA toolkit.
-
-      libtool is not able to find the libraries automatically, you
-      need to make some copies:
-
-      copy c:\cuda\lib\cuda.lib c:\cuda\lib\libcuda.lib
-      copy c:\cuda\lib\cudart.lib c:\cuda\lib\libcudart.lib
-      copy c:\cuda\lib\cublas.lib c:\cuda\lib\libcublas.lib
-      copy c:\cuda\lib\cufft.lib c:\cuda\lib\libcufft.lib
-      copy c:\cuda\lib\OpenCL.lib c:\cuda\lib\libOpenCL.lib
-
-      (and if the version of your CUDA driver is >= 3.2)
-
-      copy c:\cuda\lib\curand.lib c:\cuda\lib\libcurand.lib
-
-      Add the CUDA bin directory in your path
-
-      export PATH=/cygdrive/c/CUDA/bin:$PATH
-
-      Since we tell nvcc to build CUDA code with gcc instead of Visual studio,
-      a fix is needed: c:\cuda\include\host_defines.h has a bogus CUDARTAPI
-      definition which makes linking fail completely. Replace the first
-      occurence of
-
-      #define CUDARTAPI
-
-      with
-
-      #ifdef _WIN32
-      #define CUDARTAPI __stdcall
-      #else
-      #define CUDARTAPI
-      #endif
-
-      While at it, you can also comment the __cdecl definition to avoid spurious
-      warnings.
-
-
-  5.2 Install OpenCL
-
-      http://developer.nvidia.com/object/opencl-download.html
-
-      You need to download the NVIDIA Drivers for your version of
-      Windows. Executing the file will extract all files in a given
-      directory. The the driver installation will start, it will fail
-      if no compatibles drivers can be found on your system.
-
-      Anyway, you should copy the *.dl_ files from the directory
-      (extraction path) in the bin directory of the CUDA installation
-      directory (the directory should be v3.2/bin/)
-
-  5.3 Install MsCompress
-
-      http://gnuwin32.sourceforge.net/packages/mscompress.htm
-
-      Go in the CUDA bin directory, uncompress .dl_ files and rename
-      them in .dll files
-
-      cp /cygdrive/c/NVIDIA/DisplayDriver/190.89/International/*.dl_ .
-      for i in *.dl_ ; do /cygdrive/c/Program\ Files/GnuWin32/bin/msexpand.exe  $i ; mv ${i%_} ${i%_}l ; done
-
-If you are building from a tarball downloaded from the website, you can skip the
-autogen.sh part.
-
-6. Start autogen.sh from cygwin
-
-   cd starpu-trunk
-   ./autogen.sh
-
-7. Start a MinGW shell
-
-   /cygdrive/c/MinGW/msys/1.0/bin/sh.exe --login -i
-
-8. Configure, make, install from MinGW
-
-   If you have a non-english version of windows, use
-
-     export LANG=C
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+Contents
+========
+
+- Developer Warnings
+- Naming Conventions
+- Coding Style
+
+Developer Warnings
+------------------
 
-   else libtool has troubles parsing the translated output of the toolchain.
+They are enabled only if the STARPU_DEVEL environment variable is
+defined to a non-empty value, when calling configure.
 
-   cd starpu-trunk
-   mkdir build
-   cd build
-   ../configure --prefix=$PWD/target --disable-default-drand48 \
-        --with-hwloc=<HWLOC installation directory> \
-        --with-cuda-dir=<CUDA installation directory> \
-        --with-cuda-lib-dir=<CUDA installation directory>/lib/Win32 \
-	--with-opencl-dir=<CUDA installation directory>
-   make
-   make install
+
 
-   Also convert a couple of files to CRLF:
+Naming Conventions
+------------------
 
-   sed -e 's/$/'$'\015'/ < README > $PWD/target/README.txt
-   sed -e 's/$/'$'\015'/ < AUTHORS > $PWD/target/AUTHORS.txt
-   sed -e 's/$/'$'\015'/ < COPYING.LGPL > $PWD/target/COPYING.LGPL.txt
+* Prefix names of public objects (types, functions, etc.) with "starpu"
 
-9. If you want your StarPU installation to be standalone, you need to
-   copy the DLL files from hwloc, Cuda, and OpenCL into the StarPU
-   installation bin directory, as well as MinGW/bin/libpthread*dll
+* Prefix names of internal objects (types, functions, etc.) with "_starpu"
 
-   cp <CUDA directory>/bin/*dll target/bin
-   cp <HWLOC directory>/bin/*dll target/bin
-   cp /cygdrive/c/MinGW/bin/libpthread*dll target/bin
+* Names for qualified types (struct, union, enum) do not end with _t, _s or similar.
+  Use _t only for typedef types, such as opaque public types, e.g
+       typedef struct _starpu_data_state* starpu_data_handle_t;
+  or
+       typedef uint64_t starpu_tag_t;
 
-   and set the StarPU bin directory in your path.
+* When a variable can only take a finite set of values, use an enum
+  type instead of defining macros for each of the values.
 
-   export PATH=<StarPU installation directory>/bin:$PATH
+
 
+Coding Style
+------------
 
-Developers warning
-------------------
-They are only enabled if the STARPU_DEVEL environment is defined to a non-empty value.
+* Curly braces always go on a new line
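
To make the conventions above concrete, here is a small illustration (not from this commit; the internal names are hypothetical):

/* Public, opaque type: "starpu" prefix, and "_t" only because it is a typedef
 * (this line is the example quoted in the conventions above). */
typedef struct _starpu_data_state *starpu_data_handle_t;

/* Internal type: "_starpu" prefix, and no _t/_s suffix on the struct tag itself. */
struct _starpu_worker_set
{
	int nworkers;
};

/* A finite set of values is expressed as an enum rather than as macros. */
enum _starpu_worker_status
{
	STATUS_SLEEPING,
	STATUS_EXECUTING
};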

+ 21 - 0
STARPU-VERSION

@@ -0,0 +1,21 @@
+# -*- sh -*-
+
+# Versioning (SONAMEs) for StarPU libraries.
+
+# Libtool interface versioning (info "(libtool) Versioning").
+LIBSTARPU_INTERFACE_CURRENT=0	# increment upon ABI change
+LIBSTARPU_INTERFACE_REVISION=0	# increment upon implementation change
+LIBSTARPU_INTERFACE_AGE=0	# set to CURRENT - PREVIOUS interface
+STARPU_EFFECTIVE_VERSION=1.0
+
+LIBSTARPUFFT_INTERFACE_CURRENT=0	# increment upon ABI change
+LIBSTARPUFFT_INTERFACE_REVISION=0	# increment upon implementation change
+LIBSTARPUFFT_INTERFACE_AGE=0		# set to CURRENT - PREVIOUS interface
+
+LIBSTARPUMPI_INTERFACE_CURRENT=0	# increment upon ABI change
+LIBSTARPUMPI_INTERFACE_REVISION=0	# increment upon implementation change
+LIBSTARPUMPI_INTERFACE_AGE=0		# set to CURRENT - PREVIOUS interface
+
+LIBSOCL_INTERFACE_CURRENT=0	# increment upon ABI change
+LIBSOCL_INTERFACE_REVISION=0	# increment upon implementation change
+LIBSOCL_INTERFACE_AGE=0		# set to CURRENT - PREVIOUS interface

+ 9 - 0
TODO

@@ -0,0 +1,9 @@
+
+Moving access modes for data handles from struct starpu_task to struct starpu_codelet
+=====================================================================================
+
+TODO list
+
+- Make struct starpu_buffer_descr private (or not, as it can still be used in tests and examples)
+
+- When cost_model is provided, but not cost_function, need to rebuild a struct starpu_buffer_descr

+ 0 - 95
acinclude.m4

@@ -1,95 +0,0 @@
-dnl Copyright (C) Free Software Foundation, Inc.
-dnl
-dnl This program is free software; you can redistribute it and/or modify
-dnl it under the terms of the GNU General Public License as published by
-dnl the Free Software Foundation; either version 2 of the License, or
-dnl (at your option) any later version.
-dnl 
-dnl This program is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-dnl GNU General Public License for more details.
-dnl 
-dnl You should have received a copy of the GNU General Public License
-dnl along with this program; if not, write to the Free Software
-dnl Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
-dnl
-dnl This test is taken from libgfortran
-
-dnl Check whether the target supports __sync_val_compare_and_swap.
-AC_DEFUN([STARPU_CHECK_SYNC_VAL_COMPARE_AND_SWAP], [
-  AC_CACHE_CHECK([whether the target supports __sync_val_compare_and_swap],
-		 ac_cv_have_sync_val_compare_and_swap, [
-  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
-			[bar = __sync_val_compare_and_swap(&foo, 0, 1);])],
-			[ac_cv_have_sync_val_compare_and_swap=yes],
-			[ac_cv_have_sync_val_compare_and_swap=no])])
-  if test $ac_cv_have_sync_val_compare_and_swap = yes; then
-    AC_DEFINE(STARPU_HAVE_SYNC_VAL_COMPARE_AND_SWAP, 1,
-	      [Define to 1 if the target supports __sync_val_compare_and_swap])
-  fi])
-
-dnl Check whether the target supports __sync_bool_compare_and_swap.
-AC_DEFUN([STARPU_CHECK_SYNC_BOOL_COMPARE_AND_SWAP], [
-  AC_CACHE_CHECK([whether the target supports __sync_bool_compare_and_swap],
-		 ac_cv_have_sync_bool_compare_and_swap, [
-  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
-			[bar = __sync_bool_compare_and_swap(&foo, 0, 1);])],
-			[ac_cv_have_sync_bool_compare_and_swap=yes],
-			[ac_cv_have_sync_bool_compare_and_swap=no])])
-  if test $ac_cv_have_sync_bool_compare_and_swap = yes; then
-    AC_DEFINE(STARPU_HAVE_SYNC_BOOL_COMPARE_AND_SWAP, 1,
-	      [Define to 1 if the target supports __sync_bool_compare_and_swap])
-  fi])
-
-dnl Check whether the target supports __sync_fetch_and_add.
-AC_DEFUN([STARPU_CHECK_SYNC_FETCH_AND_ADD], [
-  AC_CACHE_CHECK([whether the target supports __sync_fetch_and_add],
-		 ac_cv_have_sync_fetch_and_add, [
-  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
-			[bar = __sync_fetch_and_add(&foo, 1);])],
-			[ac_cv_have_sync_fetch_and_add=yes],
-			[ac_cv_have_sync_fetch_and_add=no])])
-  if test $ac_cv_have_sync_fetch_and_add = yes; then
-    AC_DEFINE(STARPU_HAVE_SYNC_FETCH_AND_ADD, 1,
-	      [Define to 1 if the target supports __sync_fetch_and_add])
-  fi])
-
-dnl Check whether the target supports __sync_fetch_and_or.
-AC_DEFUN([STARPU_CHECK_SYNC_FETCH_AND_OR], [
-  AC_CACHE_CHECK([whether the target supports __sync_fetch_and_or],
-		 ac_cv_have_sync_fetch_and_or, [
-  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
-			[bar = __sync_fetch_and_or(&foo, 1);])],
-			[ac_cv_have_sync_fetch_and_or=yes],
-			[ac_cv_have_sync_fetch_and_or=no])])
-  if test $ac_cv_have_sync_fetch_and_or = yes; then
-    AC_DEFINE(STARPU_HAVE_SYNC_FETCH_AND_OR, 1,
-	      [Define to 1 if the target supports __sync_fetch_and_or])
-  fi])
-
-dnl Check whether the target supports __sync_lock_test_and_set.
-AC_DEFUN([STARPU_CHECK_SYNC_LOCK_TEST_AND_SET], [
-  AC_CACHE_CHECK([whether the target supports __sync_lock_test_and_set],
-		 ac_cv_have_sync_lock_test_and_set, [
-  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
-			[bar = __sync_lock_test_and_set(&foo, 1);])],
-			[ac_cv_have_sync_lock_test_and_set=yes],
-			[ac_cv_have_sync_lock_test_and_set=no])])
-  if test $ac_cv_have_sync_lock_test_and_set = yes; then
-    AC_DEFINE(STARPU_HAVE_SYNC_LOCK_TEST_AND_SET, 1,
-	      [Define to 1 if the target supports __sync_lock_test_and_set])
-  fi])
-
-dnl Check whether the target supports __sync_synchronize.
-AC_DEFUN([STARPU_CHECK_SYNC_SYNCHRONIZE], [
-  AC_CACHE_CHECK([whether the target supports __sync_synchronize],
-		 ac_cv_have_sync_synchronize, [
-  AC_LINK_IFELSE([AC_LANG_PROGRAM(,
-			[__sync_synchronize();])],
-			[ac_cv_have_sync_synchronize=yes],
-			[ac_cv_have_sync_synchronize=no])])
-  if test $ac_cv_have_sync_synchronize = yes; then
-    AC_DEFINE(STARPU_HAVE_SYNC_SYNCHRONIZE, 1,
-	      [Define to 1 if the target supports __sync_synchronize])
-  fi])

+ 326 - 107
configure.ac

@@ -1,9 +1,9 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
-# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 # Copyright (C) 2011  Télécom-SudParis
-# Copyright (C) 2011  INRIA
+# Copyright (C) 2011, 2012  Institut National de Recherche en Informatique et Automatique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -16,20 +16,51 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
-AC_INIT([StarPU],0.9.2, [starpu-bugs@lists.gforge.inria.fr], starpu)
+AC_INIT([StarPU],1.0.0rc2, [starpu-devel@lists.gforge.inria.fr], starpu)
 AC_CONFIG_SRCDIR(include/starpu.h)
 AC_CONFIG_AUX_DIR([build-aux])
+
+dnl Versioning.
+
+STARPU_MAJOR_VERSION="`echo $PACKAGE_VERSION | cut -d . -f 1`"
+STARPU_MINOR_VERSION="`echo $PACKAGE_VERSION | cut -d . -f 2`"
+AC_SUBST([STARPU_MAJOR_VERSION])
+AC_SUBST([STARPU_MINOR_VERSION])
+AC_SUBST([STARPU_EFFECTIVE_VERSION])
+AC_DEFINE_UNQUOTED([STARPU_MAJOR_VERSION], [$STARPU_MAJOR_VERSION],
+  [Major version number of StarPU.])
+AC_DEFINE_UNQUOTED([STARPU_MINOR_VERSION], [$STARPU_MINOR_VERSION],
+  [Major version number of StarPU.])
+
+. "$srcdir/STARPU-VERSION"
+AC_SUBST([LIBSTARPU_INTERFACE_CURRENT])
+AC_SUBST([LIBSTARPU_INTERFACE_REVISION])
+AC_SUBST([LIBSTARPU_INTERFACE_AGE])
+AC_SUBST([LIBSTARPUMPI_INTERFACE_CURRENT])
+AC_SUBST([LIBSTARPUMPI_INTERFACE_REVISION])
+AC_SUBST([LIBSTARPUMPI_INTERFACE_AGE])
+AC_SUBST([LIBSTARPUFFT_INTERFACE_CURRENT])
+AC_SUBST([LIBSTARPUFFT_INTERFACE_REVISION])
+AC_SUBST([LIBSTARPUFFT_INTERFACE_AGE])
+AC_SUBST([LIBSOCL_INTERFACE_CURRENT])
+AC_SUBST([LIBSOCL_INTERFACE_REVISION])
+AC_SUBST([LIBSOCL_INTERFACE_AGE])
+
 AC_CANONICAL_SYSTEM
 
 dnl Automake 1.11 introduced `silent-rules' and `color-tests'.  Use them
 dnl when they're available.
 m4_ifdef([AM_SILENT_RULES],
-  [AM_INIT_AUTOMAKE([1.11 -Wall -Werror foreign silent-rules color-tests])],
+  [AM_INIT_AUTOMAKE([1.11 -Wall -Werror foreign silent-rules color-tests parallel-tests])],
   [AM_INIT_AUTOMAKE([1.10 -Wall -Werror foreign])])
 
+m4_ifdef([AM_SILENT_RULES],
+  [AM_SILENT_RULES(yes)])
+
 AC_PREREQ(2.60)
 
 AC_PROG_CC
+AC_PROG_CXX
 AC_PROG_CPP
 AC_PROG_SED
 AC_PROG_LN_S
@@ -61,13 +92,18 @@ AM_CONDITIONAL([STARPU_HAVE_WINDOWS], [test "x$starpu_windows" = "xyes"])
 # on Darwin, GCC targets i386 by default, so we don't have atomic ops
 AC_CHECK_SIZEOF([void *])
 SIZEOF_VOID_P=$ac_cv_sizeof_void_p
-if test x$SIZEOF_VOID_P = x4; then
-	case "$target" in
-	i386-*darwin*) CFLAGS+=" -march=i686 " ;;
-	esac
-fi
-
-
+case $SIZEOF_VOID_P in
+	4)
+		case "$target" in
+		i386-*darwin*) CFLAGS+=" -march=i686 " ;;
+		esac
+		STARPU_MS_LIB_ARCH=X86
+		;;
+	8)
+		STARPU_MS_LIB_ARCH=X64
+		;;
+esac
+AC_SUBST(STARPU_MS_LIB_ARCH)
 
 # This will be useful for program which use CUDA (and .cubin files) which need
 # some path to the CUDA code at runtime.
@@ -122,8 +158,14 @@ else
   AC_DEFINE([starpu_erand48_r(xsubi, buffer, result)],[do {*(result) = ((double)(rand()) / RAND_MAX);} while (0);],[erand48_r equivalent function])
 fi
 
+# Some systems do not define strerror_r
+AC_CHECK_FUNC([strerror_r], [AC_DEFINE([STARPU_HAVE_STRERROR_R], [1], [Define to 1 if the function strerro_r is available.])])
+
+# Some systems do not define unsetenv
+AC_CHECK_FUNC([unsetenv], [AC_DEFINE([STARPU_HAVE_UNSETENV], [1], [Define to 1 if the function unsetenv is available.])])
+
 # Define slow machine
-AC_ARG_ENABLE(slow-machine, [AS_HELP_STRING([--disable-slow-machine],
+AC_ARG_ENABLE(slow-machine, [AS_HELP_STRING([--enable-slow-machine],
 				   [Lower default values for the testcases run by make check])],
 				   enable_slow_machine=$enableval, enable_slow_machine=false)
 if  test x$enable_slow_machine = xyes; then
@@ -132,6 +174,8 @@ fi
 
 AC_CHECK_HEADERS([malloc.h], [AC_DEFINE([STARPU_HAVE_MALLOC_H], [1], [Define to 1 if you have the <malloc.h> header file.])])
 
+AC_CHECK_HEADERS([valgrind/valgrind.h], [AC_DEFINE([STARPU_HAVE_VALGRIND_H], [1], [Define to 1 if you have the <valgrind/valgrind.h> header file.])])
+
 # This defines HAVE_SYNC_VAL_COMPARE_AND_SWAP
 STARPU_CHECK_SYNC_VAL_COMPARE_AND_SWAP
 
@@ -198,7 +242,7 @@ AM_CONDITIONAL([STARPU_USE_SCHED_CTX_HYPERVISOR], [test "x$build_sched_ctx_hyper
 AC_MSG_CHECKING(maximum number of CPUs)
 AC_ARG_ENABLE(maxcpus, [AS_HELP_STRING([--enable-maxcpus=<number>],
 			[maximum number of CPUs])],
-			maxcpus=$enableval, maxcpus=16)
+			maxcpus=$enableval, maxcpus=64)
 AC_MSG_RESULT($maxcpus)
 AC_DEFINE_UNQUOTED(STARPU_MAXCPUS, [$maxcpus], [Maximum number of CPUs supported])
 
@@ -312,7 +356,7 @@ AC_DEFUN([STARPU_CHECK_CUDA_RUNTIME],
     __cuda_include_dir=$2
     __cuda_lib_dir=$3
 
-    if test "$__cuda_dir" != "no" ; then
+    if test "$__cuda_dir" != "no" -a "$__cuda_dir" != "" ; then
 	AC_MSG_CHECKING(whether CUDA RT is available in $__cuda_dir)
     else
 	AC_MSG_CHECKING(whether CUDA RT is available)
@@ -349,8 +393,8 @@ AC_DEFUN([STARPU_CHECK_CUDA_RUNTIME],
         if test "$have_valid_cuda" = "no" ; then
             if test "$3" = "no" -a "$__cuda_dir" != "no" ; then
                 __cuda_lib_dir="$__cuda_dir/lib64"
+		LDFLAGS="${SAVED_LDFLAGS} -L$__cuda_lib_dir"
 	        STARPU_CUDA_LDFLAGS="${SAVED_STARPU_CUDA_LDFLAGS} -L$__cuda_lib_dir"
-	        LDFLAGS="${SAVED_LDFLAGS} -L$__cuda_lib_dir"
 	        AC_HAVE_LIBRARY([cudart],[have_valid_cuda=yes],[have_valid_cuda=no])
                 unset ac_cv_lib_cudart_main
             fi
@@ -359,6 +403,7 @@ AC_DEFUN([STARPU_CHECK_CUDA_RUNTIME],
 
     if test "$have_valid_cuda" = "yes" ; then
         STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lcudart"
+	LDFLAGS="${SAVED_LDFLAGS} ${STARPU_CUDA_LDFLAGS}"
 	# we also check that CUBLAS is available
 	AC_HAVE_LIBRARY([cublas],[have_valid_cuda=yes],[have_valid_cuda=no])
         unset ac_cv_lib_cublas_main
@@ -379,7 +424,7 @@ AC_DEFUN([STARPU_CHECK_CUDA_RUNTIME],
 if test x$enable_cuda = xyes -o x$enable_cuda = xmaybe; then
     STARPU_CHECK_CUDA($cuda_dir, $cuda_lib_dir)
     if test "$have_valid_cuda" = "no" ; then
-        for f in "/usr/local/cuda" "/c/cuda" "/cygdrive/c/cuda" "/opt/cuda" "$CUDA_INC_PATH" "$CUDA_INSTALL_PATH"; do
+        for f in "/usr/local/cuda" "/c/cuda" "/cygdrive/c/cuda" "/opt/cuda" "$CUDA_INC_PATH/.." "$CUDA_INSTALL_PATH" "$CUDA_TOOLKIT"; do
             STARPU_CHECK_CUDA($f, "no")
             if test "$have_valid_cuda" = "yes" ; then
                 break
@@ -390,7 +435,7 @@ if test x$enable_cuda = xyes -o x$enable_cuda = xmaybe; then
     if test "$have_valid_cuda" = "yes" ; then
         STARPU_CHECK_CUDA_RUNTIME($cuda_dir, $cuda_include_dir, $cuda_lib_dir)
         if test "$have_valid_cuda" = "no" ; then
-            for f in "/usr/local/cuda" "/c/cuda" "/cygdrive/c/cuda" "/opt/cuda" "$CUDA_INC_PATH" "$CUDA_INSTALL_PATH"; do
+            for f in "/usr/local/cuda" "/c/cuda" "/cygdrive/c/cuda" "/opt/cuda" "$CUDA_INC_PATH/.." "$CUDA_INSTALL_PATH" "$CUDA_TOOLKIT"; do
                 STARPU_CHECK_CUDA_RUNTIME($f, "no", "no")
                 if test "$have_valid_cuda" = "yes" ; then
                     break
@@ -399,8 +444,24 @@ if test x$enable_cuda = xyes -o x$enable_cuda = xmaybe; then
         fi
     fi
 
+    # Check cuda is compatible with the C compiler
+    AC_MSG_CHECKING(whether CUDA is working)
+    if test "$have_valid_cuda" = "yes" ; then
+        SAVED_CPPFLAGS="${CPPFLAGS}"
+        CPPFLAGS="${CPPFLAGS} ${STARPU_CUDA_CPPFLAGS}"
+	AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
+		[[#include <cuda.h>]],
+		[[]]
+		),
+	    [have_valid_cuda="yes"],
+	    [have_valid_cuda="no"]
+	])
+        CPPFLAGS="${SAVED_CPPFLAGS}"
+    fi
+    AC_MSG_RESULT($have_valid_cuda)
+
     # in case CUDA was explicitely required, but is not available, this is an error
-    if test x$enable_cuda = xyes -a x$have_valid_cuda = no; then
+    if test x$enable_cuda = xyes -a x$have_valid_cuda = xno; then
 	AC_MSG_ERROR([cannot find CUDA])
     fi
     # now we enable CUDA if and only if a proper setup is available
@@ -609,21 +670,28 @@ AC_ARG_WITH(opencl-lib-dir,
 		enable_opencl=yes
 	], [opencl_lib_dir=no])
 
-if test x$enable_opencl = xyes -o x$enable_opencl = xmaybe; then
-    	STARPU_CHECK_OPENCL($opencl_dir, $opencl_include_dir, $opencl_lib_dir)
-        if test "$have_valid_opencl" = "no" ; then
-            for f in "/usr/local/cuda" "/c/cuda" "/cygdrive/c/cuda" "/opt/cuda" "$CUDA_INC_PATH" "$CUDA_INSTALL_PATH" ; do
-                if test -n $f ; then
-    	            STARPU_CHECK_OPENCL($f, "no", "no")
-                    if test "$have_valid_opencl" = "yes" ; then
-                        break
-                    fi
-                fi
-            done
-        fi
+AC_DEFUN([STARPU_LOOK_FOR_OPENCL],
+[
+    	if test "x$has_opencl_being_checked" != "xyes" ; then
+    	    STARPU_CHECK_OPENCL($opencl_dir, $opencl_include_dir, $opencl_lib_dir)
+	    if test "$have_valid_opencl" = "no" ; then
+            	for f in "/usr/local/cuda" "/c/cuda" "/cygdrive/c/cuda" "/opt/cuda" "$CUDA_INC_PATH/.." "$CUDA_INSTALL_PATH" "$CUDA_TOOLKIT"; do
+		    if test -n $f ; then
+    			STARPU_CHECK_OPENCL($f, "no", "no")
+			if test "$have_valid_opencl" = "yes" ; then
+			    break
+			fi
+		    fi
+		done
+	    fi
+	    has_opencl_being_checked=yes
+	fi
+])
 
+if test x$enable_opencl = xyes -o x$enable_opencl = xmaybe; then
+	STARPU_LOOK_FOR_OPENCL()
 	# in case OpenCL was explicitely required, but is not available, this is an error
-	if test x$enable_opencl = xyes -a x$have_valid_opencl = no; then
+	if test x$enable_opencl = xyes -a x$have_valid_opencl = xno; then
 	    AC_MSG_ERROR([cannot find OpenCL])
 	fi
 
@@ -684,7 +752,7 @@ if test x$enable_gordon = xyes -o x$enable_gordon = xmaybe; then
 	# AC_CHECK_FUNC(gordon_init, [gordon], [have_valid_gordon=no])
 
 	# in case Gordon was explicitely required, but is not available, this is an error
-	if test x$enable_gordon = xyes -a x$have_valid_gordon = no; then
+	if test x$enable_gordon = xyes -a x$have_valid_gordon = xno; then
 		AC_MSG_ERROR([cannot find Gordon])
 	fi
 
@@ -727,6 +795,7 @@ AC_MSG_RESULT($enable_debug)
 
 if test x$enable_debug = xyes; then
 	CFLAGS="$CFLAGS -O0"
+	AC_DEFINE(STARPU_SPINLOCK_CHECK, [1], [check spinlock use])
 else
 	CFLAGS="$CFLAGS -O3"
 fi
@@ -741,6 +810,14 @@ if test x$enable_fast = xyes; then
 	AC_DEFINE(STARPU_NO_ASSERT, [1], [disable assertions])
 fi
 
+AC_MSG_CHECKING(whether memory status should be displayed)
+AC_ARG_ENABLE(memory-status, [AS_HELP_STRING([--enable-memory-status],
+			     [display memory status at the end of execution])],
+			     enable_memory_status=$enableval, enable_memory_status=no)
+AC_MSG_RESULT($enable_memory_status)
+if test x$enable_memory_status = xyes; then
+        AC_DEFINE(STARPU_MEMORY_STATUS, [1], [display memory status])
+fi
 
 
 AC_MSG_CHECKING(whether debug messages should be displayed)
@@ -927,7 +1004,7 @@ AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of worker
 AC_MSG_CHECKING(maximum number of implementations)
 AC_ARG_ENABLE(maximplementations, [AS_HELP_STRING([--enable-maximplementations=<number>],
 		[maximum number of implementations])],
-		maximplementations=$enableval, maximplementations=1)
+		maximplementations=$enableval, maximplementations=4)
 AC_MSG_RESULT($maximplementations)
 AC_DEFINE_UNQUOTED(STARPU_MAXIMPLEMENTATIONS, [$maximplementations],
 		[maximum number of implementations])
@@ -1031,45 +1108,63 @@ fi
 #                                                                             #
 ###############################################################################
 
-build_starpu_top=no
-AC_PATH_PROGS([QMAKE], [qmake-qt4 qmake], [not-found])
-if test x$QMAKE != xnot-found; then
-	QMAKE_VERSION=`$QMAKE --version 2>&1 | head -n 1 | cut -d '.' -f 1 | cut -d ' ' -f 3`
-	if test $QMAKE_VERSION -ge 2 ; then
-		PKG_CHECK_EXISTS([QtGui QtNetwork QtOpenGL QtSql], [
-			QT_MAJVERSION=`$PKG_CONFIG --modversion QtGui | cut -d '.' -f 1`
-			QT_MINVERSION=`$PKG_CONFIG --modversion QtGui | cut -d '.' -f 2`
-			if test $QT_MAJVERSION -gt 4 -o \( $QT_MAJVERSION -eq 4 -a $QT_MINVERSION -ge 7 \) ; then
-				build_starpu_top=yes
-			fi
-			QWT_PRI=embed
-			AC_ARG_WITH(qwt-include-dir,
-				[AS_HELP_STRING([--with-qwt-include-dir=<path>],
-				[specify installed libqwt include path])],
-				[
-					STARPU_QWT_CPPFLAGS="-I$withval"
-					AC_SUBST(STARPU_QWT_CPPFLAGS)
-					QWT_PRI=system
-				])
-			AC_ARG_WITH(qwt-lib-dir,
-				[AS_HELP_STRING([--with-qwt-lib-dir=<path>],
-				[specify installed libqwt library path])],
-				[
-					STARPU_QWT_LDFLAGS="-L$withval"
-					QWT_PRI=system
-				])
-			AC_ARG_WITH(qwt-lib,
-				[AS_HELP_STRING([--with-qwt-lib=<path>],
-				[specify installed libqwt library name])],
-				[
-					STARPU_QWT_LDFLAGS="${STARPU_QWT_LDFLAGS} -l$withval"
-					QWT_PRI=system
-				])
-			AC_SUBST(QWT_PRI)
-		])
+AC_ARG_ENABLE([starpu-top],
+  [AS_HELP_STRING([--disable-starpu-top],
+    [build StarPU-Top])],
+  [enable_starpu_top="no"],
+  [enable_starpu_top="maybe"])
+
+# Check whether StarPU-Top can be built
+AC_MSG_CHECKING(for StarPU-Top)
+
+if test "x$enable_starpu_top" = "xmaybe" ; then
+	can_build_starpu_top=no
+	AC_PATH_PROGS([QMAKE], [qmake-qt4 qmake], [not-found])
+	if test x$QMAKE != xnot-found; then
+		QMAKE_VERSION=`$QMAKE --version 2>&1 | head -n 1 | cut -d '.' -f 1 | cut -d ' ' -f 3`
+		if test $QMAKE_VERSION -ge 2 ; then
+			PKG_CHECK_EXISTS([QtGui QtNetwork QtOpenGL QtSql], [
+				QT_MAJVERSION=`$PKG_CONFIG --modversion QtGui | cut -d '.' -f 1`
+				QT_MINVERSION=`$PKG_CONFIG --modversion QtGui | cut -d '.' -f 2`
+				if test $QT_MAJVERSION -gt 4 -o \( $QT_MAJVERSION -eq 4 -a $QT_MINVERSION -ge 7 \) ; then
+					can_build_starpu_top=yes
+				fi
+				QWT_PRI=embed
+				AC_ARG_WITH(qwt-include-dir,
+					[AS_HELP_STRING([--with-qwt-include-dir=<path>],
+					[specify installed libqwt include path])],
+					[
+						STARPU_QWT_INCLUDE="$withval"
+						AC_SUBST(STARPU_QWT_INCLUDE)
+						QWT_PRI=system
+					])
+				AC_ARG_WITH(qwt-lib-dir,
+					[AS_HELP_STRING([--with-qwt-lib-dir=<path>],
+					[specify installed libqwt library path])],
+					[
+						STARPU_QWT_LDFLAGS="-L$withval"
+						QWT_PRI=system
+					])
+				AC_ARG_WITH(qwt-lib,
+					[AS_HELP_STRING([--with-qwt-lib=<name>],
+					[specify installed libqwt library name])],
+					[
+						STARPU_QWT_LDFLAGS="${STARPU_QWT_LDFLAGS} -l$withval"
+						QWT_PRI=system
+					])
+				AC_SUBST(STARPU_QWT_LDFLAGS)
+				AC_SUBST(QWT_PRI)
+			])
+		fi
 	fi
 fi
 
+if test "x$enable_starpu_top" = "xmaybe" ; then
+  build_starpu_top=$can_build_starpu_top
+else
+  build_starpu_top=no
+fi
+
 AM_CONDITIONAL(BUILD_STARPU_TOP, test x$build_starpu_top = xyes)
 
 ###############################################################################
@@ -1088,7 +1183,7 @@ AC_DEFUN([IS_SUPPORTED_CFLAG],
 	AC_MSG_CHECKING([whether compiler support $1])
 
 	SAVED_CFLAGS="$CFLAGS"
-	CFLAGS="$1 -we10006"
+	CFLAGS="$1" # -we10006"
 
 	AC_COMPILE_IFELSE(
 		AC_LANG_PROGRAM(
@@ -1117,6 +1212,11 @@ if test "x$STARPU_DEVEL" != x; then
 	AC_DEFINE(STARPU_DEVEL, [1], [enable developer warnings])
 fi
 
+# Same value as Automake's, for use in other places.
+pkglibdir="\${libdir}/$PACKAGE"
+AC_SUBST([pkglibdir])
+
+
 ###############################################################################
 #                                                                             #
 #                               GCC extensions                                #
@@ -1127,61 +1227,114 @@ AC_ARG_ENABLE([gcc-extensions],
   [AS_HELP_STRING([--enable-gcc-extensions],
     [build the GCC plug-in that provides C language extensions (experimental)])],
   [enable_gcc_plugin="$enableval"],
-  [enable_gcc_plugin="no"])
+  [enable_gcc_plugin="maybe"])
 
-if test "x$enable_gcc_plugin" = "xyes"; then
-   STARPU_GCC_PLUGIN_SUPPORT
+if test "x$enable_gcc_plugin" = "xyes" -o "x$enable_gcc_plugin" = "xmaybe" ; then
+    STARPU_GCC_PLUGIN_SUPPORT
 
-   if test "x$ac_cv_have_gcc_plugins" != "xyes"; then
-     AC_MSG_ERROR([This compiler lacks GCC plug-in support.])
-   fi
+    if test "x$ac_cv_have_gcc_plugins" = "xno" ; then
+        if test "x$enable_gcc_plugin" = "xyes" ; then
+    	    # Since this was explicitly asked for, error out.
+            AC_MSG_ERROR([This compiler lacks GCC plug-in support.])
+	else
+	    AC_MSG_WARN([GCC plug-ins not supported; StarPU's GCC plug-in will not be built])
+        fi
+    else
+        # What GCC version are we using?
+        STARPU_GCC_VERSION
+
+        # The `.so' itself cannot be called `starpu-gcc.so' (because
+	# `-fplugin-arg-' option names and such must match the `.so'
+	# name), so use a meaningful directory name.
+	gccplugindir="\${pkglibdir}/${STARPU_EFFECTIVE_VERSION}/gcc/${STARPU_GCC_VERSION_MAJOR}.${STARPU_GCC_VERSION_MINOR}"
+	AC_SUBST([gccplugindir])
+
+	# Lines to be inserted in the `.pc' file.
+	GCC_PLUGIN_DIR_PKGCONFIG="gccplugindir=$gccplugindir"
+	GCC_PLUGIN_PKGCONFIG="gccplugin=\${gccplugindir}/starpu.so"
+	AC_SUBST([GCC_PLUGIN_DIR_PKGCONFIG])
+	AC_SUBST([GCC_PLUGIN_PKGCONFIG])
+    fi
+fi
 
-   build_gcc_plugin="yes"
 
-   # GNU Guile 1.8/2.0 is used to run the test suite.
-   AC_PATH_PROG([GUILE], [guile])
-   if test "x$GUILE" != "x"; then
-      run_gcc_plugin_test_suite="yes"
-   else
-      run_gcc_plugin_test_suite="no"
-   fi
+if test "x$ac_cv_have_gcc_plugins" = "xyes" ; then
+    build_gcc_plugin="yes"
+
+    # GNU Guile 1.8/2.0 is used to run the test suite.
+    AC_PATH_PROG([GUILE], [guile])
+    if test "x$GUILE" != "x"; then
+        if test "x$enable_cpu" = "xyes"; then
+	   run_gcc_plugin_test_suite="yes"
+	else
+	   AC_MSG_WARN([CPU back-end disabled; GCC plug-in test suite will not be run])
+	   run_gcc_plugin_test_suite="no"
+	fi
+    else
+	run_gcc_plugin_test_suite="no"
+    fi
 else
-   build_gcc_plugin="no"
-   run_gcc_plugin_test_suite="no"
+    build_gcc_plugin="no"
+    run_gcc_plugin_test_suite="no"
 fi
 
 # Bison is used to generate the C expression parser.  The generated
 # parser is part of the distribution, though.
-AC_PROG_YACC
+AM_MISSING_PROG([YACC], [bison])
 
 AM_CONDITIONAL([BUILD_GCC_PLUGIN], [test "x$build_gcc_plugin" = "xyes"])
 AM_CONDITIONAL([HAVE_GUILE], [test "x$GUILE" != "x"])
 
 ###############################################################################
 #                                                                             #
-#                               OpenCL interface                              #
+#                               SOCL interface                                #
 #                                                                             #
 ###############################################################################
 
 AC_ARG_ENABLE([socl],
   [AS_HELP_STRING([--enable-socl],
-    [build the OpenCL interface (SOCL)])],
+    [build the OpenCL interface (experimental)])],
   [enable_socl="$enableval"],
-  [enable_socl="no"])
+  [enable_socl="maybe"])
 
-if test "x$enable_socl" = "xyes"; then
-   STARPU_SOCL_SUPPORT
-   build_socl="yes"
+AC_MSG_CHECKING(for SOCL)
+
+if test "x$enable_socl" = "xyes" -o "x$enable_socl" = "xmaybe" ; then
+    if test "$have_valid_opencl" = "no" ; then
+	STARPU_LOOK_FOR_OPENCL()
+    fi
+fi
+
+# in case SOCL was explicitely required, but is not available, this is an error
+if test "x$enable_socl" = "xyes" -a "$have_valid_opencl" = "no" ; then
+    AC_MSG_ERROR([SOCL cannot be enabled without OpenCL])
+fi
+
+# now we enable SOCL if and only if a proper setup is available
+if test "x$enable_socl" = "xyes" -o "x$enable_socl" = "xmaybe" ; then
+   build_socl=$have_valid_opencl
 else
-   build_socl="no"
-   run_socl_test_suite="no"
+   build_socl=no
 fi
 
+AC_MSG_RESULT($build_socl)
 AM_CONDITIONAL([BUILD_SOCL], [test "x$build_socl" = "xyes"])
 AM_CONDITIONAL([STARPU_USE_SOCL], [test "x$build_socl" = "xyes"])
 
 ###############################################################################
 #                                                                             #
+#                                 Debugging                                   #
+#                                                                             #
+###############################################################################
+
+AC_PATH_PROG([GDB], [gdb], [not-found])
+if test "x$GDB" != "xnot-found"; then
+   AC_DEFINE_UNQUOTED([STARPU_GDB_PATH], ["$GDB"],
+     [Path to the GNU debugger.])
+fi
+
+###############################################################################
+#                                                                             #
 #                                  Examples                                   #
 #                                                                             #
 ###############################################################################
@@ -1203,10 +1356,10 @@ AC_SUBST(STARPU_OPENGL_RENDER, $enable_opengl_render)
 AC_MSG_RESULT($enable_opengl_render)
 
 AC_PATH_XTRA
-if test "x$x_includes" != "xNONE"; then
+if test "x$no_x" != "xyes"; then
 	AC_DEFINE(STARPU_HAVE_X11, [1], [enable X11])
 fi
-AM_CONDITIONAL([HAVE_X11], [test "x$x_includes" != "xNONE"])
+AM_CONDITIONAL([HAVE_X11], [test "x$no_x" != "xyes"])
 
 # In case there are BLAS kernels that are used by the example applications
 # we may specify which library to use. Note that this is not used for StarPU
@@ -1330,6 +1483,11 @@ AC_SUBST(BLAS_LIB,$blas_lib)
 have_fftw=no
 have_fftwf=no
 have_fftwl=no
+fft_support=no
+
+AC_ARG_ENABLE(starpufft, [AS_HELP_STRING([--disable-starpufft],
+			[Disable build of StarPU-FFT])],
+			enable_starpufft=$enableval,enable_starpufft=yes)
 
 PKG_CHECK_MODULES([FFTW],  [fftw3],  [
   AC_DEFINE([STARPU_HAVE_FFTW], [1], [Define to 1 if you have the libfftw3 library.])
@@ -1337,7 +1495,7 @@ PKG_CHECK_MODULES([FFTW],  [fftw3],  [
   have_fftw=yes
 ], [:])
 AM_CONDITIONAL(STARPU_HAVE_FFTW, [test x$have_fftw = xyes])
- 
+
 PKG_CHECK_MODULES([FFTWF], [fftw3f], [
   AC_DEFINE([STARPU_HAVE_FFTWF], [1], [Define to 1 if you have the libfftw3f library.])
   AC_SUBST([STARPU_HAVE_FFTWF], [1])
@@ -1352,6 +1510,11 @@ PKG_CHECK_MODULES([FFTWL], [fftw3l], [
 ], [:])
 AM_CONDITIONAL(STARPU_HAVE_FFTWL, [test x$have_fftwl = xyes])
 
+if test x$enable_starpufft = xyes -a \( \( x$enable_cpu = xyes -a x$have_fftw = xyes -a x$have_fftwf = xyes \) -o x$have_cufftdoublecomplex = xyes \); then
+   fft_support=yes
+fi
+AM_CONDITIONAL(BUILD_STARPUFFT, [test x$fft_support = xyes])
+
 ##########################################
 # hwloc                                  #
 ##########################################
@@ -1407,10 +1570,56 @@ AC_ARG_ENABLE(optional_tests, [AS_HELP_STRING([--optional-tests],
 AC_MSG_RESULT($want_optional_tests)
 AM_CONDITIONAL([COND_OPT], [test "$want_optional_tests" = yes])
 
+# Check if icc is available
+AC_CHECK_PROGS([ICC], [icc])
+
+# If cuda and icc are both available, check they are compatible
+if test "$enable_cuda" = "yes" -a "$ICC" != ""; then
+   AC_MSG_CHECKING(whether CUDA and ICC are compatible)
+   OLD_CC="$CC"
+   CC="$ICC"
+   AC_COMPILE_IFELSE(
+       AC_LANG_PROGRAM(
+	   [[#include <cuda.h>]],
+	   [[]]
+	   ),
+       AC_MSG_RESULT(yes),
+       [ICC=""
+           AC_MSG_RESULT(no)]
+   )
+   CC="$OLD_CC"
+fi
+
+# Disable ICC on windows
+if test "x$ICC" != "x" -a "$starpu_windows" = "yes" ; then
+    ICC=""
+fi
+if test "x$ICC" != "x"; then
+  AC_DEFINE(STARPU_HAVE_ICC, [], [Define this if icc is available])
+fi
+AM_CONDITIONAL([STARPU_HAVE_ICC], [test "x$ICC" != "x"])
+
+# Do not generate manpages for the tools if we do not have help2man
+AC_CHECK_PROGS([HELP2MAN], [help2man])
+# Disable on windows
+if test "$starpu_windows" = "yes" ; then
+    HELP2MAN=""
+fi
+AM_CONDITIONAL([STARPU_HAVE_HELP2MAN], [test "x$HELP2MAN" != "x"])
+
+AC_CHECK_MEMBER([struct cudaDeviceProp.pciDomainID],
+  AC_DEFINE([STARPU_HAVE_DOMAINID],[1],[Define to 1 if CUDA device properties include DomainID]),
+  , [[#include <cuda_runtime_api.h>]])
+
+AC_CHECK_MEMBER([struct cudaDeviceProp.pciBusID],
+  AC_DEFINE([STARPU_HAVE_BUSID],[1],[Define to 1 if CUDA device properties include BusID]),
+  , [[#include <cuda_runtime_api.h>]])
+
 # File configuration
 AC_CONFIG_COMMANDS([executable-scripts], [
   chmod +x tests/regression/regression.sh
   chmod +x gcc-plugin/tests/run-test
+  chmod +x tools/starpu_workers_activity
 ])
 
 AC_CONFIG_FILES(tests/regression/regression.sh tests/regression/profiles tests/regression/profiles.build.only)
@@ -1420,19 +1629,27 @@ AC_OUTPUT([
 	Makefile
 	src/Makefile
 	tools/Makefile
+	tools/starpu_workers_activity
 	socl/Makefile
 	socl/src/Makefile
+	socl/examples/Makefile
+        socl/socl-1.0.pc
 	libstarpu.pc
+	starpu-1.0.pc
+	mpi/libstarpumpi.pc
+	mpi/starpumpi-1.0.pc
+	starpufft/Makefile
+	starpufft/libstarpufft.pc
+	starpufft/starpufft-1.0.pc
 	examples/Makefile
         examples/opt/Makefile
-	examples/starpufft/Makefile
 	examples/stencil/Makefile
-	examples/socl/Makefile
 	tests/Makefile
         tests/opt/Makefile
 	doc/Makefile
 	mpi/Makefile
 	starpu-top/StarPU-Top.pro
+	starpu-top/StarPU-Top-qwt-system.pri
         gcc-plugin/Makefile
 	gcc-plugin/src/Makefile
 	gcc-plugin/tests/Makefile
@@ -1450,9 +1667,6 @@ AC_MSG_NOTICE([
 	OpenCL enabled: $enable_opencl
 	Cell   enabled: $enable_gordon
 
-	GCC plug-in: $build_gcc_plugin
-	GCC plug-in test suite: $run_gcc_plugin_test_suite
-
 	Compile-time limits
 	(change these with --enable-maxcpus, --enable-maxcudadev,
 	--enable-maxopencldev, --enable-maxbuffers)
@@ -1466,12 +1680,17 @@ AC_MSG_NOTICE([
 	GPU-GPU transfers: $have_cuda_memcpy_peer
 	Allocation cache:  $enable_allocation_cache
 
-	MPI enabled:   $use_mpi
-	SOCL enabled:  $build_socl
 	Magma enabled: $have_magma
 	BLAS library:  $blas_lib
 	hwloc:         $have_valid_hwloc
-
 	FxT trace enabled: $use_fxt
 	StarPU-Top:        $build_starpu_top
+
+	StarPU Extensions:
+	       MPI enabled:   $use_mpi
+	       MPI test suite: $running_mpi_check
+	       FFT Support: $fft_support
+	       GCC plug-in: $build_gcc_plugin
+	       GCC plug-in test suite: $run_gcc_plugin_test_suite
+	       SOCL enabled:  $build_socl
 ])
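
Among the configure changes above, --enable-maximplementations now defaults to 4, matching the ChangeLog entry that lets an application register several implementations of a codelet for the same architecture. A hedged sketch of what that looks like with the 1.0 API (not code from this commit):

#include <starpu.h>

static void kernel_ref(void *buffers[], void *cl_arg)
{
	/* reference implementation */
	(void) buffers; (void) cl_arg;
}

static void kernel_opt(void *buffers[], void *cl_arg)
{
	/* hypothetical optimised variant of the same kernel */
	(void) buffers; (void) cl_arg;
}

/* Up to STARPU_MAXIMPLEMENTATIONS variants per architecture (default raised
 * to 4 by this commit); the scheduler chooses among them, e.g. guided by
 * performance models. */
static struct starpu_codelet cl =
{
	.cpu_funcs = { kernel_ref, kernel_opt, NULL },
	.nbuffers = 1,
	.modes = { STARPU_RW },
};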

+ 2 - 2
libstarpu.pc.in

@@ -6,8 +6,8 @@ includedir=@includedir@
 Name: starpu
 Description: offers support for heterogeneous multicore architecture
 Version: @PACKAGE_VERSION@
-Cflags: -I${includedir} @STARPU_CUDA_CPPFLAGS@
-Libs: -L${libdir} -lstarpu @STARPU_CUDA_LDFLAGS@ @STARPU_OPENCL_LDFLAGS@ @STARPU_SCHED_CTX_HYPERVISOR@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@ -DSTARPU_USE_DEPRECATED_API
+Libs: -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_LDFLAGS@ @STARPU_OPENCL_LDFLAGS@ @STARPU_SCHED_CTX_HYPERVISOR@
 Libs.private: @LDFLAGS@ @LIBS@
 Requires: @HWLOC_REQUIRES@
 Requires.private: @GORDON_REQUIRES@

+ 35 - 0
starpu-1.0.pc.in

@@ -0,0 +1,35 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+pkglibdir=@pkglibdir@
+includedir=@includedir@
+
+# When the GCC plug-in is available, the following lines indicate
+# where it is installed.
+@GCC_PLUGIN_DIR_PKGCONFIG@
+@GCC_PLUGIN_PKGCONFIG@
+
+Name: starpu
+Description: offers support for heterogeneous multicore architecture
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@
+Libs: -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_LDFLAGS@ @STARPU_OPENCL_LDFLAGS@ @STARPU_SCHED_CTX_HYPERVISOR@
+Libs.private: @LDFLAGS@ @LIBS@
+Requires: @HWLOC_REQUIRES@
+Requires.private: @GORDON_REQUIRES@

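Both pkg-config files now point at the versioned include directory and the versioned library name, so a client of this layout builds with `cc hello_starpu.c $(pkg-config --cflags --libs starpu-1.0)`; the legacy `libstarpu` module additionally injects -DSTARPU_USE_DEPRECATED_API for old code. A minimal, hedged C sketch of such a client follows; the file name hello_starpu.c is illustrative, and only starpu_init(), starpu_shutdown() and STARPU_CHECK_RETURN_VALUE(), all used elsewhere in this commit, are assumed.

/* hello_starpu.c -- build with:
 *   cc hello_starpu.c $(pkg-config --cflags --libs starpu-1.0)
 */
#include <starpu.h>

int main(void)
{
	int ret = starpu_init(NULL);	/* default configuration */
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

	/* ... submit tasks here ... */

	starpu_shutdown();		/* terminate the workers */
	return 0;
}
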
+ 1 - 1
starpu-top/StarPU-Top-common.pri

@@ -28,7 +28,7 @@ SOURCES += $$SRCDIR/main.cpp \
     $$SRCDIR/aboutdialog.cpp
 HEADERS += $$SRCDIR/mainwindow.h \
 #STARPU-TOP
-    $$SRCDIR/starputoptypes.h \
+    $$SRCDIR/starpu_top_types.h \
     $$SRCDIR/widgetwindowsmanager.h \
     $$SRCDIR/configurationmanager.h \
     $$SRCDIR/communicationthread.h \

+ 0 - 2
starpu-top/StarPU-Top-qwt-system.pri

@@ -1,2 +0,0 @@
-LIBS += -lqwt-qt4
-INCLUDEPATH += /usr/include/qwt-qt4

+ 2 - 0
starpu-top/StarPU-Top-qwt-system.pri.in

@@ -0,0 +1,2 @@
+LIBS += @STARPU_QWT_LDFLAGS@
+INCLUDEPATH += @STARPU_QWT_INCLUDE@

+ 1 - 1
starpu-top/aboutdialog.ui

@@ -112,7 +112,7 @@
       <string/>
      </property>
      <property name="pixmap">
-      <pixmap resource="resources.qrc">:/images/starputop.png</pixmap>
+      <pixmap resource="resources.qrc">:/images/starpu_top.png</pixmap>
      </property>
      <property name="scaledContents">
       <bool>true</bool>

+ 3 - 3
starpu-top/communicationmanager.cpp

@@ -70,7 +70,7 @@ void CommunicationManager::initializeSession()
 {
     _dataDescriptions = new QList<DataDescription*> ();
     _paramDescriptions = new QList<ParamDescription*> ();
-    _serverDevices = new QList<StarputopDevice> ;
+    _serverDevices = new QList<starpu_top_device> ;
 
     _serverInfoMsgCount = 0;
     _state = COM_STATE_INIT;
@@ -665,7 +665,7 @@ void CommunicationManager::parseInitDevMessage(QString messageString)
         Q_ASSERT_X(ok == true, "CommunicationManager::parseInitDevMessage()",
                    "Bogus message received in INIT DEV");
 
-        StarputopDeviceType deviceType;
+        starpu_top_device_type deviceType;
 
         Q_ASSERT_X(
                 deviceTypeString.compare(
@@ -701,7 +701,7 @@ void CommunicationManager::parseInitDevMessage(QString messageString)
             deviceType = SERVERDEVICE_GORDON;
         }
 
-        StarputopDevice device;
+        starpu_top_device device;
         device.id = deviceId;
         device.type = deviceType;
         device.name = deviceNameString;

+ 3 - 3
starpu-top/communicationmanager.h

@@ -27,7 +27,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #define COMMUNICATIONMANAGER_H
 
 #include <QTcpSocket>
-#include "starputoptypes.h"
+#include "starpu_top_types.h"
 
 class CommunicationManager : public QTcpSocket
 { /* Receives protocol messages from server, parses them
@@ -54,7 +54,7 @@ private:
     qlonglong _serverTimestamp;
     QList<DataDescription*> *_dataDescriptions;
     QList<ParamDescription*> *_paramDescriptions;
-    QList<StarputopDevice> *_serverDevices;
+    QList<starpu_top_device> *_serverDevices;
     // Communication states
     CommunicationState _state;
     bool _initServerInfoCompleted;
@@ -125,7 +125,7 @@ signals:
     void serverInitCompleted(QString serverID,
                              QList<DataDescription*> *dataDescriptions,
                              QList<ParamDescription*> *paramDescriptions,
-                             QList<StarputopDevice> *serverDevices);
+                             QList<starpu_top_device> *serverDevices);
     // Notify GUI with a protocol message
     // Protocol error
     void protocolError(QString errorMessage);

+ 3 - 3
starpu-top/communicationthread.cpp

@@ -30,7 +30,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include "configurationmanager.h"
 #include "mainwindow.h"
 #include "communicationmanager.h"
-#include "starputoptypes.h"
+#include "starpu_top_types.h"
 
 const int MAX_CONNECTION_ATTEMPTS = 10;
 
@@ -103,12 +103,12 @@ void CommunicationThread::createNewCommunicationManager(void)
                      SIGNAL(serverInitCompleted(QString,
                                                 QList<DataDescription*>*,
                                                 QList<ParamDescription*>*,
-                                                QList<StarputopDevice>*)),
+                                                QList<starpu_top_device>*)),
                      _mainWindow, SLOT(initClient(
                              QString,
                              QList<DataDescription*>*,
                              QList<ParamDescription*>*,
-                             QList<StarputopDevice>*)));
+                             QList<starpu_top_device>*)));
     // Output data
     QObject::connect(_mainWindow, SIGNAL(clientLaunched()),
                      _communicationManager, SLOT(sendGoMessage()));

+ 1 - 1
starpu-top/configurationmanager.h

@@ -29,7 +29,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include <QSettings>
 
 static const QString CONFIG_FILE_DIR = ".";
-static const QString CONFIG_FILE_NAME = "starputop.cfg";
+static const QString CONFIG_FILE_NAME = "starpu_top.cfg";
 
 class ConfigurationManager
 { /* Contains and manages all the application settings

+ 1 - 1
starpu-top/dataaggregatorwidget.h

@@ -34,7 +34,7 @@ class QwtPlot;
 
 #include <QHash>
 #include <QAction>
-#include "starputoptypes.h"
+#include "starpu_top_types.h"
 #include "abstractwidgetwindow.h"
 
 class DataAggregatorWidget : public AbstractWidgetWindow

+ 1 - 1
starpu-top/datawidget.h

@@ -31,7 +31,7 @@ class WidgetWindowsManager;
 class QwtPlotCurve;
 class QwtPlot;
 
-#include "starputoptypes.h"
+#include "starpu_top_types.h"
 #include "abstractwidgetwindow.h"
 
 class DataWidget : public AbstractWidgetWindow

+ 2 - 4
starpu-top/extradist

@@ -9,9 +9,8 @@ EXTRA_DIST	+=	\
                 starpu-top/abstractwidgetwindow.cpp     \
                 starpu-top/communicationthread.h        \
                 starpu-top/configurationmanager.cpp     \
-                starpu-top/starputoptypes.h             \
+                starpu-top/starpu_top_types.h             \
                 starpu-top/mainwindow.ui                \
-                starpu-top/debug                        \
                 starpu-top/mainwindow.cpp               \
                 starpu-top/sessionsetupmanager.cpp      \
                 starpu-top/resources.qrc                \
@@ -19,7 +18,7 @@ EXTRA_DIST	+=	\
                 starpu-top/images/connect.png           \
                 starpu-top/images/debugon.png           \
                 starpu-top/images/help.png              \
-                starpu-top/images/starputop.png         \
+                starpu-top/images/starpu_top.png         \
                 starpu-top/images/widget.png            \
                 starpu-top/images/lock.png              \
                 starpu-top/images/about.png             \
@@ -45,7 +44,6 @@ EXTRA_DIST	+=	\
                 starpu-top/debugconsole.ui                      \
                 starpu-top/dataaggregatorwidget.cpp             \
                 starpu-top/datawidget.cpp                       \
-                starpu-top/release                              \
                 starpu-top/datawidget.h                         \
                 starpu-top/debugconsole.cpp                     \
                 starpu-top/ganttwidget.h                        \

+ 9 - 9
starpu-top/ganttwidget.cpp

@@ -469,7 +469,7 @@ void GanttWidget::drawFromTime(QPainter *painter, qlonglong timestamp)
         borneBefore = 0;
     }
     _tasks = _taskManager->tasks(borneBefore, _timePresent);
-    foreach(StarputopTask t, _tasks)
+    foreach(starpu_top_task t, _tasks)
     {
 	drawWorkPU(painter,t);
     }
@@ -526,7 +526,7 @@ void GanttWidget::drawIdlePU(QPainter *painter)
 }
 
 /* draw forecasted working time for each processor */
-void GanttWidget::drawPrevWorkPU(QPainter *painter, StarputopTask t)
+void GanttWidget::drawPrevWorkPU(QPainter *painter, starpu_top_task t)
 {
     int starty = HEIGHT_TIME_AXIS + MARGIN;
     int widthAllowed = size().width() - WIDTH_PROGRAM - MARGIN
@@ -623,7 +623,7 @@ void GanttWidget::drawPrevWorkPU(QPainter *painter, StarputopTask t)
 we don't have to test whether they are displayable. We just have to calculate
  which part of time is displayable.
  The task t has its begin or its end between time Before and timePresent */
-void GanttWidget::drawWorkPU(QPainter *painter, StarputopTask t)
+void GanttWidget::drawWorkPU(QPainter *painter, starpu_top_task t)
 {
     int starty = HEIGHT_TIME_AXIS + MARGIN;
     int widthAllowed = size().width() - WIDTH_PROGRAM - MARGIN
@@ -783,12 +783,12 @@ void GanttWidget::countPUs()
     _numPUs = length;
     delete _PUsByDevice;
     delete _PUsByPos;
-    _PUsByDevice = new StarputopDevice[length];
-    _PUsByPos = new StarputopDevice[length];
+    _PUsByDevice = new starpu_top_device[length];
+    _PUsByPos = new starpu_top_device[length];
     int pos = 0;
 
     /* CPUs */
-    foreach(StarputopDevice sD,*_mainWindow->serverDevices())
+    foreach(starpu_top_device sD,*_mainWindow->serverDevices())
     {
 	if(sD.type == 0)
 	{
@@ -806,7 +806,7 @@ void GanttWidget::countPUs()
     }
 
     /* GPUs */
-    foreach (StarputopDevice sD , *_mainWindow->serverDevices())
+    foreach (starpu_top_device sD , *_mainWindow->serverDevices())
     {
 	if(sD.type == 1 || sD.type == 2)
 	{
@@ -855,7 +855,7 @@ void GanttWidget::paint(QPainter *painter, QPaintEvent *event)
             }
 
             _tasks = _taskManager->tasks(borneBefore, _timePresent);
-            foreach (StarputopTask t, _tasks)
+            foreach (starpu_top_task t, _tasks)
             {
                 drawWorkPU(painter,t);
             }
@@ -863,7 +863,7 @@ void GanttWidget::paint(QPainter *painter, QPaintEvent *event)
             /* Future past */
             qlonglong borneAfter = _timePresent + _timeAfter;
             _tasks = _taskManager->prevTasks(_timePresent, borneAfter);
-            foreach		(StarputopTask t, _tasks)
+            foreach		(starpu_top_task t, _tasks)
             {
                 drawPrevWorkPU(painter,t);
             }

+ 6 - 6
starpu-top/ganttwidget.h

@@ -31,7 +31,7 @@ class TaskManager;
 
 #include <QGLWidget>
 #include <QPainter>
-#include "starputoptypes.h"
+#include "starpu_top_types.h"
 
 class GanttWidget : public QGLWidget
 {
@@ -58,9 +58,9 @@ protected:
     void drawTime(QPainter *painter);
     void drawProgram(QPainter *painter);
     void resizeGL (int width,int height);
-    void drawWorkPU(QPainter *painter, StarputopTask t);
+    void drawWorkPU(QPainter *painter, starpu_top_task t);
     void drawIdlePU(QPainter *painter);
-    void drawPrevWorkPU(QPainter *painter, StarputopTask t);
+    void drawPrevWorkPU(QPainter *painter, starpu_top_task t);
     void defaultScreen(QPainter *painter);
     void drawPresentLine(QPainter *painter);
     int computeTimeInterval(int timeTotal);
@@ -84,7 +84,7 @@ private:
     qreal _coordxPresentLine;
     int _numPUs;
     bool _wasRunning;
-    QList<StarputopTask> _tasks;
+    QList<starpu_top_task> _tasks;
     int _timeTotal;
     int _timeAfter;
     int _timeBefore;
@@ -92,8 +92,8 @@ private:
     QTimer *_timer;
     qlonglong _timePresent;
     qlonglong _timeToShow;
-    StarputopDevice *_PUsByDevice;
-    StarputopDevice *_PUsByPos;
+    starpu_top_device *_PUsByDevice;
+    starpu_top_device *_PUsByPos;
     int _numCPUs;
     int _numGPUs;
     bool _initCompleted;

starpu-top/images/starputop.png → starpu-top/images/starpu_top.png


+ 1 - 1
starpu-top/interactivewidget.h

@@ -30,7 +30,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include <QCloseEvent>
 #include <QLabel>
 #include <QHBoxLayout>
-#include "starputoptypes.h"
+#include "starpu_top_types.h"
 
 class MainWindow;
 

+ 11 - 11
starpu-top/mainwindow.cpp

@@ -61,7 +61,7 @@ MainWindow::MainWindow(QWidget *parent) :
     _dataAggregatorWidgets = new QList<QPointer<DataAggregatorWidget> > ();
     _dataDescriptions = new QList<DataDescription*> ();
     _paramDescriptions = new QList<ParamDescription*> ();
-    _serverDevices = new QList<StarputopDevice> ();
+    _serverDevices = new QList<starpu_top_device> ();
     _nbDataWidgets = _nbInteractiveWidgets = _nbDataAggregatorWidgets = 0;
 
     // Init managers
@@ -97,18 +97,18 @@ MainWindow::MainWindow(QWidget *parent) :
     QObject::connect(settingsAction, SIGNAL(triggered()), this,
                      SLOT(on_actionPreferences_triggered()));
     connectButton->addAction(settingsAction);
-    ui->menuStarputop->addAction(_actionConnect);
+    ui->menu_starpu_top->addAction(_actionConnect);
     // Action launch
     _actionLaunch = ui->mainToolBar->addAction(QIcon(":/images/play.png"),
                                                tr("Launch StarPU"));
     _actionLaunch->setIconText("Launch StarPU");
     _actionLaunch->setToolTip("Launch StarPU");
     _actionLaunch->setShortcut(QKeySequence("Ctrl+L"));
-    ui->menuStarputop->addAction(_actionLaunch);
+    ui->menu_starpu_top->addAction(_actionLaunch);
     QObject::connect(_actionLaunch, SIGNAL(triggered()), this,
                      SLOT(on_actionLaunch_StarPU_triggered()));
     ui->mainToolBar->addSeparator();
-    ui->menuStarputop->addSeparator();
+    ui->menu_starpu_top->addSeparator();
     // Action debug
     _actionDebug = ui->mainToolBar->addAction(QIcon(":/images/debugon.png"),
                                               tr("Enable debug"));
@@ -116,7 +116,7 @@ MainWindow::MainWindow(QWidget *parent) :
     _actionDebug->setToolTip("Enable debug");
     _actionDebug->setShortcut(QKeySequence("Ctrl+D"));
     _actionDebug->setCheckable(true);
-    ui->menuStarputop->addAction(_actionDebug);
+    ui->menu_starpu_top->addAction(_actionDebug);
     QObject::connect(_actionDebug, SIGNAL(toggled(bool)),
                      this, SLOT(on_actionDebug_triggered(bool)));
     // Action save session setup
@@ -125,7 +125,7 @@ MainWindow::MainWindow(QWidget *parent) :
     _actionSaveSessionSetup->setIconText("Save session setup");
     _actionSaveSessionSetup->setToolTip("Save session setup");
     _actionSaveSessionSetup->setShortcut(QKeySequence("Ctrl+S"));
-    ui->menuStarputop->addAction(_actionSaveSessionSetup);
+    ui->menu_starpu_top->addAction(_actionSaveSessionSetup);
     QObject::connect(_actionSaveSessionSetup, SIGNAL(triggered()), this,
                      SLOT(on_actionSaveSessionSetup_triggered()));
     // Action add data aggregator widget
@@ -135,13 +135,13 @@ MainWindow::MainWindow(QWidget *parent) :
     _actionAddDataAggregatorWidget->setIconText("Add data aggregator widget");
     _actionAddDataAggregatorWidget->setToolTip("Add data aggregator widget");
     _actionAddDataAggregatorWidget->setShortcut(QKeySequence("Ctrl+G"));
-    ui->menuStarputop->addAction(_actionAddDataAggregatorWidget);
+    ui->menu_starpu_top->addAction(_actionAddDataAggregatorWidget);
     QObject::connect(_actionAddDataAggregatorWidget, SIGNAL(triggered()), this,
                      SLOT(on_actionAddDataAggregatorWidget_triggered()));
     ui->mainToolBar->addSeparator();
-    ui->menuStarputop->addSeparator();
+    ui->menu_starpu_top->addSeparator();
     // Action quit
-    QAction *actionQuit = ui->menuStarputop->addAction(
+    QAction *actionQuit = ui->menu_starpu_top->addAction(
             QIcon(":/images/quit.png"), tr("Quit"));
     actionQuit->setIconText("Quit");
     actionQuit->setToolTip("Quit");
@@ -540,7 +540,7 @@ void MainWindow::synchronizeSessionTime(qlonglong serverTimestamp)
 void MainWindow::initClient(QString serverID,
                             QList<DataDescription*> *dataDescriptions,
                             QList<ParamDescription*> *paramDescriptions,
-                            QList<StarputopDevice> *serverDevices)
+                            QList<starpu_top_device> *serverDevices)
 {
     _serverID = serverID;
     _dataDescriptions = dataDescriptions;
@@ -1213,7 +1213,7 @@ ParamDescription *MainWindow::paramDescriptionFromId(int paramId)
     return 0;
 }
 
-const QList<StarputopDevice> *MainWindow::serverDevices() const
+const QList<starpu_top_device> *MainWindow::serverDevices() const
 {
     return _serverDevices;
 }

+ 4 - 4
starpu-top/mainwindow.h

@@ -49,7 +49,7 @@ class TaskManager;
 #include <QAbstractSocket>
 #include <QTime>
 #include <QSpinBox>
-#include "starputoptypes.h"
+#include "starpu_top_types.h"
 
 namespace Ui
 {
@@ -79,7 +79,7 @@ public:
     const QList<ParamDescription*> *paramDescriptions() const;
     DataDescription *dataDescriptionFromId(int dataId);
     ParamDescription *paramDescriptionFromId(int interactiveId);
-    const QList<StarputopDevice> *serverDevices() const;
+    const QList<starpu_top_device> *serverDevices() const;
     // Get different widgets metadata
     const QHash<DataWidgetType, QString> *dataWidgetNames() const;
     const QHash<DataType, QSet<DataWidgetType> >
@@ -166,7 +166,7 @@ private:
     // Different descriptions
     QList<DataDescription*> *_dataDescriptions;
     QList<ParamDescription*> *_paramDescriptions;
-    QList<StarputopDevice> *_serverDevices;
+    QList<starpu_top_device> *_serverDevices;
     int _nbDataWidgets;
     int _nbDataAggregatorWidgets;
     int _nbInteractiveWidgets;
@@ -233,7 +233,7 @@ public slots:
     void initClient(QString serverID,
                     QList<DataDescription*> *dataDescriptions,
                     QList<ParamDescription*> *paramDescriptions,
-                    QList<StarputopDevice> *serverDevices);
+                    QList<starpu_top_device> *serverDevices);
     // Connection events handlers
     void connectionSucceeded();
     void connectionAborted(QString message);

+ 3 - 3
starpu-top/mainwindow.ui

@@ -21,7 +21,7 @@
   </property>
   <property name="windowIcon">
    <iconset resource="resources.qrc">
-    <normaloff>:/images/starputop.png</normaloff>:/images/starputop.png</iconset>
+    <normaloff>:/images/starpu_top.png</normaloff>:/images/starpu_top.png</iconset>
   </property>
   <widget class="QWidget" name="centralWidget">
    <layout class="QGridLayout" name="gridLayout_2">
@@ -42,7 +42,7 @@
      <height>21</height>
     </rect>
    </property>
-   <widget class="QMenu" name="menuStarputop">
+   <widget class="QMenu" name="menu_starpu_top">
     <property name="title">
      <string>StarPU-Top</string>
     </property>
@@ -59,7 +59,7 @@
     </property>
     <addaction name="actionPreferences"/>
    </widget>
-   <addaction name="menuStarputop"/>
+   <addaction name="menu_starpu_top"/>
    <addaction name="menuDisplay"/>
    <addaction name="menuHelp"/>
   </widget>

+ 1 - 1
starpu-top/preferencesdialog.h

@@ -33,7 +33,7 @@ class SessionSetupManager;
 #include <QMetaType>
 #include <QDialog>
 #include <QComboBox>
-#include "starputoptypes.h"
+#include "starpu_top_types.h"
 
 namespace Ui
 {

+ 1 - 1
starpu-top/resources.qrc

@@ -13,7 +13,7 @@
         <file>images/add.png</file>
         <file>images/remove.png</file>
         <file>images/widget.png</file>
-        <file>images/starputop.png</file>
+        <file>images/starpu_top.png</file>
         <file>images/windows.png</file>
         <file>images/lock.png</file>
     </qresource>

+ 1 - 1
starpu-top/sessionsetupmanager.h

@@ -28,7 +28,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 
 class MainWindow;
 
-#include "starputoptypes.h"
+#include "starpu_top_types.h"
 #include <QSettings>
 
 static const QString SESSION_SETUPS_DIR = "./sessionsetups";

+ 7 - 7
starpu-top/starputoptypes.h

@@ -23,8 +23,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */
 
 
-#ifndef STARPUTOPTYPES_H
-#define STARPUTOPTYPES_H
+#ifndef STARPU_TOP_TYPES_H
+#define STARPU_TOP_TYPES_H
 
 #include <QString>
 #include <QStringList>
@@ -112,7 +112,7 @@ enum ParamType
     PARAM_TYPE_ENUM = 4,
 };
 
-enum StarputopDeviceType
+enum starpu_top_device_type
 {
     SERVERDEVICE_CPU = 0,
     SERVERDEVICE_CUDA = 1,
@@ -124,9 +124,9 @@ enum StarputopDeviceType
 typedef struct
 {
     int id;
-    StarputopDeviceType type;
+    starpu_top_device_type type;
     QString name;
-} StarputopDevice;
+} starpu_top_device;
 
 // Server tasks
 typedef struct
@@ -135,7 +135,7 @@ typedef struct
     int deviceId;
     qlonglong timestampStart;
     qlonglong timestampEnd;
-} StarputopTask;
+} starpu_top_task;
 
 // Descriptions
 typedef struct
@@ -318,4 +318,4 @@ typedef struct
     QList<int> dataIds;
 } DataAggregatorWidgetSetup;
 
-#endif // STARPUTOPTYPES_H
+#endif // STARPU_TOP_TYPES_H

+ 7 - 7
starpu-top/taskmanager.cpp

@@ -171,10 +171,10 @@ void TaskManager::addTaskEnd(int taskId, qlonglong timestampEnd)
     }
 }
 
-QList<StarputopTask> TaskManager::tasks(qlonglong timestampStart,
-                                        qlonglong timestampEnd)
+QList<starpu_top_task> TaskManager::tasks(qlonglong timestampStart,
+					  qlonglong timestampEnd)
 {
-    QList < StarputopTask > tasks;
+    QList < starpu_top_task > tasks;
 
     _selectTasksQuery.addBindValue(timestampStart);
     _selectTasksQuery.addBindValue(timestampEnd);
@@ -206,7 +206,7 @@ QList<StarputopTask> TaskManager::tasks(qlonglong timestampStart,
             qlonglong timestampEnd =
                     _selectTasksQuery.value(endField).toLongLong();
 
-            StarputopTask task;
+            starpu_top_task task;
             task.taskId = taskId;
             task.deviceId = deviceId;
             task.timestampStart = timestampStart;
@@ -220,10 +220,10 @@ QList<StarputopTask> TaskManager::tasks(qlonglong timestampStart,
     return tasks;
 }
 
-QList<StarputopTask> TaskManager::prevTasks(qlonglong timestampStart,
+QList<starpu_top_task> TaskManager::prevTasks(qlonglong timestampStart,
                                             qlonglong timestampEnd)
 {
-    QList < StarputopTask > prevTasks;
+    QList < starpu_top_task > prevTasks;
 
     _selectPrevTasksQuery.addBindValue(timestampStart);
     _selectPrevTasksQuery.addBindValue(timestampEnd);
@@ -255,7 +255,7 @@ QList<StarputopTask> TaskManager::prevTasks(qlonglong timestampStart,
             qlonglong timestampEnd =
                     _selectPrevTasksQuery.value(endField).toLongLong();
 
-            StarputopTask prevTask;
+            starpu_top_task prevTask;
             prevTask.taskId = taskId;
             prevTask.deviceId = deviceId;
             prevTask.timestampStart = timestampStart;

+ 3 - 3
starpu-top/taskmanager.h

@@ -26,7 +26,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #ifndef TASKMANAGER_H
 #define TASKMANAGER_H
 
-#include "starputoptypes.h"
+#include "starpu_top_types.h"
 #include <QDebug>
 #include <QtSql/QSqlDatabase>
 #include <QtSql/QSqlQuery>
@@ -46,9 +46,9 @@ public:
     void addTaskStart(int taskId, int deviceId, qlonglong timestampStart);
     void addTaskEnd(int taskId, qlonglong timestampEnd);
     // Getters
-    QList<StarputopTask> tasks(qlonglong timestampStart,
+    QList<starpu_top_task> tasks(qlonglong timestampStart,
                                qlonglong timestampEnd);
-    QList<StarputopTask> prevTasks(qlonglong timestampStart,
+    QList<starpu_top_task> prevTasks(qlonglong timestampStart,
                                    qlonglong timestampEnd);
 
 private:

+ 1 - 0
starpufft/.gitignore

@@ -0,0 +1 @@
+/.deps

+ 97 - 0
starpufft/Makefile.am

@@ -0,0 +1,97 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
+
+lib_LTLIBRARIES = libstarpufft-@STARPU_EFFECTIVE_VERSION@.la
+
+EXTRA_DIST =			\
+	float.h			\
+	double.h		\
+	cudax_kernels.h		\
+	starpufftx.c		\
+	starpufftx1d.c		\
+	starpufftx2d.c		\
+	cuda_kernels.cu		\
+	cudaf_kernels.cu	\
+	cudax_kernels.cu	\
+	examples/testx.c	\
+	examples/testx_threads.c\
+	examples/testf_threads.c\
+	examples/test_threads.c
+
+versincludedir = $(includedir)/starpu/$(STARPU_EFFECTIVE_VERSION)
+versinclude_HEADERS = 				\
+	starpufft.h
+
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = libstarpufft.pc starpufft-1.0.pc
+
+libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = starpufft.c starpufftf.c starpufft_common.c
+libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la $(FFTW_LIBS) $(FFTWF_LIBS) $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUFFT_LDFLAGS)
+libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_CFLAGS = $(FFTWF_CFLAGS)
+libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) -no-undefined					\
+  -version-info $(LIBSTARPUFFT_INTERFACE_CURRENT):$(LIBSTARPUFFT_INTERFACE_REVISION):$(LIBSTARPUFFT_INTERFACE_AGE)
+
+if STARPU_USE_CUDA
+NVCCFLAGS += -Xcompiler -fPIC -Xlinker -fPIC
+
+cudaf_kernels.o: cudaf_kernels.cu
+	$(NVCC) $(AM_CPPFLAGS) $< -c -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS) -I${includedir}
+
+libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += cudaf_kernels.cu
+am_libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_OBJECTS = cudaf_kernels.o starpufft.lo starpufftf.lo starpufft_common.lo
+
+if STARPU_HAVE_CUFFTDOUBLECOMPLEX
+cuda_kernels.o: cuda_kernels.cu
+	$(NVCC) $(AM_CPPFLAGS) $< -c -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS) -I${includedir} -arch sm_13
+
+libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += cuda_kernels.cu
+am_libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_OBJECTS += cuda_kernels.o
+endif
+
+libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_LIBADD +=  $(STARPU_CUDA_LDFLAGS)
+endif
+
+examplebindir = $(libdir)/starpu/examples/starpufft
+examplebin_PROGRAMS =				\
+	examples/testf \
+	examples/test
+
+check_PROGRAMS = examples/testf
+examples_testf_LDADD = libstarpufft-@STARPU_EFFECTIVE_VERSION@.la $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la $(FFTWF_LIBS)
+
+# If we don't have CUDA, we assume that fftw is available in double
+# precision anyway; we just want to make sure that if CUFFT is used, it
+# also supports double precision.
+if !STARPU_USE_CUDA
+check_PROGRAMS += examples/test
+else
+if STARPU_HAVE_CUFFTDOUBLECOMPLEX
+check_PROGRAMS += examples/test
+endif
+endif
+examples_test_LDADD = libstarpufft-@STARPU_EFFECTIVE_VERSION@.la $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la $(FFTW_LIBS)
+
+TESTS = $(check_PROGRAMS)
+
+
+#check_PROGRAMS += examples/test_threads examples/testf_threads
+#examples_test_threads_LDADD = libstarpufft-@STARPU_EFFECTIVE_VERSION@.la $(top_builddir)/src/libstarpu.la -lfftw3_threads
+#examples_testf_threads_LDADD = libstarpufft-@STARPU_EFFECTIVE_VERSION@.la $(top_builddir)/src/libstarpu.la -lfftw3f_threads
+
+showcheck:
+	-cat $(TEST_LOGS) /dev/null

+ 19 - 0
starpufft/cuda_kernels.cu

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "double.h"
+#include "cudax_kernels.cu"

+ 19 - 0
starpufft/cudaf_kernels.cu

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "cudax_kernels.cu"

+ 156 - 0
starpufft/cudax_kernels.cu

@@ -0,0 +1,156 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#define _externC extern "C"
+#include "cudax_kernels.h"
+
+/* Note: these assume that the sizes are powers of two */
+
+#define VARS_1d \
+	unsigned start = threadIdx.x + blockIdx.x * blockDim.x; \
+	unsigned numthreads = blockDim.x * gridDim.x;
+
+#define DISTRIB_1d(n, func,args) \
+	unsigned threads_per_block = 128; \
+\
+	if (n < threads_per_block) \
+	{			   \
+		dim3 dimGrid(n); \
+		func <<<dimGrid, 1, 0, starpu_cuda_get_local_stream()>>> args; \
+	} 					\
+	else 					\
+	{				     \
+		dim3 dimGrid(n / threads_per_block); \
+		dim3 dimBlock(threads_per_block); \
+		func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
+	} \
+	cudaStreamSynchronize(starpu_cuda_get_local_stream()); \
+
+extern "C" __global__ void
+STARPUFFT(cuda_twist1_1d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2)
+{
+	unsigned j;
+	VARS_1d
+	unsigned end = n2;
+
+	for (j = start; j < end; j += numthreads)
+		twisted1[j] = in[i+j*n1];
+}
+
+extern "C" void
+STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2)
+{
+	DISTRIB_1d(n2, STARPUFFT(cuda_twist1_1d), (in, twisted1, i, n1, n2));
+}
+
+extern "C" __global__ void
+STARPUFFT(cuda_twiddle_1d)(_cuComplex * out, const _cuComplex * roots, unsigned n, unsigned i)
+{
+	unsigned j;
+	VARS_1d
+	unsigned end = n;
+
+	for (j = start; j < end; j += numthreads)
+		out[j] = _cuCmul(out[j], roots[i*j]);
+	return;
+}
+
+extern "C" void
+STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i)
+{
+	DISTRIB_1d(n, STARPUFFT(cuda_twiddle_1d), (out, roots, n, i));
+}
+
+#define VARS_2d \
+	unsigned startx = threadIdx.x + blockIdx.x * blockDim.x; \
+	unsigned starty = threadIdx.y + blockIdx.y * blockDim.y; \
+	unsigned numthreadsx = blockDim.x * gridDim.x; \
+	unsigned numthreadsy = blockDim.y * gridDim.y;
+
+/* FIXME: introduce threads_per_dim_n / m instead */
+#define DISTRIB_2d(n, m, func, args) \
+	unsigned threads_per_dim = 16; \
+	if (n < threads_per_dim) \
+	{				   \
+		if (m < threads_per_dim) \
+		{			    \
+			dim3 dimGrid(n, m); \
+			func <<<dimGrid, 1, 0, starpu_cuda_get_local_stream()>>> args; \
+		} \
+		else \
+		{					      \
+			dim3 dimGrid(1, m / threads_per_dim); \
+			dim3 dimBlock(n, threads_per_dim); \
+			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
+		} \
+	} \
+	else \
+	{				   \
+		if (m < threads_per_dim) \
+		{					      \
+			dim3 dimGrid(n / threads_per_dim, 1); \
+			dim3 dimBlock(threads_per_dim, m); \
+			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
+		} \
+		else \
+		{							\
+			dim3 dimGrid(n / threads_per_dim, m / threads_per_dim); \
+			dim3 dimBlock(threads_per_dim, threads_per_dim); \
+			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
+		} \
+	} \
+	cudaStreamSynchronize(starpu_cuda_get_local_stream()); \
+
+extern "C" __global__ void
+STARPUFFT(cuda_twist1_2d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2)
+{
+	unsigned k, l;
+	VARS_2d
+	unsigned endx = n2;
+	unsigned endy = m2;
+	unsigned m = m1*m2;
+
+	for (k = startx; k < endx; k += numthreadsx)
+		for (l = starty; l < endy; l += numthreadsy)
+			twisted1[k*m2+l] = in[i*m+j+k*m*n1+l*m1];
+}
+
+extern "C" void
+STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2)
+{
+	DISTRIB_2d(n2, m2, STARPUFFT(cuda_twist1_2d), (in, twisted1, i, j, n1, n2, m1, m2));
+}
+
+extern "C" __global__ void
+STARPUFFT(cuda_twiddle_2d)(_cuComplex * out, const _cuComplex * roots0, const _cuComplex * roots1, unsigned n2, unsigned m2, unsigned i, unsigned j)
+{
+	unsigned k, l;
+	VARS_2d
+	unsigned endx = n2;
+	unsigned endy = m2;
+
+	for (k = startx; k < endx ; k += numthreadsx)
+		for (l = starty; l < endy ; l += numthreadsy)
+			out[k*m2 + l] = _cuCmul(_cuCmul(out[k*m2 + l], roots0[i*k]), roots1[j*l]);
+	return;
+}
+
+extern "C" void
+STARPUFFT(cuda_twiddle_2d_host)(_cuComplex *out, const _cuComplex *roots0, const _cuComplex *roots1, unsigned n2, unsigned m2, unsigned i, unsigned j)
+{
+	DISTRIB_2d(n2, m2, STARPUFFT(cuda_twiddle_2d), (out, roots0, roots1, n2, m2, i, j));
+}

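The VARS_1d / DISTRIB_1d pair above implements a grid-stride loop: every thread starts at its global index and advances by the total number of launched threads, so the whole range is covered whatever grid shape DISTRIB_1d picked. Below is a sequential C model of the same decomposition, for illustration only; twist1_1d_model is not part of the commit, but its indexing mirrors cuda_twist1_1d exactly.

#include <complex.h>

/* Sequential model of the grid-stride loop used by cuda_twist1_1d:
 * thread `t` out of `numthreads` handles elements t, t + numthreads,
 * t + 2*numthreads, ...  Illustrative sketch only. */
static void twist1_1d_model(const double _Complex *in, double _Complex *twisted1,
			    unsigned i, unsigned n1, unsigned n2,
			    unsigned t, unsigned numthreads)
{
	unsigned j;

	for (j = t; j < n2; j += numthreads)
		twisted1[j] = in[i + j * n1];
}
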
+ 23 - 0
starpufft/cudax_kernels.h

@@ -0,0 +1,23 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <cuComplex.h>
+#include <starpu_cuda.h>
+_externC void STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2);
+_externC void STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i);
+_externC void STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2);
+_externC void STARPUFFT(cuda_twiddle_2d_host)(_cuComplex *out, const _cuComplex *roots0, const _cuComplex *roots1, unsigned n2, unsigned m2, unsigned i, unsigned j);

+ 51 - 0
starpufft/double.h

@@ -0,0 +1,51 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <complex.h>
+#include <starpu_config.h>
+
+#ifdef STARPU_HAVE_FFTW
+#include <fftw3.h>
+#endif
+
+#ifdef STARPU_USE_CUDA
+#include <cufft.h>
+#endif
+
+#undef  FLOAT
+#define DOUBLE
+
+typedef double real;
+#ifdef STARPU_HAVE_FFTW
+typedef fftw_complex _fftw_complex;
+typedef fftw_plan _fftw_plan;
+#endif
+#ifdef STARPU_USE_CUDA
+typedef cuDoubleComplex _cuComplex;
+typedef cufftDoubleComplex _cufftComplex;
+#define _cufftExecC2C cufftExecZ2Z
+#define _cufftExecR2C cufftExecD2Z
+#define _cufftExecC2R cufftExecZ2D
+#define _CUFFT_C2C CUFFT_Z2Z
+#define _CUFFT_R2C CUFFT_D2Z
+#define _CUFFT_C2R CUFFT_Z2D
+#define _cuCmul(x,y) cuCmul(x,y)
+#endif
+#define STARPUFFT(name) starpufft_##name
+#define _FFTW(name) fftw_##name
+
+#define TYPE ""

+ 19 - 0
starpufft/examples/test.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "double.h"
+#include "testx.c"

+ 19 - 0
starpufft/examples/test_threads.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "double.h"
+#include "testx_threads.c"

+ 19 - 0
starpufft/examples/testf.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "testx.c"

+ 19 - 0
starpufft/examples/testf_threads.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "testx_threads.c"

+ 283 - 0
starpufft/examples/testx.c

@@ -0,0 +1,283 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <complex.h>
+#include <math.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <sys/time.h>
+
+#include <starpu.h>
+
+#include <starpu_config.h>
+#include "starpufft.h"
+
+#undef STARPU_USE_CUDA
+
+#ifdef STARPU_HAVE_FFTW
+#include <fftw3.h>
+#endif
+#ifdef STARPU_USE_CUDA
+#include <cufft.h>
+#endif
+
+#define SIGN (-1)
+/* #define SIGN (1) */
+
+#ifdef STARPU_HAVE_FFTW
+static void check_fftw(STARPUFFT(complex) *out, STARPUFFT(complex) *out_fftw, int size)
+{
+	int i;
+	double max = 0., tot = 0., norm = 0., normdiff = 0.;
+	for (i = 0; i < size; i++)
+	{
+		double diff = cabs(out[i]-out_fftw[i]);
+		double diff2 = diff * diff;
+		double size = cabs(out_fftw[i]);
+		double size2 = size * size;
+		if (diff > max)
+			max = diff;
+		tot += diff;
+		normdiff += diff2;
+		norm += size2;
+	}
+	fprintf(stderr, "\nmaximum difference %g\n", max);
+	fprintf(stderr, "average difference %g\n", tot / size);
+	fprintf(stderr, "difference norm %g\n", sqrt(normdiff));
+	double relmaxdiff = max / sqrt(norm);
+	fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
+	double relavgdiff = (tot / size) / sqrt(norm);
+	fprintf(stderr, "relative average difference %g\n", relavgdiff);
+	if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-7 || relavgdiff > 1e-7)) {
+		fprintf(stderr, "Failure: Difference too big (TYPE f)\n");
+		exit(EXIT_FAILURE);
+	}
+	if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16))
+	{
+		fprintf(stderr, "Failure: Difference too big\n");
+		exit(EXIT_FAILURE);
+	}
+}
+#endif
+
+#ifdef STARPU_USE_CUDA
+static void check_cuda(STARPUFFT(complex) *out_cuda, STARPUFFT(complex) *out_fftw, int size)
+{
+	int i;
+	double max = 0., tot = 0., norm = 0., normdiff = 0.;
+	for (i = 0; i < size; i++)
+	{
+		double diff = cabs(out_cuda[i]-out_fftw[i]);
+		double diff2 = diff * diff;
+		double size = cabs(out_fftw[i]);
+		double size2 = size * size;
+		if (diff > max)
+			max = diff;
+		tot += diff;
+		normdiff += diff2;
+		norm += size2;
+	}
+	fprintf(stderr, "\nmaximum difference %g\n", max);
+	fprintf(stderr, "average difference %g\n", tot / size);
+	fprintf(stderr, "difference norm %g\n", sqrt(normdiff));
+	double relmaxdiff = max / sqrt(norm);
+	fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
+	double relavgdiff = (tot / size) / sqrt(norm);
+	fprintf(stderr, "relative average difference %g\n", relavgdiff);
+	if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8))
+		exit(EXIT_FAILURE);
+	if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16))
+		exit(EXIT_FAILURE);
+}
+#endif
+
+int main(int argc, char *argv[])
+{
+	int i, ret;
+	int size;
+	int n = 0, m = 0;
+	STARPUFFT(plan) plan;
+	starpu_data_handle_t in_handle, out_handle;
+#ifdef STARPU_HAVE_FFTW
+	_FFTW(plan) fftw_plan;
+#endif
+#ifdef STARPU_USE_CUDA
+	cufftHandle cuda_plan;
+	cudaError_t cures;
+#endif
+#if defined(STARPU_HAVE_FFTW) || defined(STARPU_USE_CUDA)
+	struct timeval begin, end;
+	double timing;
+	size_t bytes;
+#endif
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	if (argc == 1)
+	{
+		n = 42;
+		/* 1D */
+		size = n;
+	}
+	else if (argc == 2)
+	{
+		n = atoi(argv[1]);
+
+		/* 1D */
+		size = n;
+	}
+	else if (argc == 3)
+	{
+		n = atoi(argv[1]);
+		m = atoi(argv[2]);
+
+		/* 2D */
+		size = n * m;
+	}
+	else
+	{
+		assert(0);
+	}
+
+#if defined(STARPU_HAVE_FFTW) || defined(STARPU_USE_CUDA)
+	bytes = size * sizeof(STARPUFFT(complex));
+#endif
+
+	STARPUFFT(complex) *in = STARPUFFT(malloc)(size * sizeof(*in));
+	starpu_srand48(0);
+	for (i = 0; i < size; i++)
+		in[i] = starpu_drand48() + I * starpu_drand48();
+
+	STARPUFFT(complex) *out = STARPUFFT(malloc)(size * sizeof(*out));
+
+#ifdef STARPU_HAVE_FFTW
+	STARPUFFT(complex) *out_fftw = STARPUFFT(malloc)(size * sizeof(*out_fftw));
+#endif
+
+#ifdef STARPU_USE_CUDA
+	STARPUFFT(complex) *out_cuda = STARPUFFT(malloc)(size * sizeof(*out_cuda));
+#endif
+
+	if (argc <= 2)
+	{
+		plan = STARPUFFT(plan_dft_1d)(n, SIGN, 0);
+#ifdef STARPU_HAVE_FFTW
+		fftw_plan = _FFTW(plan_dft_1d)(n, NULL, (void*) 1, SIGN, FFTW_ESTIMATE);
+#endif
+#ifdef STARPU_USE_CUDA
+		if (cufftPlan1d(&cuda_plan, n, _CUFFT_C2C, 1) != CUFFT_SUCCESS)
+			printf("erf\n");
+#endif
+
+	}
+	else if (argc == 3)
+	{
+		plan = STARPUFFT(plan_dft_2d)(n, m, SIGN, 0);
+#ifdef STARPU_HAVE_FFTW
+		fftw_plan = _FFTW(plan_dft_2d)(n, m, NULL, (void*) 1, SIGN, FFTW_ESTIMATE);
+#endif
+#ifdef STARPU_USE_CUDA
+		STARPU_ASSERT(cufftPlan2d(&cuda_plan, n, m, _CUFFT_C2C) == CUFFT_SUCCESS);
+#endif
+	}
+	else
+	{
+		assert(0);
+	}
+
+#ifdef STARPU_HAVE_FFTW
+	gettimeofday(&begin, NULL);
+	_FFTW(execute_dft)(fftw_plan, in, out_fftw);
+	gettimeofday(&end, NULL);
+	_FFTW(destroy_plan)(fftw_plan);
+	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
+	printf("FFTW took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
+#endif
+#ifdef STARPU_USE_CUDA
+	gettimeofday(&begin, NULL);
+	if (cufftExecC2C(cuda_plan, (cufftComplex*) in, (cufftComplex*) out_cuda, CUFFT_FORWARD) != CUFFT_SUCCESS)
+		printf("erf2\n");
+	if ((cures = cudaThreadSynchronize()) != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(cures);
+	gettimeofday(&end, NULL);
+	cufftDestroy(cuda_plan);
+	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
+	printf("CUDA took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
+#endif
+
+	STARPUFFT(execute)(plan, in, out);
+	STARPUFFT(showstats)(stdout);
+
+#ifdef STARPU_HAVE_FFTW
+	check_fftw(out, out_fftw, size);
+#endif
+#ifdef STARPU_USE_CUDA
+	check_cuda(out, out_cuda, size);
+#endif
+
+#if 1
+	starpu_vector_data_register(&in_handle, 0, (uintptr_t) in, size, sizeof(*in));
+	starpu_vector_data_register(&out_handle, 0, (uintptr_t) out, size, sizeof(*out));
+
+	STARPUFFT(execute_handle)(plan, in_handle, out_handle);
+
+	starpu_data_unregister(in_handle);
+	starpu_data_unregister(out_handle);
+
+#ifdef STARPU_HAVE_FFTW
+	check_fftw(out, out_fftw, size);
+#endif
+#ifdef STARPU_USE_CUDA
+	check_cuda(out, out_cuda, size);
+#endif
+#endif
+
+	STARPUFFT(showstats)(stdout);
+	STARPUFFT(destroy_plan)(plan);
+
+	printf("\n");
+#if 0
+	for (i = 0; i < 16; i++)
+		printf("(%f,%f) ", cimag(in[i]), creal(in[i]));
+	printf("\n\n");
+	for (i = 0; i < 16; i++)
+		printf("(%f,%f) ", cimag(out[i]), creal(out[i]));
+	printf("\n\n");
+#ifdef STARPU_HAVE_FFTW
+	for (i = 0; i < 16; i++)
+		printf("(%f,%f) ", cimag(out_fftw[i]), creal(out_fftw[i]));
+	printf("\n\n");
+#endif
+#endif
+
+	STARPUFFT(free)(in);
+	STARPUFFT(free)(out);
+
+#ifdef STARPU_HAVE_FFTW
+	STARPUFFT(free)(out_fftw);
+#endif
+
+#ifdef STARPU_USE_CUDA
+	free(out_cuda);
+#endif
+
+	starpu_shutdown();
+
+	return EXIT_SUCCESS;
+}

+ 113 - 0
starpufft/examples/testx_threads.c

@@ -0,0 +1,113 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <complex.h>
+#include <math.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <sys/time.h>
+
+#include <starpu.h>
+
+#include <starpu_config.h>
+#include "starpufft.h"
+
+#include <fftw3.h>
+
+#define SIGN (-1)
+/* #define SIGN (1) */
+
+int main(int argc, char *argv[])
+{
+	int i;
+	struct timeval begin, end;
+	int size;
+	size_t bytes;
+	int n = 0, m = 0;
+	_FFTW(plan) fftw_plan;
+	double timing;
+	char *num;
+	int num_threads = 1;
+
+	_FFTW(init_threads)();
+
+	num = getenv("NUM_THREADS");
+	if (num)
+		num_threads = atoi(num);
+	_FFTW(plan_with_nthreads)(num_threads);
+
+	if (argc < 2 || argc > 3)
+	{
+		fprintf(stderr, "need one or two vector sizes\n");
+		exit(EXIT_FAILURE);
+	}
+
+	if (argc == 2)
+	{
+		n = atoi(argv[1]);
+
+		/* 1D */
+		size = n;
+	}
+	else if (argc == 3)
+	{
+		n = atoi(argv[1]);
+		m = atoi(argv[2]);
+
+		/* 2D */
+		size = n * m;
+	}
+	else
+	{
+		assert(0);
+	}
+
+	bytes = size * sizeof(_FFTW(complex));
+
+	_FFTW(complex) *in = _FFTW(malloc)(size * sizeof(*in));
+	starpu_srand48(0);
+	for (i = 0; i < size; i++)
+		in[i] = starpu_drand48() + I * starpu_drand48();
+
+	_FFTW(complex) *out_fftw = _FFTW(malloc)(size * sizeof(*out_fftw));
+
+	if (argc == 2)
+	{
+		fftw_plan = _FFTW(plan_dft_1d)(n, in, out_fftw, SIGN, FFTW_ESTIMATE);
+
+	}
+	else if (argc == 3)
+	{
+		fftw_plan = _FFTW(plan_dft_2d)(n, m, in, out_fftw, SIGN, FFTW_ESTIMATE);
+	}
+	else
+	{
+		assert(0);
+	}
+
+	gettimeofday(&begin, NULL);
+	_FFTW(execute)(fftw_plan);
+	gettimeofday(&end, NULL);
+	_FFTW(destroy_plan)(fftw_plan);
+	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
+	printf("FFTW with %d threads took %2.2f ms (%2.2f MB/s)\n\n", num_threads, timing/1000, bytes/(timing*num_threads));
+
+	printf("\n");
+
+	return EXIT_SUCCESS;
+}

+ 51 - 0
starpufft/float.h

@@ -0,0 +1,51 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <complex.h>
+#include <starpu_config.h>
+
+#ifdef STARPU_HAVE_FFTW
+#include <fftw3.h>
+#endif
+
+#ifdef STARPU_USE_CUDA
+#include <cufft.h>
+#endif
+
+#undef  DOUBLE
+#define FLOAT
+
+typedef float real;
+#ifdef STARPU_HAVE_FFTW
+typedef fftwf_complex _fftw_complex;
+typedef fftwf_plan _fftw_plan;
+#endif
+#ifdef STARPU_USE_CUDA
+typedef cuComplex _cuComplex;
+typedef cufftComplex _cufftComplex;
+#define _cufftExecC2C cufftExecC2C
+#define _cufftExecR2C cufftExecR2C
+#define _cufftExecC2R cufftExecC2R
+#define _CUFFT_C2C CUFFT_C2C
+#define _CUFFT_R2C CUFFT_R2C
+#define _CUFFT_C2R CUFFT_C2R
+#define _cuCmul(x,y) cuCmulf(x,y)
+#endif
+#define STARPUFFT(name) starpufftf_##name
+#define _FFTW(name) fftwf_##name
+
+#define TYPE "f"

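double.h and float.h are mutually exclusive configuration headers: each defines the same macro names (real, _cuComplex, _FFTW(), STARPUFFT(), TYPE) so that starpufftx.c and cudax_kernels.cu can be compiled once per precision. The declarations below (shown for illustration only, not added by the commit) spell out what the two `#define STARPUFFT(name)` lines expand to.

/* Written once, e.g. in starpufft.h or starpufftx.c: */
STARPUFFT(plan) STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags);

/* With double.h (#define STARPUFFT(name) starpufft_##name) this becomes: */
starpufft_plan starpufft_plan_dft_1d(int n, int sign, unsigned flags);

/* With float.h (#define STARPUFFT(name) starpufftf_##name) it becomes: */
starpufftf_plan starpufftf_plan_dft_1d(int n, int sign, unsigned flags);
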
+ 27 - 0
starpufft/libstarpufft.pc.in

@@ -0,0 +1,27 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: starpufft
+Description: offers support for heterogeneous multicore architecture
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@ -DSTARPU_USE_DEPRECATED_API
+Libs: -L${libdir} -lstarpufft-@STARPU_EFFECTIVE_VERSION@ 
+Libs.private: @LDFLAGS@ @LIBS@ @STARPU_CUFFT_LDFLAGS@ @FFTW_LIBS@ @FFTWF_LIBS@

+ 27 - 0
starpufft/starpufft-1.0.pc.in

@@ -0,0 +1,27 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: starpufft
+Description: offers support for heterogeneous multicore architecture
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@
+Libs: -L${libdir} -lstarpufft-@STARPU_EFFECTIVE_VERSION@ 
+Libs.private: @LDFLAGS@ @LIBS@ @STARPU_CUFFT_LDFLAGS@ @FFTW_LIBS@ @FFTWF_LIBS@

+ 19 - 0
starpufft/starpufft.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "double.h"
+#include "starpufftx.c"

+ 60 - 0
starpufft/starpufft.h

@@ -0,0 +1,60 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdio.h>
+#include <complex.h>
+#include <starpu.h>
+
+#define STARPUFFT_FORWARD -1
+#define STARPUFFT_INVERSE 1
+
+#define __STARPUFFT(name) starpufft_##name
+#define __STARPUFFTF(name) starpufftf_##name
+#define __STARPUFFTL(name) starpufftl_##name
+
+#define __STARPUFFT_INTERFACE(starpufft,real) \
+typedef real _Complex starpufft(complex); \
+\
+typedef struct starpufft(plan) *starpufft(plan); \
+\
+starpufft(plan) starpufft(plan_dft_1d)(int n, int sign, unsigned flags); \
+starpufft(plan) starpufft(plan_dft_2d)(int n, int m, int sign, unsigned flags); \
+starpufft(plan) starpufft(plan_dft_r2c_1d)(int n, unsigned flags); \
+starpufft(plan) starpufft(plan_dft_c2r_1d)(int n, unsigned flags); \
+\
+void *starpufft(malloc)(size_t n); \
+void starpufft(free)(void *p); \
+\
+void starpufft(execute)(starpufft(plan) p, void *in, void *out); \
+struct starpu_task *starpufft(start)(starpufft(plan) p, void *in, void *out); \
+\
+void starpufft(execute_handle)(starpufft(plan) p, starpu_data_handle_t in, starpu_data_handle_t out); \
+struct starpu_task *starpufft(start_handle)(starpufft(plan) p, starpu_data_handle_t in, starpu_data_handle_t out); \
+\
+void starpufft(cleanup)(starpufft(plan) p); \
+void starpufft(destroy_plan)(starpufft(plan) p); \
+\
+void starpufft(startstats)(void); \
+void starpufft(stopstats)(void); \
+void starpufft(showstats)(FILE *out);
+
+__STARPUFFT_INTERFACE(__STARPUFFT, double)
+__STARPUFFT_INTERFACE(__STARPUFFTF, float)
+__STARPUFFT_INTERFACE(__STARPUFFTL, long double)
+
+/* Internal use */
+extern int starpufft_last_plan_number;
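+
+/* A minimal usage sketch of the double-precision interface declared above
+ * (illustration only; error checking is omitted, and starpu_init() /
+ * starpu_shutdown() are assumed to wrap these calls):
+ *
+ *	int n = 1024;
+ *	starpufft_complex *in = starpufft_malloc(n * sizeof(*in));
+ *	starpufft_complex *out = starpufft_malloc(n * sizeof(*out));
+ *	starpufft_plan p = starpufft_plan_dft_1d(n, STARPUFFT_FORWARD, 0);
+ *	// ... fill in[0..n-1] ...
+ *	starpufft_execute(p, in, out);
+ *	starpufft_destroy_plan(p);
+ *	starpufft_free(in);
+ *	starpufft_free(out);
+ */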

+ 21 - 0
starpufft/starpufft_common.c

@@ -0,0 +1,21 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "starpufft.h"
+
+/* Used as an identifier in starpu tags to let plans run concurrently */
+int starpufft_last_plan_number;

+ 19 - 0
starpufft/starpufftf.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "starpufftx.c"

+ 454 - 0
starpufft/starpufftx.c

@@ -0,0 +1,454 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#define PARALLEL 0
+
+#include <math.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <sys/time.h>
+
+#include <starpu.h>
+#include <config.h>
+
+#include "starpufft.h"
+#ifdef STARPU_USE_CUDA
+#define _externC extern
+#include "cudax_kernels.h"
+
+#if defined(FLOAT) || defined(STARPU_HAVE_CUFFTDOUBLECOMPLEX)
+#  define __STARPU_USE_CUDA
+#else
+#  undef __STARPU_USE_CUDA
+#endif
+
+#endif
+
+#define _FFTW_FLAGS FFTW_ESTIMATE
+
+/* Steps for the parallel variant */
+enum steps
+{
+	SPECIAL, TWIST1, FFT1, JOIN, TWIST2, FFT2, TWIST3, END
+};
+
+#define NUMBER_BITS 5
+#define NUMBER_SHIFT (64 - NUMBER_BITS)
+#define STEP_BITS 3
+#define STEP_SHIFT (NUMBER_SHIFT - STEP_BITS)
+
+/* Tags for the steps of the parallel variant */
+#define _STEP_TAG(plan, step, i) (((starpu_tag_t) plan->number << NUMBER_SHIFT) | ((starpu_tag_t)(step) << STEP_SHIFT) | (starpu_tag_t) (i))
+
+
+#define I_BITS STEP_SHIFT
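+
+/* Resulting 64-bit tag layout (derived from the macros above, for illustration):
+ *   bits 63..59  plan->number  (NUMBER_BITS = 5)
+ *   bits 58..56  step          (STEP_BITS   = 3)
+ *   bits 55..0   index i       (I_BITS      = 56)
+ * e.g. _STEP_TAG(plan, FFT1, 3) with plan->number == 2 yields
+ * (2ULL << NUMBER_SHIFT) | ((starpu_tag_t) FFT1 << STEP_SHIFT) | 3.
+ */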
+
+enum type
+{
+	R2C,
+	C2R,
+	C2C
+};
+
+static unsigned task_per_worker[STARPU_NMAXWORKERS];
+static unsigned samples_per_worker[STARPU_NMAXWORKERS];
+static struct timeval start, submit_tasks, end;
+
+/*
+ *
+ *	The actual kernels
+ *
+ */
+
+struct STARPUFFT(plan)
+{
+	int number;	/* uniquely identifies the plan, for starpu tags */
+
+	int *n;
+	int *n1;
+	int *n2;
+	int totsize;
+	int totsize1;	/* Number of first-round tasks */
+	int totsize2;	/* Size of first-round tasks */
+	int totsize3;	/* Number of second-round tasks */
+	int totsize4;	/* Size of second-round tasks */
+	int dim;
+	enum type type;
+	int sign;
+
+	STARPUFFT(complex) *roots[2];
+	starpu_data_handle_t roots_handle[2];
+
+	/* For each worker, we need some data */
+	struct
+	{
+#ifdef STARPU_USE_CUDA
+		/* CUFFT plans */
+		cufftHandle plan1_cuda, plan2_cuda;
+		/* Sequential version */
+		cufftHandle plan_cuda;
+#endif
+#ifdef STARPU_HAVE_FFTW
+		/* FFTW plans */
+		_fftw_plan plan1_cpu, plan2_cpu;
+		/* Sequential version */
+		_fftw_plan plan_cpu;
+#endif
+	} plans[STARPU_NMAXWORKERS];
+
+	/* Buffers for codelets */
+	STARPUFFT(complex) *in, *twisted1, *fft1, *twisted2, *fft2, *out;
+
+	/* corresponding starpu DSM handles */
+	starpu_data_handle_t in_handle, *twisted1_handle, *fft1_handle, *twisted2_handle, *fft2_handle, out_handle;
+
+	/* Tasks */
+	struct starpu_task **twist1_tasks, **fft1_tasks, **twist2_tasks, **fft2_tasks, **twist3_tasks;
+	struct starpu_task *join_task, *end_task;
+
+	/* Arguments for tasks */
+	struct STARPUFFT(args) *fft1_args, *fft2_args;
+};
+
+struct STARPUFFT(args)
+{
+	struct STARPUFFT(plan) *plan;
+	int i, j, jj, kk, ll, *iv, *kkv;
+};
+
+static void
+check_dims(STARPUFFT(plan) plan)
+{
+	int dim;
+	for (dim = 0; dim < plan->dim; dim++)
+		if (plan->n[dim] & (plan->n[dim]-1))
+		{
+			fprintf(stderr,"can't cope with non-power-of-2\n");
+			STARPU_ABORT();
+		}
+}
+
+static void
+compute_roots(STARPUFFT(plan) plan)
+{
+	int dim, k;
+
+	/* Compute the n-roots and m-roots of unity for twiddling */
+	for (dim = 0; dim < plan->dim; dim++)
+	{
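+		/* 4*atan(1) == pi, so roots[dim][k] == cexp(sign * 2*pi*I * k / n[dim]),
+		 * i.e. the usual twiddle factors. */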
+		STARPUFFT(complex) exp = (plan->sign * 2. * 4.*atan(1.)) * _Complex_I / (STARPUFFT(complex)) plan->n[dim];
+		plan->roots[dim] = malloc(plan->n[dim] * sizeof(**plan->roots));
+		for (k = 0; k < plan->n[dim]; k++)
+			plan->roots[dim][k] = cexp(exp*k);
+		starpu_vector_data_register(&plan->roots_handle[dim], 0, (uintptr_t) plan->roots[dim], plan->n[dim], sizeof(**plan->roots));
+
+#ifdef STARPU_USE_CUDA
+		if (plan->n[dim] > 100000)
+		{
+			/* prefetch the big root array on GPUs */
+			unsigned worker;
+			unsigned nworkers = starpu_worker_get_count();
+			for (worker = 0; worker < nworkers; worker++)
+			{
+				unsigned node = starpu_worker_get_memory_node(worker);
+				if (starpu_worker_get_type(worker) == STARPU_CUDA_WORKER)
+					starpu_data_prefetch_on_node(plan->roots_handle[dim], node, 0);
+			}
+		}
+#endif
+	}
+}
+
+/* Only CUDA compute capability >= 1.3 supports doubles; rule old cards out.  */
+#ifdef DOUBLE
+static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl) {
+	if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
+		return 1;
+#ifdef STARPU_USE_CUDA
+	{
+		/* Cuda device */
+		const struct cudaDeviceProp *props;
+		props = starpu_cuda_get_device_properties(workerid);
+		if (props->major >= 2 || props->minor >= 3)
+			/* At least compute capability 1.3, supports doubles */
+			return 1;
+		/* Old cards do not support doubles */
+		return 0;
+	}
+#endif
+	return 0;
+}
+#define CAN_EXECUTE .can_execute = can_execute,
+#else
+#define CAN_EXECUTE
+#endif
+
+#include "starpufftx1d.c"
+#include "starpufftx2d.c"
+
+struct starpu_task *
+STARPUFFT(start)(STARPUFFT(plan) plan, void *_in, void *_out)
+{
+	struct starpu_task *task;
+	int z;
+
+	plan->in = _in;
+	plan->out = _out;
+
+	switch (plan->dim)
+	{
+		case 1:
+		{
+			switch (plan->type)
+			{
+			case C2C:
+				starpu_vector_data_register(&plan->in_handle, 0, (uintptr_t) plan->in, plan->totsize, sizeof(STARPUFFT(complex)));
+				if (!PARALLEL)
+					starpu_vector_data_register(&plan->out_handle, 0, (uintptr_t) plan->out, plan->totsize, sizeof(STARPUFFT(complex)));
+				if (PARALLEL)
+				{
+					for (z = 0; z < plan->totsize1; z++)
+						plan->twist1_tasks[z]->handles[0] = plan->in_handle;
+				}
+				task = STARPUFFT(start1dC2C)(plan, plan->in_handle, plan->out_handle);
+				break;
+			default:
+				STARPU_ABORT();
+				break;
+			}
+			break;
+		}
+		case 2:
+			starpu_vector_data_register(&plan->in_handle, 0, (uintptr_t) plan->in, plan->totsize, sizeof(STARPUFFT(complex)));
+			if (!PARALLEL)
+				starpu_vector_data_register(&plan->out_handle, 0, (uintptr_t) plan->out, plan->totsize, sizeof(STARPUFFT(complex)));
+			if (PARALLEL)
+			{
+				for (z = 0; z < plan->totsize1; z++)
+					plan->twist1_tasks[z]->handles[0] = plan->in_handle;
+			}
+			task = STARPUFFT(start2dC2C)(plan, plan->in_handle, plan->out_handle);
+			break;
+		default:
+			STARPU_ABORT();
+			break;
+	}
+	return task;
+}
+
+void
+STARPUFFT(cleanup)(STARPUFFT(plan) plan)
+{
+	if (plan->in_handle)
+		starpu_data_unregister(plan->in_handle);
+	if (!PARALLEL)
+	{
+		if (plan->out_handle)
+			starpu_data_unregister(plan->out_handle);
+	}
+}
+
+struct starpu_task *
+STARPUFFT(start_handle)(STARPUFFT(plan) plan, starpu_data_handle_t in, starpu_data_handle_t out)
+{
+	return STARPUFFT(start1dC2C)(plan, in, out);
+}
+
+void
+STARPUFFT(execute)(STARPUFFT(plan) plan, void *in, void *out)
+{
+	memset(task_per_worker, 0, sizeof(task_per_worker));
+	memset(samples_per_worker, 0, sizeof(samples_per_worker));
+
+	gettimeofday(&start, NULL);
+
+	struct starpu_task *task = STARPUFFT(start)(plan, in, out);
+	gettimeofday(&submit_tasks, NULL);
+	starpu_task_wait(task);
+
+	STARPUFFT(cleanup)(plan);
+
+	gettimeofday(&end, NULL);
+}
+
+void
+STARPUFFT(execute_handle)(STARPUFFT(plan) plan, starpu_data_handle_t in, starpu_data_handle_t out)
+{
+	struct starpu_task *task = STARPUFFT(start_handle)(plan, in, out);
+	starpu_task_wait(task);
+}
+
+/* Destroy FFTW plans, unregister and free buffers, and free tags */
+void
+STARPUFFT(destroy_plan)(STARPUFFT(plan) plan)
+{
+	int workerid, dim, i;
+
+	for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
+	{
+		switch (starpu_worker_get_type(workerid))
+		{
+		case STARPU_CPU_WORKER:
+#ifdef STARPU_HAVE_FFTW
+			if (PARALLEL)
+			{
+				_FFTW(destroy_plan)(plan->plans[workerid].plan1_cpu);
+				_FFTW(destroy_plan)(plan->plans[workerid].plan2_cpu);
+			}
+			else
+			{
+				_FFTW(destroy_plan)(plan->plans[workerid].plan_cpu);
+			}
+#endif
+			break;
+		case STARPU_CUDA_WORKER:
+#ifdef STARPU_USE_CUDA
+			/* FIXME: Can't deallocate */
+#endif
+			break;
+		default:
+			/* Do not care, we won't be executing anything there. */
+			break;
+		}
+	}
+
+	if (PARALLEL)
+	{
+		for (i = 0; i < plan->totsize1; i++)
+		{
+			starpu_data_unregister(plan->twisted1_handle[i]);
+			free(plan->twist1_tasks[i]);
+			starpu_data_unregister(plan->fft1_handle[i]);
+			free(plan->fft1_tasks[i]);
+		}
+
+		free(plan->twisted1_handle);
+		free(plan->twist1_tasks);
+		free(plan->fft1_handle);
+		free(plan->fft1_tasks);
+		free(plan->fft1_args);
+
+		free(plan->join_task);
+
+		for (i = 0; i < plan->totsize3; i++)
+		{
+			starpu_data_unregister(plan->twisted2_handle[i]);
+			free(plan->twist2_tasks[i]);
+			starpu_data_unregister(plan->fft2_handle[i]);
+			free(plan->fft2_tasks[i]);
+			free(plan->twist3_tasks[i]);
+		}
+
+		free(plan->end_task);
+
+		free(plan->twisted2_handle);
+		free(plan->twist2_tasks);
+		free(plan->fft2_handle);
+		free(plan->fft2_tasks);
+		free(plan->twist3_tasks);
+		free(plan->fft2_args);
+
+		for (dim = 0; dim < plan->dim; dim++)
+		{
+			starpu_data_unregister(plan->roots_handle[dim]);
+			free(plan->roots[dim]);
+		}
+
+		switch (plan->dim)
+		{
+		case 1:
+			STARPUFFT(free_1d_tags)(plan);
+			break;
+		case 2:
+			STARPUFFT(free_2d_tags)(plan);
+			break;
+		default:
+			STARPU_ABORT();
+			break;
+		}
+
+		free(plan->n1);
+		free(plan->n2);
+		STARPUFFT(free)(plan->twisted1);
+		STARPUFFT(free)(plan->fft1);
+		STARPUFFT(free)(plan->twisted2);
+		STARPUFFT(free)(plan->fft2);
+	}
+	free(plan->n);
+	free(plan);
+}
+
+void *
+STARPUFFT(malloc)(size_t n)
+{
+#ifdef STARPU_USE_CUDA
+	void *res;
+	starpu_malloc(&res, n);
+	return res;
+#else
+#  ifdef STARPU_HAVE_FFTW
+	return _FFTW(malloc)(n);
+#  else
+	return malloc(n);
+#  endif
+#endif
+}
+
+void
+STARPUFFT(free)(void *p)
+{
+#ifdef STARPU_USE_CUDA
+	starpu_free(p);
+#else
+#  ifdef STARPU_HAVE_FFTW
+	_FFTW(free)(p);
+#  else
+	free(p);
+#  endif
+#endif
+}
+
+void
+STARPUFFT(showstats)(FILE *out)
+{
+	int worker;
+	unsigned total;
+
+#define TIMING(begin,end) (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec))
+#define MSTIMING(begin,end) (TIMING(begin,end)/1000.)
+	double paratiming = TIMING(start,end);
+	fprintf(out, "Tasks submission took %2.2f ms\n", MSTIMING(start,submit_tasks));
+	fprintf(out, "Tasks termination took %2.2f ms\n", MSTIMING(submit_tasks,end));
+
+	fprintf(out, "Total %2.2f ms\n", MSTIMING(start,end));
+
+	for (worker = 0, total = 0; worker < starpu_worker_get_count(); worker++)
+		total += task_per_worker[worker];
+
+	for (worker = 0; worker < starpu_worker_get_count(); worker++)
+	{
+		if (task_per_worker[worker])
+		{
+			char name[32];
+			starpu_worker_get_name(worker, name, sizeof(name));
+
+			unsigned long bytes = sizeof(STARPUFFT(complex))*samples_per_worker[worker];
+
+			fprintf(out, "\t%s -> %2.2f MB\t%2.2f\tMB/s\t%u %2.2f %%\n", name, (1.0*bytes)/(1024*1024), bytes/paratiming, task_per_worker[worker], (100.0*task_per_worker[worker])/total);
+		}
+	}
+}

+ 847 - 0
starpufft/starpufftx1d.c

@@ -0,0 +1,847 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ *
+ * Dumb parallel version
+ *
+ */
+
+#define DIV_1D 64
+
+  /*
+   * Overall strategy for an fft of size n:
+   * - perform n1 ffts of size n2
+   * - twiddle
+   * - perform n2 ffts of size n1
+   *
+   * - n1 defaults to DIV_1D, thus n2 defaults to n / DIV_1D.
+   *
+   * Precise tasks:
+   *
+   * - twist1: twist the whole n-element input (called "in") into n1 chunks of
+   *           size n2, by using n1 tasks taking the whole n-element input as a
+   *           R parameter and one n2 output as a W parameter. The result is
+   *           called twisted1.
+   * - fft1:   perform n1 ffts of size n2, by using n1 tasks doing one fft
+   *           each. Also twiddle the result to prepare for the fft2. The
+   *           result is called fft1.
+   * - join:   depends on all the fft1s, to gather the n1 results of size n2 in
+   *           the fft1 vector.
+   * - twist2: twist the fft1 vector into n2 chunks of size n1, called twisted2.
+   *           Since n2 is typically very large, this step is divided into
+   *           DIV_1D tasks, each of them performing n2/DIV_1D of these twists.
+   * - fft2:   perform n2 ffts of size n1. This is divided into DIV_1D tasks of
+   *           n2/DIV_1D ffts each, to be performed in batches. The result is
+   *           called fft2.
+   * - twist3: twist back the result of the fft2s above into the output buffer.
+   *           Only implemented on CPUs for simplicity of the gathering.
+   *
+   * The tag space thus uses 3 dimensions:
+   * - the number of the plan.
+   * - the step (TWIST1, FFT1, JOIN, TWIST2, FFT2, TWIST3, END)
+   * - an index i between 0 and DIV_1D-1.
+   */
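+
+  /*
+   * Concrete illustration (numbers chosen for this comment only): for
+   * n = 2^20 with the default DIV_1D = 64, n1 = 64 and n2 = 16384. The plan
+   * thus creates 64 twist1 and 64 fft1 tasks (each fft of size 16384), one
+   * join task, and then 64 batches of twist2/fft2/twist3 tasks, each batch
+   * handling n3 = n2/DIV_1D = 256 ffts of size 64.
+   */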
+
+#define STEP_TAG_1D(plan, step, i) _STEP_TAG(plan, step, i)
+
+#ifdef __STARPU_USE_CUDA
+/* twist1:
+ *
+ * Twist the full input vector (first parameter) into one chunk of size n2
+ * (second parameter) */
+static void
+STARPUFFT(twist1_1d_kernel_gpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+
+	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
+	_cufftComplex * restrict twisted1 = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
+	
+	STARPUFFT(cuda_twist1_1d_host)(in, twisted1, i, n1, n2);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+
+/* fft1:
+ *
+ * Perform one fft of size n2 */
+static void
+STARPUFFT(fft1_1d_plan_gpu)(void *args)
+{
+	STARPUFFT(plan) plan = args;
+	int n2 = plan->n2[0];
+	int workerid = starpu_worker_get_id();
+	cufftResult cures;
+
+	cures = cufftPlan1d(&plan->plans[workerid].plan1_cuda, n2, _CUFFT_C2C, 1);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	cufftSetStream(plan->plans[workerid].plan1_cuda, starpu_cuda_get_local_stream());
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+}
+
+static void
+STARPUFFT(fft1_1d_kernel_gpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int n2 = plan->n2[0];
+	cufftResult cures;
+
+	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
+	_cufftComplex * restrict out = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
+	const _cufftComplex * restrict roots = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[2]);
+
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	cures = _cufftExecC2C(plan->plans[workerid].plan1_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+
+	STARPUFFT(cuda_twiddle_1d_host)(out, roots, n2, i);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+
+/* fft2:
+ *
+ * Perform n3 = n2/DIV_1D ffts of size n1 */
+static void
+STARPUFFT(fft2_1d_plan_gpu)(void *args)
+{
+	STARPUFFT(plan) plan = args;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int n3 = n2/DIV_1D;
+	cufftResult cures;
+	int workerid = starpu_worker_get_id();
+
+	cures = cufftPlan1d(&plan->plans[workerid].plan2_cuda, n1, _CUFFT_C2C, n3);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	cufftSetStream(plan->plans[workerid].plan2_cuda, starpu_cuda_get_local_stream());
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+}
+
+static void
+STARPUFFT(fft2_1d_kernel_gpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	cufftResult cures;
+
+	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
+	_cufftComplex * restrict out = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	/* NOTE using batch support */
+	cures = _cufftExecC2C(plan->plans[workerid].plan2_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+#endif
+
+/* twist1:
+ *
+ * Twist the full input vector (first parameter) into one chunk of size n2
+ * (second parameter) */
+static void
+STARPUFFT(twist1_1d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int j;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+
+	STARPUFFT(complex) * restrict in = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+	STARPUFFT(complex) * restrict twisted1 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	/* printf("twist1 %d %g\n", i, (double) cabs(plan->in[i])); */
+
+	for (j = 0; j < n2; j++)
+		twisted1[j] = in[i+j*n1];
+}
+
+#ifdef STARPU_HAVE_FFTW
+/* fft1:
+ *
+ * Perform one fft of size n2 */
+static void
+STARPUFFT(fft1_1d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int j;
+	int n2 = plan->n2[0];
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	STARPUFFT(complex) * restrict twisted1 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+	STARPUFFT(complex) * restrict fft1 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	/* printf("fft1 %d %g\n", i, (double) cabs(twisted1[0])); */
+
+	_FFTW(execute_dft)(plan->plans[workerid].plan1_cpu, twisted1, fft1);
+
+	/* twiddle fft1 buffer */
+	for (j = 0; j < n2; j++)
+		fft1[j] = fft1[j] * plan->roots[0][i*j];
+}
+#endif
+
+/* twist2:
+ *
+ * Twist the full vector (results of the fft1s) into one package of n2/DIV_1D
+ * chunks of size n1 */
+static void
+STARPUFFT(twist2_1d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int jj = args->jj;	/* between 0 and DIV_1D */
+	int jjj;		/* between 0 and n3 */
+	int i;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int n3 = n2/DIV_1D;
+
+	STARPUFFT(complex) * restrict twisted2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+
+	/* printf("twist2 %d %g\n", jj, (double) cabs(plan->fft1[jj])); */
+
+	for (jjj = 0; jjj < n3; jjj++) {
+		int j = jj * n3 + jjj;
+		for (i = 0; i < n1; i++)
+			twisted2[jjj*n1+i] = plan->fft1[i*n2+j];
+	}
+}
+
+#ifdef STARPU_HAVE_FFTW
+/* fft2:
+ *
+ * Perform n3 = n2/DIV_1D ffts of size n1 */
+static void
+STARPUFFT(fft2_1d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	/* int jj = args->jj; */
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	STARPUFFT(complex) * restrict twisted2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+	STARPUFFT(complex) * restrict fft2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	/* printf("fft2 %d %g\n", jj, (double) cabs(twisted2[plan->totsize4-1])); */
+
+	_FFTW(execute_dft)(plan->plans[workerid].plan2_cpu, twisted2, fft2);
+}
+#endif
+
+/* twist3:
+ *
+ * Spread the package of n2/DIV_1D chunks of size n1 into the output vector */
+static void
+STARPUFFT(twist3_1d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int jj = args->jj;	/* between 0 and DIV_1D */
+	int jjj;		/* between 0 and n3 */
+	int i;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int n3 = n2/DIV_1D;
+
+	const STARPUFFT(complex) * restrict fft2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+
+	/* printf("twist3 %d %g\n", jj, (double) cabs(fft2[0])); */
+
+	for (jjj = 0; jjj < n3; jjj++) {
+		int j = jj * n3 + jjj;
+		for (i = 0; i < n1; i++)
+			plan->out[i*n2+j] = fft2[jjj*n1+i];
+	}
+}
+
+/* Performance models for the 5 kinds of tasks */
+static struct starpu_perfmodel STARPUFFT(twist1_1d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"twist1_1d"
+};
+
+static struct starpu_perfmodel STARPUFFT(fft1_1d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"fft1_1d"
+};
+
+static struct starpu_perfmodel STARPUFFT(twist2_1d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"twist2_1d"
+};
+
+static struct starpu_perfmodel STARPUFFT(fft2_1d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"fft2_1d"
+};
+
+static struct starpu_perfmodel STARPUFFT(twist3_1d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"twist3_1d"
+};
+
+/* codelet pointers for the 5 kinds of tasks */
+static struct starpu_codelet STARPUFFT(twist1_1d_codelet) = {
+	.where =
+#ifdef __STARPU_USE_CUDA
+		STARPU_CUDA|
+#endif
+		STARPU_CPU,
+#ifdef __STARPU_USE_CUDA
+	.cuda_funcs = {STARPUFFT(twist1_1d_kernel_gpu), NULL},
+#endif
+	.cpu_funcs = {STARPUFFT(twist1_1d_kernel_cpu), NULL},
+	CAN_EXECUTE
+	.model = &STARPUFFT(twist1_1d_model),
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+static struct starpu_codelet STARPUFFT(fft1_1d_codelet) = {
+	.where =
+#ifdef __STARPU_USE_CUDA
+		STARPU_CUDA|
+#endif
+#ifdef STARPU_HAVE_FFTW
+		STARPU_CPU|
+#endif
+		0,
+#ifdef __STARPU_USE_CUDA
+	.cuda_funcs = {STARPUFFT(fft1_1d_kernel_gpu), NULL},
+#endif
+#ifdef STARPU_HAVE_FFTW
+	.cpu_funcs = {STARPUFFT(fft1_1d_kernel_cpu), NULL},
+#endif
+	CAN_EXECUTE
+	.model = &STARPUFFT(fft1_1d_model),
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_W, STARPU_R}
+};
+
+static struct starpu_codelet STARPUFFT(twist2_1d_codelet) = {
+	.where = STARPU_CPU,
+	.cpu_funcs = {STARPUFFT(twist2_1d_kernel_cpu), NULL},
+	CAN_EXECUTE
+	.model = &STARPUFFT(twist2_1d_model),
+	.nbuffers = 1,
+	.modes = {STARPU_W}
+};
+
+static struct starpu_codelet STARPUFFT(fft2_1d_codelet) = {
+	.where =
+#ifdef __STARPU_USE_CUDA
+		STARPU_CUDA|
+#endif
+#ifdef STARPU_HAVE_FFTW
+		STARPU_CPU|
+#endif
+		0,
+#ifdef __STARPU_USE_CUDA
+	.cuda_funcs = {STARPUFFT(fft2_1d_kernel_gpu), NULL},
+#endif
+#ifdef STARPU_HAVE_FFTW
+	.cpu_funcs = {STARPUFFT(fft2_1d_kernel_cpu), NULL},
+#endif
+	CAN_EXECUTE
+	.model = &STARPUFFT(fft2_1d_model),
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+static struct starpu_codelet STARPUFFT(twist3_1d_codelet) = {
+	.where = STARPU_CPU,
+	.cpu_funcs = {STARPUFFT(twist3_1d_kernel_cpu), NULL},
+	CAN_EXECUTE
+	.model = &STARPUFFT(twist3_1d_model),
+	.nbuffers = 1,
+	.modes = {STARPU_R}
+};
+
+/*
+ *
+ * Sequential version
+ *
+ */
+
+#ifdef __STARPU_USE_CUDA
+/* Perform one fft of size n */
+static void
+STARPUFFT(fft_1d_plan_gpu)(void *args)
+{
+	STARPUFFT(plan) plan = args;
+	cufftResult cures;
+	int n = plan->n[0];
+	int workerid = starpu_worker_get_id();
+
+	cures = cufftPlan1d(&plan->plans[workerid].plan_cuda, n, _CUFFT_C2C, 1);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	cufftSetStream(plan->plans[workerid].plan_cuda, starpu_cuda_get_local_stream());
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+}
+
+static void
+STARPUFFT(fft_1d_kernel_gpu)(void *descr[], void *args)
+{
+	STARPUFFT(plan) plan = args;
+	cufftResult cures;
+
+	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
+	_cufftComplex * restrict out = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	cures = _cufftExecC2C(plan->plans[workerid].plan_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+#endif
+
+#ifdef STARPU_HAVE_FFTW
+/* Perform one fft of size n */
+static void
+STARPUFFT(fft_1d_kernel_cpu)(void *descr[], void *_args)
+{
+	STARPUFFT(plan) plan = _args;
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	STARPUFFT(complex) * restrict in = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+	STARPUFFT(complex) * restrict out = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	_FFTW(execute_dft)(plan->plans[workerid].plan_cpu, in, out);
+}
+#endif
+
+static struct starpu_perfmodel STARPUFFT(fft_1d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"fft_1d"
+};
+
+static struct starpu_codelet STARPUFFT(fft_1d_codelet) = {
+	.where =
+#ifdef __STARPU_USE_CUDA
+		STARPU_CUDA|
+#endif
+#ifdef STARPU_HAVE_FFTW
+		STARPU_CPU|
+#endif
+		0,
+#ifdef __STARPU_USE_CUDA
+	.cuda_funcs = {STARPUFFT(fft_1d_kernel_gpu), NULL},
+#endif
+#ifdef STARPU_HAVE_FFTW
+	.cpu_funcs = {STARPUFFT(fft_1d_kernel_cpu), NULL},
+#endif
+	CAN_EXECUTE
+	.model = &STARPUFFT(fft_1d_model),
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+/* Planning:
+ *
+ * - For each CPU worker, we need to plan the two fftw stages.
+ * - For GPU workers, the planning has to be done within the CUDA context, so
+ *   it is done through starpu_execute_on_each_worker on the CUDA workers.
+ * - We allocate all the temporary buffers and register them to starpu.
+ * - We create all the tasks, but do not submit them yet. It will be possible
+ *   to reuse them at will to perform several ffts with the same planning.
+ */
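+/* For instance (illustration only, double-precision names and hypothetical
+ * buffers in1/out1/in2/out2), the same plan may serve several transforms:
+ *
+ *	starpufft_plan p = starpufft_plan_dft_1d(n, STARPUFFT_FORWARD, 0);
+ *	starpufft_execute(p, in1, out1);
+ *	starpufft_execute(p, in2, out2);
+ *	starpufft_destroy_plan(p);
+ */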
+STARPUFFT(plan)
+STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
+{
+	int workerid;
+	int n1 = DIV_1D;
+	int n2 = n / n1;
+	int n3;
+	int z;
+	struct starpu_task *task;
+
+if (PARALLEL) {
+#ifdef __STARPU_USE_CUDA
+	/* cufft 1D limited to 8M elements */
+	while (n2 > 8 << 20) {
+		n1 *= 2;
+		n2 /= 2;
+	}
+#endif
+	STARPU_ASSERT(n == n1*n2);
+	STARPU_ASSERT(n1 < (1ULL << I_BITS));
+
+	/* distribute the n2 second ffts into DIV_1D packages */
+	n3 = n2 / DIV_1D;
+	STARPU_ASSERT(n2 == n3*DIV_1D);
+}
+
+	/* TODO: flags? Automatically set FFTW_MEASURE on calibration? */
+	STARPU_ASSERT(flags == 0);
+
+	STARPUFFT(plan) plan = malloc(sizeof(*plan));
+	memset(plan, 0, sizeof(*plan));
+
+if (PARALLEL) {
+	plan->number = STARPU_ATOMIC_ADD(&starpufft_last_plan_number, 1) - 1;
+
+	/* The plan number has a limited size */
+	STARPU_ASSERT(plan->number < (1ULL << NUMBER_BITS));
+}
+
+	/* Just one dimension */
+	plan->dim = 1;
+	plan->n = malloc(plan->dim * sizeof(*plan->n));
+	plan->n[0] = n;
+
+if (PARALLEL) {
+	check_dims(plan);
+
+	plan->n1 = malloc(plan->dim * sizeof(*plan->n1));
+	plan->n1[0] = n1;
+	plan->n2 = malloc(plan->dim * sizeof(*plan->n2));
+	plan->n2[0] = n2;
+}
+
+	/* Note: this is for coherency with the 2D case */
+	plan->totsize = n;
+
+if (PARALLEL) {
+	plan->totsize1 = n1;
+	plan->totsize2 = n2;
+	plan->totsize3 = DIV_1D;
+	plan->totsize4 = plan->totsize / plan->totsize3;
+}
+	plan->type = C2C;
+	plan->sign = sign;
+
+if (PARALLEL) {
+	/* Compute the w^k just once. */
+	compute_roots(plan);
+}
+
+	/* Initialize per-worker working set */
+	for (workerid = 0; workerid < starpu_worker_get_count(); workerid++) {
+		switch (starpu_worker_get_type(workerid)) {
+		case STARPU_CPU_WORKER:
+#ifdef STARPU_HAVE_FFTW
+if (PARALLEL) {
+			/* first fft plan: one fft of size n2.
+			 * FFTW imposes that buffer pointers are known at
+			 * planning time. */
+			plan->plans[workerid].plan1_cpu = _FFTW(plan_dft_1d)(n2, NULL, (void*) 1, sign, _FFTW_FLAGS);
+			STARPU_ASSERT(plan->plans[workerid].plan1_cpu);
+
+			/* second fft plan: n3 ffts of size n1 */
+			plan->plans[workerid].plan2_cpu = _FFTW(plan_many_dft)(plan->dim,
+					plan->n1, n3,
+					NULL, NULL, 1, plan->totsize1,
+					(void*) 1, NULL, 1, plan->totsize1,
+					sign, _FFTW_FLAGS);
+			STARPU_ASSERT(plan->plans[workerid].plan2_cpu);
+} else {
+			/* fft plan: one fft of size n. */
+			plan->plans[workerid].plan_cpu = _FFTW(plan_dft_1d)(n, NULL, (void*) 1, sign, _FFTW_FLAGS);
+			STARPU_ASSERT(plan->plans[workerid].plan_cpu);
+}
+#else
+/* #warning libstarpufft cannot work correctly if libfftw3 is not installed */
+#endif
+			break;
+		case STARPU_CUDA_WORKER:
+			break;
+		default:
+			/* Do not care, we won't be executing anything there. */
+			break;
+		}
+	}
+#ifdef __STARPU_USE_CUDA
+if (PARALLEL) {
+	starpu_execute_on_each_worker(STARPUFFT(fft1_1d_plan_gpu), plan, STARPU_CUDA);
+	starpu_execute_on_each_worker(STARPUFFT(fft2_1d_plan_gpu), plan, STARPU_CUDA);
+} else {
+	starpu_execute_on_each_worker(STARPUFFT(fft_1d_plan_gpu), plan, STARPU_CUDA);
+}
+#endif
+
+if (PARALLEL) {
+	/* Allocate buffers. */
+	plan->twisted1 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->twisted1));
+	memset(plan->twisted1, 0, plan->totsize * sizeof(*plan->twisted1));
+	plan->fft1 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->fft1));
+	memset(plan->fft1, 0, plan->totsize * sizeof(*plan->fft1));
+	plan->twisted2 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->twisted2));
+	memset(plan->twisted2, 0, plan->totsize * sizeof(*plan->twisted2));
+	plan->fft2 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->fft2));
+	memset(plan->fft2, 0, plan->totsize * sizeof(*plan->fft2));
+
+	/* Allocate handle arrays */
+	plan->twisted1_handle = malloc(plan->totsize1 * sizeof(*plan->twisted1_handle));
+	plan->fft1_handle = malloc(plan->totsize1 * sizeof(*plan->fft1_handle));
+	plan->twisted2_handle = malloc(plan->totsize3 * sizeof(*plan->twisted2_handle));
+	plan->fft2_handle = malloc(plan->totsize3 * sizeof(*plan->fft2_handle));
+
+	/* Allocate task arrays */
+	plan->twist1_tasks = malloc(plan->totsize1 * sizeof(*plan->twist1_tasks));
+	plan->fft1_tasks = malloc(plan->totsize1 * sizeof(*plan->fft1_tasks));
+	plan->twist2_tasks = malloc(plan->totsize3 * sizeof(*plan->twist2_tasks));
+	plan->fft2_tasks = malloc(plan->totsize3 * sizeof(*plan->fft2_tasks));
+	plan->twist3_tasks = malloc(plan->totsize3 * sizeof(*plan->twist3_tasks));
+
+	/* Allocate codelet argument arrays */
+	plan->fft1_args = malloc(plan->totsize1 * sizeof(*plan->fft1_args));
+	plan->fft2_args = malloc(plan->totsize3 * sizeof(*plan->fft2_args));
+
+	/* Create first-round tasks: DIV_1D tasks of type twist1 and fft1 */
+	for (z = 0; z < plan->totsize1; z++) {
+		int i = z;
+#define STEP_TAG(step)	STEP_TAG_1D(plan, step, i)
+
+		/* TODO: get rid of tags */
+
+		plan->fft1_args[z].plan = plan;
+		plan->fft1_args[z].i = i;
+
+		/* Register the twisted1 buffer of size n2. */
+		starpu_vector_data_register(&plan->twisted1_handle[z], 0, (uintptr_t) &plan->twisted1[z*plan->totsize2], plan->totsize2, sizeof(*plan->twisted1));
+		/* Register the fft1 buffer of size n2. */
+		starpu_vector_data_register(&plan->fft1_handle[z], 0, (uintptr_t) &plan->fft1[z*plan->totsize2], plan->totsize2, sizeof(*plan->fft1));
+
+		/* We'll need the result of fft1 on the CPU for the second
+		 * twist anyway, so tell starpu to not keep the fft1 buffer in
+		 * the GPU. */
+		starpu_data_set_wt_mask(plan->fft1_handle[z], 1<<0);
+
+		/* Create twist1 task */
+		plan->twist1_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(twist1_1d_codelet);
+		/* task->handles[0] = to be filled at execution to point
+		   to the application input. */
+		task->handles[1] = plan->twisted1_handle[z];
+		task->cl_arg = &plan->fft1_args[z];
+		task->tag_id = STEP_TAG(TWIST1);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that fft1 depends on twisted1 */
+		starpu_tag_declare_deps(STEP_TAG(FFT1),
+				1, STEP_TAG(TWIST1));
+
+		/* Create FFT1 task */
+		plan->fft1_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(fft1_1d_codelet);
+		task->handles[0] = plan->twisted1_handle[z];
+		task->handles[1] = plan->fft1_handle[z];
+		task->handles[2] = plan->roots_handle[0];
+		task->cl_arg = &plan->fft1_args[z];
+		task->tag_id = STEP_TAG(FFT1);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that the join task will depend on the fft1 task. */
+		starpu_tag_declare_deps(STEP_TAG_1D(plan, JOIN, 0),
+				1, STEP_TAG(FFT1));
+#undef STEP_TAG
+	}
+
+	/* Create the join task, only serving as a dependency point between
+	 * fft1 and twist2 tasks */
+	plan->join_task = task = starpu_task_create();
+	task->cl = NULL;
+	task->tag_id = STEP_TAG_1D(plan, JOIN, 0);
+	task->use_tag = 1;
+	task->destroy = 0;
+
+	/* Create second-round tasks: DIV_1D batches of n2/DIV_1D twist2, fft2,
+	 * and twist3 */
+	for (z = 0; z < plan->totsize3; z++) {
+		int jj = z;
+#define STEP_TAG(step)	STEP_TAG_1D(plan, step, jj)
+
+		plan->fft2_args[z].plan = plan;
+		plan->fft2_args[z].jj = jj;
+
+		/* Register n3 twisted2 buffers of size n1 */
+		starpu_vector_data_register(&plan->twisted2_handle[z], 0, (uintptr_t) &plan->twisted2[z*plan->totsize4], plan->totsize4, sizeof(*plan->twisted2));
+		starpu_vector_data_register(&plan->fft2_handle[z], 0, (uintptr_t) &plan->fft2[z*plan->totsize4], plan->totsize4, sizeof(*plan->fft2));
+
+		/* We'll need the result of fft2 on the CPU for the third
+		 * twist anyway, so tell starpu to not keep the fft2 buffer in
+		 * the GPU. */
+		starpu_data_set_wt_mask(plan->fft2_handle[z], 1<<0);
+
+		/* Tell that twisted2 depends on the join task */
+		starpu_tag_declare_deps(STEP_TAG(TWIST2),
+				1, STEP_TAG_1D(plan, JOIN, 0));
+
+		/* Create twist2 task */
+		plan->twist2_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(twist2_1d_codelet);
+		task->handles[0] = plan->twisted2_handle[z];
+		task->cl_arg = &plan->fft2_args[z];
+		task->tag_id = STEP_TAG(TWIST2);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that fft2 depends on twisted2 */
+		starpu_tag_declare_deps(STEP_TAG(FFT2),
+				1, STEP_TAG(TWIST2));
+
+		/* Create FFT2 task */
+		plan->fft2_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(fft2_1d_codelet);
+		task->handles[0] = plan->twisted2_handle[z];
+		task->handles[1] = plan->fft2_handle[z];
+		task->cl_arg = &plan->fft2_args[z];
+		task->tag_id = STEP_TAG(FFT2);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that twist3 depends on fft2 */
+		starpu_tag_declare_deps(STEP_TAG(TWIST3),
+				1, STEP_TAG(FFT2));
+
+		/* Create twist3 tasks */
+		/* These run only on CPUs and thus write directly into the
+		 * application output buffer. */
+		plan->twist3_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(twist3_1d_codelet);
+		task->handles[0] = plan->fft2_handle[z];
+		task->cl_arg = &plan->fft2_args[z];
+		task->tag_id = STEP_TAG(TWIST3);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that the end task depends on this twist3 being finished */
+		starpu_tag_declare_deps(STEP_TAG_1D(plan, END, 0),
+				1, STEP_TAG(TWIST3));
+#undef STEP_TAG
+	}
+
+	/* Create end task, only serving as a join point. */
+	plan->end_task = task = starpu_task_create();
+	task->cl = NULL;
+	task->tag_id = STEP_TAG_1D(plan, END, 0);
+	task->use_tag = 1;
+	task->destroy = 0;
+
+}
+
+	return plan;
+}
+
+/* Actually submit all the tasks. */
+static struct starpu_task *
+STARPUFFT(start1dC2C)(STARPUFFT(plan) plan, starpu_data_handle_t in, starpu_data_handle_t out)
+{
+	STARPU_ASSERT(plan->type == C2C);
+	int z;
+	int ret;
+
+if (PARALLEL) {
+	for (z=0; z < plan->totsize1; z++) {
+		ret = starpu_task_submit(plan->twist1_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		ret = starpu_task_submit(plan->fft1_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	ret = starpu_task_submit(plan->join_task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	for (z=0; z < plan->totsize3; z++) {
+		ret = starpu_task_submit(plan->twist2_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		ret = starpu_task_submit(plan->fft2_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		ret = starpu_task_submit(plan->twist3_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	ret = starpu_task_submit(plan->end_task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	return plan->end_task;
+} else /* !PARALLEL */ {
+	struct starpu_task *task;
+
+	/* Create FFT task */
+	task = starpu_task_create();
+	task->detach = 0;
+	task->cl = &STARPUFFT(fft_1d_codelet);
+	task->handles[0] = in;
+	task->handles[1] = out;
+	task->cl_arg = plan;
+
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	return task;
+}
+}
+
+/* Free all the tags. The generic code handles freeing the buffers. */
+static void
+STARPUFFT(free_1d_tags)(STARPUFFT(plan) plan)
+{
+	unsigned i;
+	int n1 = plan->n1[0];
+
+	if (!PARALLEL)
+		return;
+
+	for (i = 0; i < n1; i++) {
+		starpu_tag_remove(STEP_TAG_1D(plan, TWIST1, i));
+		starpu_tag_remove(STEP_TAG_1D(plan, FFT1, i));
+	}
+
+	starpu_tag_remove(STEP_TAG_1D(plan, JOIN, 0));
+
+	for (i = 0; i < DIV_1D; i++) {
+		starpu_tag_remove(STEP_TAG_1D(plan, TWIST2, i));
+		starpu_tag_remove(STEP_TAG_1D(plan, FFT2, i));
+		starpu_tag_remove(STEP_TAG_1D(plan, TWIST3, i));
+	}
+
+	starpu_tag_remove(STEP_TAG_1D(plan, END, 0));
+}

+ 850 - 0
starpufft/starpufftx2d.c

@@ -0,0 +1,850 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#define DIV_2D_N 8
+#define DIV_2D_M 8
+
+#define I_SHIFT (I_BITS/2)
+#define J_BITS I_SHIFT
+
+#define STEP_TAG_2D(plan, step, i, j) _STEP_TAG(plan, step, ((starpu_tag_t) i << I_SHIFT) | (starpu_tag_t) j)
+
+#ifdef __STARPU_USE_CUDA
+/* Twist the full vector into a n2,m2 chunk */
+static void
+STARPUFFT(twist1_2d_kernel_gpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int j = args->j;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int m1 = plan->n1[1];
+	int m2 = plan->n2[1];
+
+	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
+	_cufftComplex * restrict twisted1 = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	STARPUFFT(cuda_twist1_2d_host)(in, twisted1, i, j, n1, n2, m1, m2);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+
+/* fft1:
+ *
+ * Perform one fft of size n2,m2 */
+static void
+STARPUFFT(fft1_2d_plan_gpu)(void *args)
+{
+	STARPUFFT(plan) plan = args;
+	int n2 = plan->n2[0];
+	int m2 = plan->n2[1];
+	int workerid = starpu_worker_get_id();
+	cufftResult cures;
+
+	cures = cufftPlan2d(&plan->plans[workerid].plan1_cuda, n2, m2, _CUFFT_C2C);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	cufftSetStream(plan->plans[workerid].plan1_cuda, starpu_cuda_get_local_stream());
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+}
+
+static void
+STARPUFFT(fft1_2d_kernel_gpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int j = args->j;
+	int n2 = plan->n2[0];
+	int m2 = plan->n2[1];
+	cufftResult cures;
+
+	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
+	_cufftComplex * restrict out = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
+	const _cufftComplex * restrict roots0 = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[2]);
+	const _cufftComplex * restrict roots1 = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[3]);
+
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	cures = _cufftExecC2C(plan->plans[workerid].plan1_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+
+	/* synchronization is done after the twiddling */
+	STARPUFFT(cuda_twiddle_2d_host)(out, roots0, roots1, n2, m2, i, j);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+
+/* fft2:
+ *
+ * Perform n3*m3 ffts of size n1,m1 */
+static void
+STARPUFFT(fft2_2d_plan_gpu)(void *args)
+{
+	STARPUFFT(plan) plan = args;
+	int n1 = plan->n1[0];
+	int m1 = plan->n1[1];
+	cufftResult cures;
+	int workerid = starpu_worker_get_id();
+
+	cures = cufftPlan2d(&plan->plans[workerid].plan2_cuda, n1, m1, _CUFFT_C2C);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	cufftSetStream(plan->plans[workerid].plan2_cuda, starpu_cuda_get_local_stream());
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+}
+
+static void
+STARPUFFT(fft2_2d_kernel_gpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int m1 = plan->n1[1];
+	int m2 = plan->n2[1];
+	int n3 = n2/DIV_2D_N;
+	int m3 = m2/DIV_2D_M;
+	int n;
+	cufftResult cures;
+
+	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
+	_cufftComplex * restrict out = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	for (n = 0; n < n3*m3; n++) {
+		cures = _cufftExecC2C(plan->plans[workerid].plan2_cuda, in + n * n1*m1, out + n * n1*m1, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
+		STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	}
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+#endif
+
+/* Twist the full vector into a n2,m2 chunk */
+static void
+STARPUFFT(twist1_2d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int j = args->j;
+	int k, l;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int m1 = plan->n1[1];
+	int m2 = plan->n2[1];
+	int m = plan->n[1];
+
+	STARPUFFT(complex) * restrict in = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+	STARPUFFT(complex) * restrict twisted1 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	/* printf("twist1 %d %d %g\n", i, j, (double) cabs(plan->in[i+j])); */
+
+	for (k = 0; k < n2; k++)
+		for (l = 0; l < m2; l++)
+			twisted1[k*m2+l] = in[i*m+j+k*m*n1+l*m1];
+}
+
+#ifdef STARPU_HAVE_FFTW
+/* Perform an n2,m2 fft */
+static void
+STARPUFFT(fft1_2d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int j = args->j;
+	int k, l;
+	int n2 = plan->n2[0];
+	int m2 = plan->n2[1];
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	STARPUFFT(complex) *twisted1 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+	STARPUFFT(complex) *fft1 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	/* printf("fft1 %d %d %g\n", i, j, (double) cabs(twisted1[0])); */
+
+	_FFTW(execute_dft)(plan->plans[workerid].plan1_cpu, twisted1, fft1);
+	for (k = 0; k < n2; k++)
+		for (l = 0; l < m2; l++)
+			fft1[k*m2 + l] = fft1[k*m2 + l] * plan->roots[0][i*k] * plan->roots[1][j*l];
+}
+#endif
+
+/* Twist the full vector into a package of n2/DIV_2D_N,m2/DIV_2D_M (n1,m1) chunks */
+static void
+STARPUFFT(twist2_2d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int kk = args->kk;	/* between 0 and DIV_2D_N */
+	int ll = args->ll;	/* between 0 and DIV_2D_M */
+	int kkk, lll;		/* between 0,0 and n3,m3 */
+	int i, j;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int m1 = plan->n1[1];
+	int m2 = plan->n2[1];
+	int n3 = n2/DIV_2D_N;
+	int m3 = m2/DIV_2D_M;
+
+	STARPUFFT(complex) * restrict twisted2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+
+	/* printf("twist2 %d %d %g\n", kk, ll, (double) cabs(plan->fft1[kk+ll])); */
+
+	for (kkk = 0; kkk < n3; kkk++) {
+		int k = kk * n3 + kkk;
+		for (lll = 0; lll < m3; lll++) {
+			int l = ll * m3 + lll;
+			for (i = 0; i < n1; i++)
+				for (j = 0; j < m1; j++)
+					twisted2[kkk*m3*n1*m1+lll*n1*m1+i*m1+j] = plan->fft1[i*n1*n2*m2+j*n2*m2+k*m2+l];
+		}
+	}
+}
+
+#ifdef STARPU_HAVE_FFTW
+/* Perform (n2/DIV_2D_N)*(m2/DIV_2D_M) (n1,m1) ffts */
+static void
+STARPUFFT(fft2_2d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	/* int kk = args->kk; */
+	/* int ll = args->ll; */
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	STARPUFFT(complex) *twisted2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+	STARPUFFT(complex) *fft2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	/* printf("fft2 %d %d %g\n", kk, ll, (double) cabs(twisted2[plan->totsize4-1])); */
+
+	_FFTW(execute_dft)(plan->plans[workerid].plan2_cpu, twisted2, fft2);
+}
+#endif
+
+/* Spread the package of (n2/DIV_2D_N)*(m2/DIV_2D_M) (n1,m1) chunks into the full vector */
+static void
+STARPUFFT(twist3_2d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int kk = args->kk;	/* between 0 and DIV_2D_N */
+	int ll = args->ll;	/* between 0 and DIV_2D_M */
+	int kkk, lll;		/* between 0,0 and n3,m3 */
+	int i, j;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int m1 = plan->n1[1];
+	int m2 = plan->n2[1];
+	int n3 = n2/DIV_2D_N;
+	int m3 = m2/DIV_2D_M;
+	int m = plan->n[1];
+
+	const STARPUFFT(complex) * restrict fft2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+
+	/* printf("twist3 %d %d %g\n", kk, ll, (double) cabs(fft2[0])); */
+
+	for (kkk = 0; kkk < n3; kkk++) {
+		int k = kk * n3 + kkk;
+		for (lll = 0; lll < m3; lll++) {
+			int l = ll * m3 + lll;
+			for (i = 0; i < n1; i++)
+				for (j = 0; j < m1; j++)
+					plan->out[i*n2*m+j*m2+k*m+l] = fft2[kkk*m3*n1*m1+lll*n1*m1+i*m1+j];
+		}
+	}
+}
+
+struct starpu_perfmodel STARPUFFT(twist1_2d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"twist1_2d"
+};
+
+struct starpu_perfmodel STARPUFFT(fft1_2d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"fft1_2d"
+};
+
+struct starpu_perfmodel STARPUFFT(twist2_2d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"twist2_2d"
+};
+
+struct starpu_perfmodel STARPUFFT(fft2_2d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"fft2_2d"
+};
+
+struct starpu_perfmodel STARPUFFT(twist3_2d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"twist3_2d"
+};
+
+static struct starpu_codelet STARPUFFT(twist1_2d_codelet) = {
+	.where =
+#ifdef __STARPU_USE_CUDA
+		STARPU_CUDA|
+#endif
+		STARPU_CPU,
+#ifdef __STARPU_USE_CUDA
+	.cuda_funcs = {STARPUFFT(twist1_2d_kernel_gpu), NULL},
+#endif
+	.cpu_funcs = {STARPUFFT(twist1_2d_kernel_cpu), NULL},
+	CAN_EXECUTE
+	.model = &STARPUFFT(twist1_2d_model),
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+static struct starpu_codelet STARPUFFT(fft1_2d_codelet) = {
+	.where =
+#ifdef __STARPU_USE_CUDA
+		STARPU_CUDA|
+#endif
+#ifdef STARPU_HAVE_FFTW
+		STARPU_CPU|
+#endif
+		0,
+#ifdef __STARPU_USE_CUDA
+	.cuda_funcs = {STARPUFFT(fft1_2d_kernel_gpu), NULL},
+#endif
+#ifdef STARPU_HAVE_FFTW
+	.cpu_funcs = {STARPUFFT(fft1_2d_kernel_cpu), NULL},
+#endif
+	CAN_EXECUTE
+	.model = &STARPUFFT(fft1_2d_model),
+	.nbuffers = 4,
+	.modes = {STARPU_R, STARPU_W, STARPU_R, STARPU_R}
+};
+
+static struct starpu_codelet STARPUFFT(twist2_2d_codelet) = {
+	.where = STARPU_CPU,
+	.cpu_funcs = {STARPUFFT(twist2_2d_kernel_cpu), NULL},
+	CAN_EXECUTE
+	.model = &STARPUFFT(twist2_2d_model),
+	.nbuffers = 1,
+	.modes = {STARPU_W}
+};
+
+static struct starpu_codelet STARPUFFT(fft2_2d_codelet) = {
+	.where =
+#ifdef __STARPU_USE_CUDA
+		STARPU_CUDA|
+#endif
+#ifdef STARPU_HAVE_FFTW
+		STARPU_CPU|
+#endif
+		0,
+#ifdef __STARPU_USE_CUDA
+	.cuda_funcs = {STARPUFFT(fft2_2d_kernel_gpu), NULL},
+#endif
+#ifdef STARPU_HAVE_FFTW
+	.cpu_funcs = {STARPUFFT(fft2_2d_kernel_cpu), NULL},
+#endif
+	CAN_EXECUTE
+	.model = &STARPUFFT(fft2_2d_model),
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+static struct starpu_codelet STARPUFFT(twist3_2d_codelet) = {
+	.where = STARPU_CPU,
+	.cpu_funcs = {STARPUFFT(twist3_2d_kernel_cpu), NULL},
+	CAN_EXECUTE
+	.model = &STARPUFFT(twist3_2d_model),
+	.nbuffers = 1,
+	.modes = {STARPU_R}
+};
+
+/*
+ *
+ * Sequential version
+ *
+ */
+
+#ifdef __STARPU_USE_CUDA
+/* Perform one fft of size n,m */
+static void
+STARPUFFT(fft_2d_plan_gpu)(void *args)
+{
+	STARPUFFT(plan) plan = args;
+	cufftResult cures;
+	int n = plan->n[0];
+	int m = plan->n[1];
+	int workerid = starpu_worker_get_id();
+
+	cures = cufftPlan2d(&plan->plans[workerid].plan_cuda, n, m, _CUFFT_C2C);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	cufftSetStream(plan->plans[workerid].plan_cuda, starpu_cuda_get_local_stream());
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+}
+
+static void
+STARPUFFT(fft_2d_kernel_gpu)(void *descr[], void *args)
+{
+	STARPUFFT(plan) plan = args;
+	cufftResult cures;
+
+	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
+	_cufftComplex * restrict out = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	cures = _cufftExecC2C(plan->plans[workerid].plan_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+#endif
+
+#ifdef STARPU_HAVE_FFTW
+/* Perform one fft of size n,m */
+static void
+STARPUFFT(fft_2d_kernel_cpu)(void *descr[], void *_args)
+{
+	STARPUFFT(plan) plan = _args;
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	STARPUFFT(complex) * restrict in = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+	STARPUFFT(complex) * restrict out = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	_FFTW(execute_dft)(plan->plans[workerid].plan_cpu, in, out);
+}
+#endif
+
+static struct starpu_perfmodel STARPUFFT(fft_2d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"fft_2d"
+};
+
+static struct starpu_codelet STARPUFFT(fft_2d_codelet) = {
+	.where =
+#ifdef __STARPU_USE_CUDA
+		STARPU_CUDA|
+#endif
+#ifdef STARPU_HAVE_FFTW
+		STARPU_CPU|
+#endif
+		0,
+#ifdef __STARPU_USE_CUDA
+	.cuda_funcs = {STARPUFFT(fft_2d_kernel_gpu), NULL},
+#endif
+#ifdef STARPU_HAVE_FFTW
+	.cpu_funcs = {STARPUFFT(fft_2d_kernel_cpu), NULL},
+#endif
+	CAN_EXECUTE
+	.model = &STARPUFFT(fft_2d_model),
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+STARPUFFT(plan)
+STARPUFFT(plan_dft_2d)(int n, int m, int sign, unsigned flags)
+{
+	int workerid;
+	int n1 = DIV_2D_N;
+	int n2 = n / n1;
+	int n3;
+	int m1 = DIV_2D_M;
+	int m2 = m / m1;
+	int m3;
+	int z;
+	struct starpu_task *task;
+
+if (PARALLEL) {
+	/*
+	 * Simple strategy:
+	 *
+	 * - twist1: twist input in n1*m1 (n2,m2) chunks
+	 * - fft1:   perform n1*m1 (n2,m2) ffts
+	 * - twist2: twist into n2*m2 (n1,m1) chunks distributed in
+	 *           DIV_2D_N*DIV_2D_M groups
+	 * - fft2:   perform DIV_2D_N*DIV_2D_M times n3*m3 (n1,m1) ffts
+	 * - twist3: twist back into output
+	 */
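+
+	/*
+	 * Concrete illustration (numbers chosen for this comment only): for a
+	 * 1024 x 1024 transform with DIV_2D_N = DIV_2D_M = 8, n1 = m1 = 8 and
+	 * n2 = m2 = 128.  The first round thus runs 64 twist1/fft1 tasks on
+	 * (128,128) chunks, and the second round runs 64 batches, each
+	 * performing n3*m3 = 16*16 = 256 ffts of size (8,8).
+	 */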
+
+#ifdef __STARPU_USE_CUDA
+	/* cufft 2D-3D limited to [2,16384] */
+	while (n2 > 16384) {
+		n1 *= 2;
+		n2 /= 2;
+	}
+#endif
+	STARPU_ASSERT(n == n1*n2);
+	STARPU_ASSERT(n1 < (1ULL << J_BITS));
+
+
+#ifdef __STARPU_USE_CUDA
+	/* cufft 2D-3D limited to [2,16384] */
+	while (m2 > 16384) {
+		m1 *= 2;
+		m2 /= 2;
+	}
+#endif
+	STARPU_ASSERT(m == m1*m2);
+	STARPU_ASSERT(m1 < (1ULL << J_BITS));
+
+	/* distribute the n2*m2 second ffts into DIV_2D_N*DIV_2D_M packages */
+	n3 = n2 / DIV_2D_N;
+	STARPU_ASSERT(n2 == n3*DIV_2D_N);
+	m3 = m2 / DIV_2D_M;
+	STARPU_ASSERT(m2 == m3*DIV_2D_M);
+}
+
+	/* TODO: flags? Automatically set FFTW_MEASURE on calibration? */
+	STARPU_ASSERT(flags == 0);
+
+	STARPUFFT(plan) plan = malloc(sizeof(*plan));
+	memset(plan, 0, sizeof(*plan));
+
+if (PARALLEL) {
+	plan->number = STARPU_ATOMIC_ADD(&starpufft_last_plan_number, 1) - 1;
+
+	/* The plan number has a limited size in the tag space */
+	STARPU_ASSERT(plan->number < (1ULL << NUMBER_BITS));
+}
+
+	plan->dim = 2;
+	plan->n = malloc(plan->dim * sizeof(*plan->n));
+	plan->n[0] = n;
+	plan->n[1] = m;
+
+if (PARALLEL) {
+	check_dims(plan);
+
+	plan->n1 = malloc(plan->dim * sizeof(*plan->n1));
+	plan->n1[0] = n1;
+	plan->n1[1] = m1;
+	plan->n2 = malloc(plan->dim * sizeof(*plan->n2));
+	plan->n2[0] = n2;
+	plan->n2[1] = m2;
+}
+
+	plan->totsize = n * m;
+
+if (PARALLEL) {
+	plan->totsize1 = n1 * m1;
+	plan->totsize2 = n2 * m2;
+	plan->totsize3 = DIV_2D_N * DIV_2D_M;
+	plan->totsize4 = plan->totsize / plan->totsize3;
+}
+	plan->type = C2C;
+	plan->sign = sign;
+
+if (PARALLEL) {
+	/* Compute the w^k just once. */
+	compute_roots(plan);
+}
+
+	/* Initialize per-worker working set */
+	for (workerid = 0; workerid < starpu_worker_get_count(); workerid++) {
+		switch (starpu_worker_get_type(workerid)) {
+		case STARPU_CPU_WORKER:
+#ifdef STARPU_HAVE_FFTW
+if (PARALLEL) {
+			/* first fft plan: one n2*m2 fft */
+			plan->plans[workerid].plan1_cpu = _FFTW(plan_dft_2d)(n2, m2, NULL, (void*) 1, sign, _FFTW_FLAGS);
+			STARPU_ASSERT(plan->plans[workerid].plan1_cpu);
+
+			/* second fft plan: n3*m3 ffts of size n1*m1 */
+			plan->plans[workerid].plan2_cpu = _FFTW(plan_many_dft)(plan->dim,
+					plan->n1, n3*m3,
+					NULL, NULL, 1, plan->totsize1,
+					(void*) 1, NULL, 1, plan->totsize1,
+					sign, _FFTW_FLAGS);
+			STARPU_ASSERT(plan->plans[workerid].plan2_cpu);
+} else {
+			/* fft plan: one fft of size n, m. */
+			plan->plans[workerid].plan_cpu = _FFTW(plan_dft_2d)(n, m, NULL, (void*) 1, sign, _FFTW_FLAGS);
+			STARPU_ASSERT(plan->plans[workerid].plan_cpu);
+}
+#else
+/* #warning libstarpufft cannot work correctly if libfftw3 is not installed */
+#endif
+			break;
+		case STARPU_CUDA_WORKER:
+			break;
+		default:
+			/* Nothing to do: no FFT kernels will run on this kind of worker. */
+			break;
+		}
+	}
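+	/* The GPU-side plans below are built via starpu_execute_on_each_worker
+	 * so that each cufft plan is created by the CUDA worker that will use
+	 * it (cufft plans are tied to the CUDA context current at creation time). */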
+#ifdef __STARPU_USE_CUDA
+if (PARALLEL) {
+	starpu_execute_on_each_worker(STARPUFFT(fft1_2d_plan_gpu), plan, STARPU_CUDA);
+	starpu_execute_on_each_worker(STARPUFFT(fft2_2d_plan_gpu), plan, STARPU_CUDA);
+} else {
+	starpu_execute_on_each_worker(STARPUFFT(fft_2d_plan_gpu), plan, STARPU_CUDA);
+}
+#endif
+
+if (PARALLEL) {
+	/* Allocate buffers. */
+	plan->twisted1 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->twisted1));
+	memset(plan->twisted1, 0, plan->totsize * sizeof(*plan->twisted1));
+	plan->fft1 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->fft1));
+	memset(plan->fft1, 0, plan->totsize * sizeof(*plan->fft1));
+	plan->twisted2 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->twisted2));
+	memset(plan->twisted2, 0, plan->totsize * sizeof(*plan->twisted2));
+	plan->fft2 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->fft2));
+	memset(plan->fft2, 0, plan->totsize * sizeof(*plan->fft2));
+
+	/* Allocate handle arrays */
+	plan->twisted1_handle = malloc(plan->totsize1 * sizeof(*plan->twisted1_handle));
+	plan->fft1_handle = malloc(plan->totsize1 * sizeof(*plan->fft1_handle));
+	plan->twisted2_handle = malloc(plan->totsize3 * sizeof(*plan->twisted2_handle));
+	plan->fft2_handle = malloc(plan->totsize3 * sizeof(*plan->fft2_handle));
+
+	/* Allocate task arrays */
+	plan->twist1_tasks = malloc(plan->totsize1 * sizeof(*plan->twist1_tasks));
+	plan->fft1_tasks = malloc(plan->totsize1 * sizeof(*plan->fft1_tasks));
+	plan->twist2_tasks = malloc(plan->totsize3 * sizeof(*plan->twist2_tasks));
+	plan->fft2_tasks = malloc(plan->totsize3 * sizeof(*plan->fft2_tasks));
+	plan->twist3_tasks = malloc(plan->totsize3 * sizeof(*plan->twist3_tasks));
+
+	/* Allocate codelet argument arrays */
+	plan->fft1_args = malloc(plan->totsize1 * sizeof(*plan->fft1_args));
+	plan->fft2_args = malloc(plan->totsize3 * sizeof(*plan->fft2_args));
+
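+	/*
+	 * Summary of the tag dependency chain built below, using the step
+	 * names of the STEP_TAG_2D tags:
+	 *   TWIST1(i,j) -> FFT1(i,j) -> JOIN -> TWIST2(kk,ll) -> FFT2(kk,ll)
+	 *   -> TWIST3(kk,ll) -> END
+	 */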
+	/* Create first-round tasks */
+	for (z = 0; z < plan->totsize1; z++) {
+		int i = z / m1, j = z % m1;
+#define STEP_TAG(step)	STEP_TAG_2D(plan, step, i, j)
+
+		/* TODO: get rid of tags */
+
+		plan->fft1_args[z].plan = plan;
+		plan->fft1_args[z].i = i;
+		plan->fft1_args[z].j = j;
+
+		/* Register (n2,m2) chunks */
+		starpu_vector_data_register(&plan->twisted1_handle[z], 0, (uintptr_t) &plan->twisted1[z*plan->totsize2], plan->totsize2, sizeof(*plan->twisted1));
+		starpu_vector_data_register(&plan->fft1_handle[z], 0, (uintptr_t) &plan->fft1[z*plan->totsize2], plan->totsize2, sizeof(*plan->fft1));
+
+		/* We'll need it on the CPU for the second twist anyway */
+		starpu_data_set_wt_mask(plan->fft1_handle[z], 1<<0);
+
+		/* Create twist1 task */
+		plan->twist1_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(twist1_2d_codelet);
+		/* task->handles[0] = to be filled at execution */
+		task->handles[1] = plan->twisted1_handle[z];
+		task->cl_arg = &plan->fft1_args[z];
+		task->tag_id = STEP_TAG(TWIST1);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that fft1 depends on twisted1 */
+		starpu_tag_declare_deps(STEP_TAG(FFT1),
+				1, STEP_TAG(TWIST1));
+
+		/* Create FFT1 task */
+		plan->fft1_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(fft1_2d_codelet);
+		task->handles[0] = plan->twisted1_handle[z];
+		task->handles[1] = plan->fft1_handle[z];
+		task->handles[2] = plan->roots_handle[0];
+		task->handles[3] = plan->roots_handle[1];
+		task->cl_arg = &plan->fft1_args[z];
+		task->tag_id = STEP_TAG(FFT1);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that the first step (JOIN) is not done until this
+		 * fft1 has finished */
+		starpu_tag_declare_deps(STEP_TAG_2D(plan, JOIN, 0, 0),
+				1, STEP_TAG(FFT1));
+#undef STEP_TAG
+	}
+
+	/* Create join task */
+	plan->join_task = task = starpu_task_create();
+	task->cl = NULL;
+	task->tag_id = STEP_TAG_2D(plan, JOIN, 0, 0);
+	task->use_tag = 1;
+	task->destroy = 0;
+
+	/* Create second-round tasks */
+	for (z = 0; z < plan->totsize3; z++) {
+		int kk = z / DIV_2D_M, ll = z % DIV_2D_M;
+#define STEP_TAG(step)	STEP_TAG_2D(plan, step, kk, ll)
+
+		plan->fft2_args[z].plan = plan;
+		plan->fft2_args[z].kk = kk;
+		plan->fft2_args[z].ll = ll;
+
+		/* Register n3*m3 (n1,m1) chunks */
+		starpu_vector_data_register(&plan->twisted2_handle[z], 0, (uintptr_t) &plan->twisted2[z*plan->totsize4], plan->totsize4, sizeof(*plan->twisted2));
+		starpu_vector_data_register(&plan->fft2_handle[z], 0, (uintptr_t) &plan->fft2[z*plan->totsize4], plan->totsize4, sizeof(*plan->fft2));
+
+		/* We'll need it on the CPU for the last twist anyway */
+		starpu_data_set_wt_mask(plan->fft2_handle[z], 1<<0);
+
+		/* Tell that twisted2 depends on the whole first step to be
+		 * done */
+		starpu_tag_declare_deps(STEP_TAG(TWIST2),
+				1, STEP_TAG_2D(plan, JOIN, 0, 0));
+
+		/* Create twist2 task */
+		plan->twist2_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(twist2_2d_codelet);
+		task->handles[0] = plan->twisted2_handle[z];
+		task->cl_arg = &plan->fft2_args[z];
+		task->tag_id = STEP_TAG(TWIST2);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that fft2 depends on twisted2 */
+		starpu_tag_declare_deps(STEP_TAG(FFT2),
+				1, STEP_TAG(TWIST2));
+
+		/* Create FFT2 task */
+		plan->fft2_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(fft2_2d_codelet);
+		task->handles[0] = plan->twisted2_handle[z];
+		task->handles[1] = plan->fft2_handle[z];
+		task->cl_arg = &plan->fft2_args[z];
+		task->tag_id = STEP_TAG(FFT2);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that twist3 depends on fft2 */
+		starpu_tag_declare_deps(STEP_TAG(TWIST3),
+				1, STEP_TAG(FFT2));
+
+		/* Create twist3 task. It runs only on CPUs and thus writes
+		 * directly into the application output buffer. */
+		plan->twist3_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(twist3_2d_codelet);
+		task->handles[0] = plan->fft2_handle[z];
+		task->cl_arg = &plan->fft2_args[z];
+		task->tag_id = STEP_TAG(TWIST3);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that the whole plan (END) is not finished until this twist3 has finished */
+		starpu_tag_declare_deps(STEP_TAG_2D(plan, END, 0, 0),
+				1, STEP_TAG(TWIST3));
+#undef STEP_TAG
+	}
+
+	/* Create end task */
+	plan->end_task = task = starpu_task_create();
+	task->cl = NULL;
+	task->tag_id = STEP_TAG_2D(plan, END, 0, 0);
+	task->use_tag = 1;
+	task->destroy = 0;
+
+}
+
+	return plan;
+}
+
+/* Actually submit all the tasks. */
+static struct starpu_task *
+STARPUFFT(start2dC2C)(STARPUFFT(plan) plan, starpu_data_handle_t in, starpu_data_handle_t out)
+{
+	STARPU_ASSERT(plan->type == C2C);
+	int z;
+	int ret;
+
+if (PARALLEL) {
+	for (z=0; z < plan->totsize1; z++) {
+		ret = starpu_task_submit(plan->twist1_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		ret = starpu_task_submit(plan->fft1_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	ret = starpu_task_submit(plan->join_task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	for (z=0; z < plan->totsize3; z++) {
+		ret = starpu_task_submit(plan->twist2_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		ret = starpu_task_submit(plan->fft2_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		ret = starpu_task_submit(plan->twist3_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	ret = starpu_task_submit(plan->end_task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	return plan->end_task;
+} else /* !PARALLEL */ {
+	struct starpu_task *task;
+
+	/* Create FFT task */
+	task = starpu_task_create();
+	task->detach = 0;
+	task->cl = &STARPUFFT(fft_2d_codelet);
+	task->handles[0] = in;
+	task->handles[1] = out;
+	task->cl_arg = plan;
+
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	return task;
+}
+}
+
+/* Free all the tags. The generic code handles freeing the buffers. */
+static void
+STARPUFFT(free_2d_tags)(STARPUFFT(plan) plan)
+{
+	unsigned i, j;
+	int n1 = plan->n1[0];
+	int m1 = plan->n1[1];
+
+	if (!PARALLEL)
+		return;
+
+	for (i = 0; i < n1; i++) {
+		for (j = 0; j < m1; j++) {
+			starpu_tag_remove(STEP_TAG_2D(plan, TWIST1, i, j));
+			starpu_tag_remove(STEP_TAG_2D(plan, FFT1, i, j));
+		}
+	}
+
+	starpu_tag_remove(STEP_TAG_2D(plan, JOIN, 0, 0));
+
+	for (i = 0; i < DIV_2D_N; i++) {
+		for (j = 0; j < DIV_2D_M; j++) {
+			starpu_tag_remove(STEP_TAG_2D(plan, TWIST2, i, j));
+			starpu_tag_remove(STEP_TAG_2D(plan, FFT2, i, j));
+			starpu_tag_remove(STEP_TAG_2D(plan, TWIST3, i, j));
+		}
+	}
+
+	starpu_tag_remove(STEP_TAG_2D(plan, END, 0, 0));
+}
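
For reference, a minimal sketch of how an application is expected to drive this 2D code path through the public libstarpufft API — assuming the double-precision entry points declared in starpufft.h from this commit; the forward sign follows the FFTW convention and error checking is omitted:

	#include <starpufft.h>

	void fft2d_sketch(int n, int m)
	{
		/* Buffers allocated through StarPU so they can be registered cheaply. */
		starpufft_complex *in  = starpufft_malloc(n * m * sizeof(*in));
		starpufft_complex *out = starpufft_malloc(n * m * sizeof(*out));

		/* ... fill 'in' with n*m samples ... */

		/* sign -1 = forward transform; flags must currently be 0. */
		starpufft_plan plan = starpufft_plan_dft_2d(n, m, -1, 0);
		starpufft_execute(plan, in, out);	/* submits the task graph and waits */

		starpufft_destroy_plan(plan);
		starpufft_free(in);
		starpufft_free(out);
	}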

+ 272 - 32
tests/Makefile.am

@@ -1,8 +1,8 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
-# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
-# Copyright (C) 2010, 2011  Institut National de Recherche en Informatique et Automatique
+# Copyright (C) 2009, 2010, 2011-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011, 2012  Institut National de Recherche en Informatique et Automatique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -16,15 +16,28 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 AM_CFLAGS = $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
-LIBS = $(top_builddir)/src/libstarpu.la $(HWLOC_LIBS) @LIBS@
+LIBS = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la $(HWLOC_LIBS) @LIBS@
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/src -I$(top_srcdir)/src/
 AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
 
 EXTRA_DIST =					\
+	helper.h				\
+	datawizard/scal.h			\
 	microbenchs/null_kernel_gordon.c	\
 	datawizard/sync_and_notify_data_gordon_kernels.c \
 	datawizard/sync_and_notify_data_opencl_codelet.cl\
-	coverage/coverage.sh
+	coverage/coverage.sh			\
+	datawizard/interfaces/test_interfaces.h	\
+	datawizard/interfaces/bcsr/bcsr_opencl_kernel.cl \
+	datawizard/interfaces/matrix/matrix_opencl_kernel.cl \
+	datawizard/interfaces/variable/variable_opencl_kernel.cl \
+	datawizard/interfaces/vector/test_vector_opencl_kernel.cl \
+	datawizard/interfaces/multiformat/multiformat_types.h \
+	datawizard/interfaces/multiformat/multiformat_opencl_kernel.cl \
+	datawizard/interfaces/multiformat/multiformat_conversion_codelets_kernel.cl \
+	datawizard/interfaces/multiformat/advanced/generic.h \
+	datawizard/interfaces/csr/csr_opencl_kernel.cl \
+	datawizard/interfaces/block/block_opencl_kernel.cl
 
 CLEANFILES = 					\
 	*.gcno *.gcda *.linkinfo		\
@@ -43,7 +56,7 @@ if STARPU_USE_CUDA
 # TODO define NVCCFLAGS
 NVCC ?= nvcc
 
-NVCCFLAGS += -I$(top_srcdir)/include/ -I$(top_builddir)/include $(HWLOC_CFLAGS)
+NVCCFLAGS += -I$(top_srcdir)/include/ -I$(top_srcdir)/src -I$(top_builddir)/src -I$(top_builddir)/include $(HWLOC_CFLAGS)
 
 .cu.cubin:
 	$(MKDIR_P) `dirname $@`
@@ -83,7 +96,7 @@ if !STARPU_HAVE_WINDOWS
 ## test loader program
 LOADER			=	loader
 LOADER_BIN		=	$(abs_top_builddir)/tests/$(LOADER)
-TESTS_ENVIRONMENT	=	$(LOADER_BIN)
+TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" $(LOADER_BIN)
 endif
 
 TESTS = $(noinst_PROGRAMS)
@@ -92,31 +105,39 @@ if STARPU_COVERAGE_ENABLED
 TESTS	+=	coverage/coverage.sh
 endif
 
+starpu_machine_display_SOURCES	=	../tools/starpu_machine_display.c
+
 noinst_PROGRAMS =				\
-	core/restart				\
-	core/execute_on_a_specific_worker	\
-	core/insert_task			\
-	core/multithreaded			\
-	core/multithreaded_init			\
-	core/starpu_task_wait_for_all		\
-	core/starpu_task_wait			\
-	core/static_restartable			\
-	core/static_restartable_using_initializer\
-	core/static_restartable_tag		\
-	core/regenerate				\
-	core/wait_all_regenerable_tasks		\
-	core/subgraph_repeat			\
-	core/subgraph_repeat_regenerate		\
-	core/empty_task				\
-	core/empty_task_sync_point		\
-	core/empty_task_sync_point_tasks	\
-	core/empty_task_chain			\
-	core/tag_wait_api			\
-	core/task_wait_api			\
-	core/declare_deps_in_callback		\
-	core/declare_deps_after_submission	\
-	core/declare_deps_after_submission_synchronous	\
-	core/get_current_task			\
+	starpu_machine_display			\
+	main/deprecated_func			\
+	main/deprecated_buffer			\
+	main/restart				\
+	main/execute_on_a_specific_worker	\
+	main/insert_task			\
+	main/multithreaded			\
+	main/multithreaded_init			\
+	main/starpu_task_bundle			\
+	main/starpu_task_wait_for_all		\
+	main/starpu_task_wait			\
+	main/static_restartable			\
+	main/static_restartable_using_initializer\
+	main/static_restartable_tag		\
+	main/regenerate				\
+	main/wait_all_regenerable_tasks		\
+	main/subgraph_repeat			\
+	main/subgraph_repeat_regenerate		\
+	main/empty_task				\
+	main/empty_task_sync_point		\
+	main/empty_task_sync_point_tasks	\
+	main/empty_task_chain			\
+	main/tag_wait_api			\
+	main/task_wait_api			\
+	main/declare_deps_in_callback		\
+	main/declare_deps_after_submission	\
+	main/declare_deps_after_submission_synchronous	\
+	main/get_current_task			\
+	main/starpu_init			\
+	main/starpu_worker_exists               \
 	datawizard/acquire_cb			\
 	datawizard/acquire_cb_insert		\
 	datawizard/acquire_release		\
@@ -128,6 +149,7 @@ noinst_PROGRAMS =				\
 	datawizard/sync_and_notify_data		\
 	datawizard/sync_and_notify_data_implicit\
 	datawizard/dsm_stress			\
+	datawizard/double_parameter		\
 	datawizard/write_only_tmp_buffer	\
 	datawizard/data_invalidation		\
 	datawizard/dining_philosophers		\
@@ -144,8 +166,26 @@ noinst_PROGRAMS =				\
 	datawizard/critical_section_with_void_interface\
 	datawizard/increment_redux		\
 	datawizard/increment_redux_v2		\
+	datawizard/increment_redux_lazy		\
 	datawizard/handle_to_pointer		\
 	datawizard/lazy_allocation		\
+	datawizard/interfaces/copy_interfaces	\
+	datawizard/interfaces/block/block_interface \
+	datawizard/interfaces/bcsr/bcsr_interface \
+	datawizard/interfaces/csr/csr_interface \
+	datawizard/interfaces/matrix/matrix_interface \
+	datawizard/interfaces/multiformat/multiformat_interface \
+	datawizard/interfaces/multiformat/advanced/multiformat_cuda_opencl \
+	datawizard/interfaces/multiformat/advanced/multiformat_data_release \
+	datawizard/interfaces/multiformat/advanced/multiformat_worker \
+	datawizard/interfaces/multiformat/advanced/multiformat_handle_conversion \
+	datawizard/interfaces/multiformat/advanced/same_handle \
+	datawizard/interfaces/variable/variable_interface    \
+	datawizard/interfaces/vector/test_vector_interface   \
+	datawizard/interfaces/void/void_interface \
+	datawizard/in_place_partition   	\
+	datawizard/partition_lazy		\
+	datawizard/gpu_register   		\
 	errorcheck/starpu_init_noworker		\
 	errorcheck/invalid_blocking_calls	\
 	errorcheck/invalid_tasks		\
@@ -165,7 +205,7 @@ noinst_PROGRAMS =				\
 	parallel_tasks/parallel_kernels		\
 	parallel_tasks/parallel_kernels_spmd	\
 	perfmodels/regression_based		\
-	perfmodels/non_linear_regression_based
+	perfmodels/non_linear_regression_based 
 
 if STARPU_HAVE_WINDOWS
 check_PROGRAMS = $(noinst_PROGRAMS)
@@ -236,6 +276,42 @@ datawizard_sync_and_notify_data_implicit_SOURCES +=	\
 	datawizard/sync_and_notify_data_opencl.c
 endif
 
+datawizard_in_place_partition_SOURCES =	\
+	datawizard/in_place_partition.c	\
+	datawizard/scal.c
+if STARPU_USE_CUDA
+datawizard_in_place_partition_SOURCES +=	\
+	datawizard/scal_cuda.cu
+endif
+if STARPU_USE_OPENCL
+datawizard_in_place_partition_SOURCES +=	\
+	datawizard/scal_opencl.cl
+endif
+
+datawizard_partition_lazy_SOURCES =	\
+	datawizard/partition_lazy.c	\
+	datawizard/scal.c
+if STARPU_USE_CUDA
+datawizard_partition_lazy_SOURCES +=	\
+	datawizard/scal_cuda.cu
+endif
+if STARPU_USE_OPENCL
+datawizard_partition_lazy_SOURCES +=	\
+	datawizard/scal_opencl.cl
+endif
+
+datawizard_gpu_register_SOURCES =	\
+	datawizard/gpu_register.c	\
+	datawizard/scal.c
+if STARPU_USE_CUDA
+datawizard_gpu_register_SOURCES +=	\
+	datawizard/scal_cuda.cu
+endif
+if STARPU_USE_OPENCL
+datawizard_gpu_register_SOURCES +=	\
+	datawizard/scal_opencl.cl
+endif
+
 if STARPU_USE_GORDON
 datawizard_sync_and_notify_data_SOURCES +=	\
 	datawizard/sync_and_notify_data_gordon_kernels.c
@@ -245,3 +321,167 @@ BUILT_SOURCES += 						\
 	datawizard/sync_and_notify_data_gordon_kernels.spuelf	\
 	microbenchs/null_kernel_gordon.spuelf
 endif
+
+###################
+# Block interface #
+###################
+datawizard_interfaces_block_block_interface_SOURCES= \
+	datawizard/interfaces/test_interfaces.c  \
+	datawizard/interfaces/block/block_interface.c
+
+if STARPU_USE_CUDA
+datawizard_interfaces_block_block_interface_SOURCES+= \
+	datawizard/interfaces/block/block_cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+datawizard_interfaces_block_block_interface_SOURCES+= \
+	datawizard/interfaces/block/block_opencl.c
+nobase_STARPU_OPENCL_DATA_DATA += \
+	datawizard/interfaces/block/block_opencl_kernel.cl
+endif
+
+##################
+# BCSR interface #
+##################
+datawizard_interfaces_bcsr_bcsr_interface_SOURCES= \
+	datawizard/interfaces/test_interfaces.c \
+	datawizard/interfaces/bcsr/bcsr_interface.c 
+
+if STARPU_USE_CUDA
+datawizard_interfaces_bcsr_bcsr_interface_SOURCES+= \
+	datawizard/interfaces/bcsr/bcsr_cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+datawizard_interfaces_bcsr_bcsr_interface_SOURCES+= \
+	datawizard/interfaces/bcsr/bcsr_opencl.c
+nobase_STARPU_OPENCL_DATA_DATA += \
+	datawizard/interfaces/bcsr/bcsr_opencl_kernel.cl
+endif
+
+#################
+# CSR interface #
+#################
+datawizard_interfaces_csr_csr_interface_SOURCES= \
+	datawizard/interfaces/test_interfaces.c  \
+	datawizard/interfaces/csr/csr_interface.c
+
+if STARPU_USE_CUDA
+datawizard_interfaces_csr_csr_interface_SOURCES+= \
+	datawizard/interfaces/csr/csr_cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+datawizard_interfaces_csr_csr_interface_SOURCES+= \
+	datawizard/interfaces/csr/csr_opencl.c
+nobase_STARPU_OPENCL_DATA_DATA += \
+	datawizard/interfaces/csr/csr_opencl_kernel.cl
+endif
+
+
+datawizard_interfaces_vector_test_vector_interface_SOURCES =               \
+	datawizard/interfaces/vector/test_vector_interface.c               \
+	datawizard/interfaces/test_interfaces.c
+
+if STARPU_USE_CUDA
+datawizard_interfaces_vector_test_vector_interface_SOURCES +=               \
+	datawizard/interfaces/vector/test_vector_cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+datawizard_interfaces_vector_test_vector_interface_SOURCES +=               \
+	datawizard/interfaces/vector/test_vector_opencl.c 
+nobase_STARPU_OPENCL_DATA_DATA += \
+	datawizard/interfaces/vector/test_vector_opencl_kernel.cl
+endif
+
+####################
+# Matrix interface #
+####################
+datawizard_interfaces_matrix_matrix_interface_SOURCES= \
+	datawizard/interfaces/test_interfaces.c        \
+	datawizard/interfaces/matrix/matrix_interface.c
+
+if STARPU_USE_CUDA
+datawizard_interfaces_matrix_matrix_interface_SOURCES+= \
+	datawizard/interfaces/matrix/matrix_cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+datawizard_interfaces_matrix_matrix_interface_SOURCES+= \
+	datawizard/interfaces/matrix/matrix_opencl.c
+nobase_STARPU_OPENCL_DATA_DATA+= \
+	datawizard/interfaces/matrix/matrix_opencl_kernel.cl
+endif
+
+
+#########################
+# Multiformat interface #
+#########################
+datawizard_interfaces_multiformat_multiformat_interface_SOURCES =           \
+	datawizard/interfaces/test_interfaces.c                             \
+	datawizard/interfaces/multiformat/multiformat_interface.c           \
+	datawizard/interfaces/multiformat/multiformat_conversion_codelets.c
+
+if STARPU_USE_CUDA
+datawizard_interfaces_multiformat_multiformat_interface_SOURCES+=                  \
+	datawizard/interfaces/multiformat/multiformat_cuda.cu                      \
+	datawizard/interfaces/multiformat/multiformat_conversion_codelets_cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+datawizard_interfaces_multiformat_multiformat_interface_SOURCES+=                  \
+	datawizard/interfaces/multiformat/multiformat_opencl.c                     \
+	datawizard/interfaces/multiformat/multiformat_conversion_codelets_opencl.c
+nobase_STARPU_OPENCL_DATA_DATA +=                                                          \
+	datawizard/interfaces/multiformat/multiformat_opencl_kernel.cl                     \
+	datawizard/interfaces/multiformat/multiformat_conversion_codelets_kernel.cl
+endif
+
+datawizard_interfaces_multiformat_advanced_multiformat_cuda_opencl_SOURCES=\
+	datawizard/interfaces/multiformat/advanced/generic.c               \
+	datawizard/interfaces/multiformat/advanced/multiformat_cuda_opencl.c
+
+datawizard_interfaces_multiformat_advanced_multiformat_data_release_SOURCES = \
+	datawizard/interfaces/multiformat/advanced/generic.c                  \
+	datawizard/interfaces/multiformat/advanced/multiformat_data_release.c
+
+datawizard_interfaces_multiformat_advanced_multiformat_worker_SOURCES=\
+	datawizard/interfaces/multiformat/advanced/generic.c               \
+	datawizard/interfaces/multiformat/advanced/multiformat_worker.c
+
+datawizard_interfaces_multiformat_advanced_multiformat_handle_conversion_SOURCES = \
+	datawizard/interfaces/multiformat/advanced/generic.c \
+	datawizard/interfaces/multiformat/advanced/multiformat_handle_conversion.c
+
+datawizard_interfaces_multiformat_advanced_same_handle_SOURCES= \
+	datawizard/interfaces/multiformat/advanced/generic.c               \
+	datawizard/interfaces/multiformat/advanced/same_handle.c
+
+
+datawizard_interfaces_variable_variable_interface_SOURCES=   \
+	datawizard/interfaces/test_interfaces.c              \
+	datawizard/interfaces/variable/variable_interface.c
+
+if STARPU_USE_CUDA
+datawizard_interfaces_variable_variable_interface_SOURCES+= \
+	datawizard/interfaces/variable/variable_cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+datawizard_interfaces_variable_variable_interface_SOURCES+= \
+	datawizard/interfaces/variable/variable_opencl.c
+nobase_STARPU_OPENCL_DATA_DATA += \
+	datawizard/interfaces/variable/variable_opencl_kernel.cl
+endif
+
+##################
+# Void interface #
+##################
+datawizard_interfaces_void_void_interface_SOURCES=\
+	datawizard/interfaces/test_interfaces.c        \
+	datawizard/interfaces/void/void_interface.c
+
+showcheck:
+	-cat $(TEST_LOGS) /dev/null

+ 23 - 3
tests/cholesky/prio.r

@@ -1,3 +1,20 @@
+
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
 sizelist <- seq(2048, 24576, 2048);
 schedlist <- c("greedy", "prio", "dm", "random");
 
@@ -15,7 +32,8 @@ parse <- function (size, sched)
 	filename = paste("timings_sched/sched", sched, size, sep=".");
 
 	if (file.exists(filename))
-	{	ret <- scan(paste("timings_sched/sched", sched, size, sep="."));
+	{
+		ret <- scan(paste("timings_sched/sched", sched, size, sep="."));
 		return(ret);
 	};
 
@@ -35,7 +53,8 @@ handle_sched <- function(sched)
 	gflopstab <- NULL;
 	sizetab <- NULL;
 
-	for (size in sizelist) {
+	for (size in sizelist)
+	{
 		list <- handle_size(size, sched);
 		gflopstab <- c(gflopstab, list);
 		sizetab <- c(sizetab, array(size, c(length(list))));
@@ -51,7 +70,8 @@ handle_sched_mean <- function(sched)
 	meantab <- NULL;
 	sizetab <- NULL;
 
-	for (size in sizelist) {
+	for (size in sizelist)
+	{
 		list <- mean(handle_size(size, sched));
 		meantab <- c(meantab, list);
 		sizetab <- c(sizetab, array(size, c(length(list))));

+ 23 - 3
tests/cholesky/sched.r

@@ -1,3 +1,20 @@
+
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
 sizelist <- seq(2048, 24576, 2048);
 schedlist <- c("greedy", "prio", "dm", "random");
 
@@ -15,7 +32,8 @@ parse <- function (size, sched)
 	filename = paste("timings_sched/sched", sched, size, sep=".");
 
 	if (file.exists(filename))
-	{	ret <- scan(paste("timings_sched/sched", sched, size, sep="."));
+	{
+		ret <- scan(paste("timings_sched/sched", sched, size, sep="."));
 		return(ret);
 	};
 
@@ -35,7 +53,8 @@ handle_sched <- function(sched)
 	gflopstab <- NULL;
 	sizetab <- NULL;
 
-	for (size in sizelist) {
+	for (size in sizelist)
+	{
 		list <- handle_size(size, sched);
 		gflopstab <- c(gflopstab, list);
 		sizetab <- c(sizetab, array(size, c(length(list))));
@@ -51,7 +70,8 @@ handle_sched_mean <- function(sched)
 	meantab <- NULL;
 	sizetab <- NULL;
 
-	for (size in sizelist) {
+	for (size in sizelist)
+	{
 		list <- mean(handle_size(size, sched));
 		meantab <- c(meantab, list);
 		sizetab <- c(sizetab, array(size, c(length(list))));

+ 0 - 65
tests/core/multithreaded_init.c

@@ -1,65 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010  Institut National de Recherche en Informatique et Automatique
- * Copyright (C) 2010-2011  Université de Bordeaux 1
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-#include <sys/time.h>
-#include <stdio.h>
-#include <pthread.h>
-#include <starpu.h>
-
-#define NUM_THREADS 5
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
-
-void *launch_starpu(void *id)
-{ 
-   starpu_init(NULL);
-   return NULL;
-}
-
-int main(int argc, char **argv)
-{ 
-  unsigned i;
-  double timing;
-  struct timeval start;
-  struct timeval end;
-
-  pthread_t threads[NUM_THREADS];
-  
-  gettimeofday(&start, NULL);
-
-  for (i = 0; i < NUM_THREADS; ++i)
-    {
-      int ret = pthread_create(&threads[i], NULL, launch_starpu, NULL);
-      STARPU_ASSERT(ret == 0);
-    }
-
-  for (i = 0; i < NUM_THREADS; ++i)
-    {
-      int ret = pthread_join(threads[i], NULL);
-      STARPU_ASSERT(ret == 0);
-    }
-
-  gettimeofday(&end, NULL);
-
-  timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-
-  FPRINTF(stderr, "Success : %d threads launching simultaneously starpu_init\n", NUM_THREADS);
-  FPRINTF(stderr, "Total: %f secs\n", timing/1000000);
-  FPRINTF(stderr, "Per task: %f usecs\n", timing/NUM_THREADS);
-
-  starpu_shutdown();
-
-  return 0;
-}

+ 0 - 121
tests/core/task_wait_api.c

@@ -1,121 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include <pthread.h>
-#include <stdio.h>
-#include <unistd.h>
-
-#include <starpu.h>
-
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
-
-static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attribute__ ((unused)))
-{
-}
-
-static starpu_codelet dummy_codelet =
-{
-	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
-	.cpu_func = dummy_func,
-	.cuda_func = dummy_func,
-	.opencl_func = dummy_func,
-        .model = NULL,
-	.nbuffers = 0
-};
-
-static struct starpu_task *create_dummy_task(void)
-{
-	struct starpu_task *task = starpu_task_create();
-
-	task->cl = &dummy_codelet;
-	task->cl_arg = NULL;
-	task->detach = 0;
-
-	return task;
-}
-
-int main(int argc, char **argv)
-{
-	starpu_init(NULL);
-
-	FPRINTF(stderr, "{ A } -> { B }\n");
-	fflush(stderr);
-
-	struct starpu_task *taskA, *taskB;
-
-	taskA = create_dummy_task();
-	taskB = create_dummy_task();
-
-	/* B depends on A */
-	starpu_task_declare_deps_array(taskB, 1, &taskA);
-
-	starpu_task_submit(taskB);
-	starpu_task_submit(taskA);
-
-	starpu_task_wait(taskB);
-
-	FPRINTF(stderr, "{ C, D, E, F } -> { G }\n");
-
-	struct starpu_task *taskC, *taskD, *taskE, *taskF, *taskG;
-
-	taskC = create_dummy_task();
-	taskD = create_dummy_task();
-	taskE = create_dummy_task();
-	taskF = create_dummy_task();
-	taskG = create_dummy_task();
-
-	struct starpu_task *tasksCDEF[4] = {taskC, taskD, taskE, taskF};
-	starpu_task_declare_deps_array(taskG, 4, tasksCDEF);
-
-	starpu_task_submit(taskC);
-	starpu_task_submit(taskD);
-	starpu_task_submit(taskG);
-	starpu_task_submit(taskE);
-	starpu_task_submit(taskF);
-
-	starpu_task_wait(taskG);
-
-	FPRINTF(stderr, "{ H, I } -> { J, K, L }\n");
-
-	struct starpu_task *taskH, *taskI, *taskJ, *taskK, *taskL;
-
-	taskH = create_dummy_task();
-	taskI = create_dummy_task();
-	taskJ = create_dummy_task();
-	taskK = create_dummy_task();
-	taskL = create_dummy_task();
-
-	struct starpu_task *tasksHI[2] = {taskH, taskI};
-
-	starpu_task_declare_deps_array(taskJ, 2, tasksHI);
-	starpu_task_declare_deps_array(taskK, 2, tasksHI);
-	starpu_task_declare_deps_array(taskL, 2, tasksHI);
-
-	starpu_task_submit(taskH);
-	starpu_task_submit(taskI);
-	starpu_task_submit(taskJ);
-	starpu_task_submit(taskK);
-	starpu_task_submit(taskL);
-
-	starpu_task_wait(taskJ);
-	starpu_task_wait(taskK);
-	starpu_task_wait(taskL);
-
-	starpu_shutdown();
-
-	return 0;
-}

+ 8 - 5
tests/datawizard/acquire_cb.c

@@ -15,11 +15,10 @@
  */
 
 #include <starpu.h>
-
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+#include "../helper.h"
 
 unsigned token = 0;
-starpu_data_handle token_handle;
+starpu_data_handle_t token_handle;
 
 void callback(void *arg __attribute__ ((unused)))
 {
@@ -29,7 +28,11 @@ void callback(void *arg __attribute__ ((unused)))
 
 int main(int argc, char **argv)
 {
-        starpu_init(NULL);
+	int ret;
+
+        ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	starpu_variable_data_register(&token_handle, 0, (uintptr_t)&token, sizeof(unsigned));
         starpu_data_acquire_cb(token_handle, STARPU_RW, callback, NULL);
@@ -41,5 +44,5 @@ int main(int argc, char **argv)
 
 	starpu_shutdown();
 
-	return 0;
+	return EXIT_SUCCESS;
 }

+ 43 - 25
tests/datawizard/acquire_cb_insert.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -14,30 +14,38 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <config.h>
 #include <starpu.h>
+#include "../helper.h"
+
+#warning memory leak
 
 #define N 16
 #define M 4
 #define X 2
 
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
-
 void which_index_cpu(void *descr[], void *_args)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	/* A real case would actually compute something */
 	*x0 = X;
 }
 
-starpu_codelet which_index = {
+struct starpu_codelet which_index =
+{
 	.where = STARPU_CPU,
-	.cpu_func = which_index_cpu,
-        .nbuffers = 1
+	.cpu_funcs = {which_index_cpu, NULL},
+        .nbuffers = 1,
+	.modes = {STARPU_W}
 };
 
 void work_cpu(void *descr[], void *_args)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	int i, n = STARPU_VECTOR_GET_NX(descr[0]);
 	float *x0 = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 
@@ -45,16 +53,19 @@ void work_cpu(void *descr[], void *_args)
 		x0[i] = i + 1;
 }
 
-starpu_codelet work = {
+struct starpu_codelet work =
+{
 	.where = STARPU_CPU,
-	.cpu_func = work_cpu,
-        .nbuffers = 1
+	.cpu_funcs = {work_cpu, NULL},
+        .nbuffers = 1,
+	.modes = {STARPU_W}
 };
 
 static int x;
-static starpu_data_handle x_handle, f_handle;
+static starpu_data_handle_t x_handle, f_handle;
 
-void callback(void *arg) {
+void callback(void *arg)
+{
 	starpu_insert_task(&work, STARPU_W, starpu_data_get_sub_data(f_handle, 1, x), 0);
 	starpu_data_release(x_handle);
 }
@@ -64,18 +75,22 @@ int main(int argc, char **argv)
         int i, ret;
 	float *f;
 
-	starpu_init(NULL);
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	/* Declare x */
 	starpu_variable_data_register(&x_handle, 0, (uintptr_t)&x, sizeof(x));
 
 	/* Allocate and Declare f */
-	starpu_malloc((void**)&f, N * sizeof(*f));
+	ret = starpu_malloc((void**)&f, N * sizeof(*f));
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc");
 	memset(f, 0, N * sizeof(*f));
 	starpu_vector_data_register(&f_handle, 0, (uintptr_t)f, N, sizeof(*f));
 
 	/* Partition f */
-	struct starpu_data_filter filter = {
+	struct starpu_data_filter filter =
+	{
 		.filter_func = starpu_block_filter_func_vector,
 		.nchildren = M,
 	};
@@ -84,6 +99,7 @@ int main(int argc, char **argv)
 	/* Compute which portion we will work on */
         ret = starpu_insert_task(&which_index, STARPU_W, x_handle, 0);
 	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 
 	/* And submit the corresponding task */
 #ifdef __GCC__
@@ -96,30 +112,32 @@ int main(int argc, char **argv)
 	starpu_data_acquire_cb(x_handle, STARPU_W, callback, NULL);
 #endif
 
-	starpu_task_wait_for_all();
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
 	starpu_data_unpartition(f_handle, 0);
 	starpu_data_unregister(f_handle);
 	starpu_data_unregister(x_handle);
 
         FPRINTF(stderr, "VALUES: %d", x);
-
-        for(i=0 ; i<N ; i++) {
+        for(i=0 ; i<N ; i++)
+	{
 		FPRINTF(stderr, " %f", f[i]);
         }
-
-	STARPU_ASSERT(f[X*(N/M)] == 1);
-	STARPU_ASSERT(f[X*(N/M)+1] == 2);
-	STARPU_ASSERT(f[X*(N/M)+2] == 3);
-	STARPU_ASSERT(f[X*(N/M)+3] == 4);
-
 	FPRINTF(stderr, "\n");
 
+	ret = EXIT_SUCCESS;
+	if (f[X*(N/M)] != 1 || f[X*(N/M)+1] != 2 ||
+	    f[X*(N/M)+2] != 3 || f[X*(N/M)+3] != 4)
+		ret = EXIT_FAILURE;
+
+	starpu_free(f);
 	starpu_shutdown();
-	return 0;
+	STARPU_RETURN(ret);
 
 enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
-	return 77;
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
 }

+ 46 - 18
tests/datawizard/acquire_release.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,11 +15,15 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <config.h>
 #include <starpu.h>
+#include "../helper.h"
 
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
-
+#ifdef STARPU_SLOW_MACHINE
+static unsigned ntasks = 10;
+#else
 static unsigned ntasks = 10000;
+#endif
 
 #ifdef STARPU_USE_CUDA
 extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
@@ -27,30 +31,35 @@ extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
 
 void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	(*tokenptr)++;
 }
 
-static starpu_codelet increment_cl = {
+static struct starpu_codelet increment_cl =
+{
+	.modes = { STARPU_RW },
         .where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = increment_cpu,
+	.cpu_funcs = {increment_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = increment_cuda,
+	.cuda_funcs = {increment_cuda, NULL},
 #endif
 	.nbuffers = 1
 };
 
 unsigned token = 0;
-starpu_data_handle token_handle;
+starpu_data_handle_t token_handle;
 
-void increment_token()
+int increment_token()
 {
+	int ret;
 	struct starpu_task *task = starpu_task_create();
         task->synchronous = 1;
 	task->cl = &increment_cl;
-	task->buffers[0].handle = token_handle;
-	task->buffers[0].mode = STARPU_RW;
-	starpu_task_submit(task);
+	task->handles[0] = token_handle;
+	ret = starpu_task_submit(task);
+	return ret;
 }
 
 void callback(void *arg __attribute__ ((unused)))
@@ -61,8 +70,12 @@ void callback(void *arg __attribute__ ((unused)))
 int main(int argc, char **argv)
 {
 	int i;
+	int ret;
+
+        ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-        starpu_init(NULL);
 	starpu_variable_data_register(&token_handle, 0, (uintptr_t)&token, sizeof(unsigned));
 
         FPRINTF(stderr, "Token: %u\n", token);
@@ -70,21 +83,36 @@ int main(int argc, char **argv)
 	for(i=0; i<ntasks; i++)
 	{
 		/* synchronize data in RAM */
-                starpu_data_acquire(token_handle, STARPU_R);
+                ret = starpu_data_acquire(token_handle, STARPU_R);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+
                 token ++;
                 starpu_data_release(token_handle);
 
-                increment_token();
+                ret = increment_token();
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
-                starpu_data_acquire_cb(token_handle, STARPU_RW, callback, NULL);
+                ret = starpu_data_acquire_cb(token_handle, STARPU_RW, callback, NULL);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire_cb");
 	}
 
 	starpu_data_unregister(token_handle);
 
+	starpu_shutdown();
+
         FPRINTF(stderr, "Token: %u\n", token);
-        STARPU_ASSERT(token==ntasks*2);
+	if (token == ntasks * 2)
+		ret = EXIT_SUCCESS;
+	else
+		ret = EXIT_FAILURE;
+	STARPU_RETURN(ret);
 
+enodev:
+	starpu_data_unregister(token_handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
 	starpu_shutdown();
-
-	return 0;
+	return STARPU_TEST_SKIPPED;
 }

+ 42 - 16
tests/datawizard/acquire_release2.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -14,9 +14,11 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <config.h>
 #include <starpu.h>
+#include "../helper.h"
 
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+#warning memory leak
 
 static unsigned ntasks = 40000;
 
@@ -26,30 +28,33 @@ extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
 
 void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	(*tokenptr)++;
 }
 
-static starpu_codelet increment_cl = {
+static struct starpu_codelet increment_cl =
+{
+	.modes = { STARPU_RW },
         .where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = increment_cpu,
+	.cpu_funcs = {increment_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = increment_cuda,
+	.cuda_funcs = {increment_cuda, NULL},
 #endif
 	.nbuffers = 1
 };
 
 unsigned token = 0;
-starpu_data_handle token_handle;
+starpu_data_handle_t token_handle;
 
-void increment_token(int synchronous)
+int increment_token(int synchronous)
 {
 	struct starpu_task *task = starpu_task_create();
         task->synchronous = synchronous;
 	task->cl = &increment_cl;
-	task->buffers[0].handle = token_handle;
-	task->buffers[0].mode = STARPU_RW;
-	starpu_task_submit(task);
+	task->handles[0] = token_handle;
+	return starpu_task_submit(task);
 }
 
 void callback(void *arg __attribute__ ((unused)))
@@ -62,8 +67,12 @@ void callback(void *arg __attribute__ ((unused)))
 int main(int argc, char **argv)
 {
 	int i;
+	int ret;
+
+        ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-        starpu_init(NULL);
 	starpu_variable_data_register(&token_handle, 0, (uintptr_t)&token, sizeof(unsigned));
 
         FPRINTF(stderr, "Token: %u\n", token);
@@ -74,16 +83,33 @@ int main(int argc, char **argv)
 
 	for(i=0; i<ntasks; i++)
 	{
-                starpu_data_acquire_cb(token_handle, STARPU_W, callback, NULL);  // recv
-                increment_token(0);
+                ret = starpu_data_acquire_cb(token_handle, STARPU_W, callback, NULL);  // recv
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire_cb");
+
+                ret = increment_token(0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
                 starpu_data_acquire_cb(token_handle, STARPU_R, callback, NULL);  // send
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire_cb");
 	}
 
 	starpu_data_unregister(token_handle);
-        FPRINTF(stderr, "Token: %u\n", token);
-        assert(token==ntasks);
 
 	starpu_shutdown();
 
-	return 0;
+        FPRINTF(stderr, "Token: %u\n", token);
+	if (token == ntasks)
+		ret = EXIT_SUCCESS;
+	else
+		ret = EXIT_FAILURE;
+	STARPU_RETURN(ret);
+
+enodev:
+	starpu_data_unregister(token_handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
 }

+ 34 - 30
tests/datawizard/copy.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,8 +16,7 @@
  */
 
 #include <starpu.h>
-
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+#include "../helper.h"
 
 static unsigned nloops = 1000;
 
@@ -25,37 +24,41 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 {
 }
 
-static starpu_codelet cpu_codelet =
+static struct starpu_codelet cpu_codelet =
 {
         .where = STARPU_CPU,
-        .cpu_func = dummy_func,
+        .cpu_funcs = {dummy_func, NULL},
         .model = NULL,
-        .nbuffers = 1
+        .nbuffers = 1,
+	.modes = {STARPU_RW}
 };
 
-static starpu_codelet gpu_codelet =
+static struct starpu_codelet gpu_codelet =
 {
         .where = STARPU_CUDA|STARPU_OPENCL,
-        .cuda_func = dummy_func,
-        .opencl_func = dummy_func,
+        .cuda_funcs = {dummy_func, NULL},
+        .opencl_funcs = {dummy_func, NULL},
         .model = NULL,
-        .nbuffers = 1
+        .nbuffers = 1,
+	.modes = {STARPU_RW}
 };
 
 
 int main(int argc, char **argv)
 {
         float foo;
-	starpu_data_handle float_array_handle;
-        int i;
+	starpu_data_handle_t float_array_handle;
+        int i, ret;
 
-        starpu_init(NULL);
+        ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	if (starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) == 0 && starpu_worker_get_count_by_type(STARPU_OPENCL_WORKER) == 0)
 	{
 		FPRINTF(stderr, "This application requires a CUDA or OpenCL Worker\n");
 		starpu_shutdown();
-		return 77;
+		return STARPU_TEST_SKIPPED;
 	}
 
         foo = 0.0f;
@@ -71,32 +74,33 @@ int main(int argc, char **argv)
 
 		task_cpu->cl = &cpu_codelet;
 		task_cpu->callback_func = NULL;
-		task_cpu->buffers[0].handle = float_array_handle;
-		task_cpu->buffers[0].mode = STARPU_RW;
+		task_cpu->handles[0] = float_array_handle;
 
 		task_gpu->cl = &gpu_codelet;
 		task_gpu->callback_func = NULL;
-		task_gpu->buffers[0].handle = float_array_handle;
-		task_gpu->buffers[0].mode = STARPU_RW;
+		task_gpu->handles[0] = float_array_handle;
 
 		ret = starpu_task_submit(task_cpu);
-		if (STARPU_UNLIKELY(ret == -ENODEV))
-		{
-			FPRINTF(stderr, "No worker may execute this task\n");
-			exit(0);
-		}
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 		ret = starpu_task_submit(task_gpu);
-		if (STARPU_UNLIKELY(ret == -ENODEV))
-		{
-			FPRINTF(stderr, "No worker may execute this task\n");
-			exit(0);
-		}
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
         }
 
-	starpu_task_wait_for_all();
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
 	starpu_data_unregister(float_array_handle);
         starpu_shutdown();
 
-        return 0;
+        return EXIT_SUCCESS;
+
+enodev:
+	starpu_data_unregister(float_array_handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
 }

+ 24 - 15
tests/datawizard/critical_section_with_void_interface.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -14,29 +15,35 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <config.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <errno.h>
 #include <starpu.h>
 #include <stdlib.h>
+#include "../helper.h"
 
-starpu_data_handle void_handle;
+starpu_data_handle_t void_handle;
 
 int critical_var;
 
 static void critical_section(void *descr[], __attribute__ ((unused)) void *_args)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	/* We do not protect this variable because it is only accessed when the
 	 * "void_handle" piece of data is accessed. */
 	critical_var++;
 }
 
-static starpu_codelet cl = {
+static struct starpu_codelet cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
-	.cpu_func = critical_section,
-	.cuda_func = critical_section,
-	.opencl_func = critical_section,
-	.nbuffers = 1
+	.cpu_funcs = {critical_section, NULL},
+	.cuda_funcs = {critical_section, NULL},
+	.opencl_funcs = {critical_section, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
 };
 
 int main(int argc, char **argv)
@@ -48,7 +55,9 @@ int main(int argc, char **argv)
 	ntasks /= 10;
 #endif
 
-	starpu_init(NULL);
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	critical_var = 0;
 
@@ -59,13 +68,12 @@ int main(int argc, char **argv)
 	for (i = 0; i < ntasks; i++)
 	{
 		struct starpu_task *task = starpu_task_create();
-			task->cl = &cl;
-			task->buffers[0].handle = void_handle;
-			task->buffers[0].mode = STARPU_RW;
-	
+		task->cl = &cl;
+		task->handles[0] = void_handle;
+
 		ret = starpu_task_submit(task);
-		if (ret == -ENODEV)
-			goto enodev;
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 
 	starpu_data_unregister(void_handle);
@@ -74,11 +82,12 @@ int main(int argc, char **argv)
 
 	starpu_shutdown();
 
-	return 0;
+	return EXIT_SUCCESS;
 
 enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
-	return 77;
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
 }

+ 66 - 32
tests/datawizard/data_implicit_deps.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,61 +15,78 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <config.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <errno.h>
 #include <starpu.h>
 #include <stdlib.h>
+#include "../helper.h"
 
 #define VECTORSIZE	1024
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 
 static unsigned *A, *B, *C, *D;
-starpu_data_handle A_handle, B_handle, C_handle, D_handle;
+starpu_data_handle_t A_handle, B_handle, C_handle, D_handle;
 
 static unsigned var = 0;
 
 static void f(void *descr[], __attribute__ ((unused)) void *_args)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	usleep(200000);
 }
 
-static starpu_codelet cl_f = {
+static struct starpu_codelet cl_f =
+{
+	.modes = { STARPU_R, STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = f,
-	.cuda_func = f,
+	.cpu_funcs = {f, NULL},
+	.cuda_funcs = {f, NULL},
 	.nbuffers = 2
 };
 
 static void g(void *descr[], __attribute__ ((unused)) void *_args)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	usleep(100000);
 	var = 42;
 }
 
-static starpu_codelet cl_g = {
+static struct starpu_codelet cl_g =
+{
+	.modes = { STARPU_R, STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = g,
-	.cuda_func = g,
+	.cpu_funcs = {g, NULL},
+	.cuda_funcs = {g, NULL},
 	.nbuffers = 2
 };
 
 static void h(void *descr[], __attribute__ ((unused)) void *_args)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	FPRINTF(stderr, "VAR %u (should be 42)\n", var);
 	STARPU_ASSERT(var == 42);
 }
 
-static starpu_codelet cl_h = {
+static struct starpu_codelet cl_h =
+{
+	.modes = { STARPU_R, STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = h,
-	.cuda_func = h,
+	.cpu_funcs = {h, NULL},
+	.cuda_funcs = {h, NULL},
 	.nbuffers = 2
 };
 
 int main(int argc, char **argv)
 {
-	starpu_init(NULL);
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	A = (unsigned *) malloc(VECTORSIZE*sizeof(unsigned));
 	B = (unsigned *) malloc(VECTORSIZE*sizeof(unsigned));
@@ -81,12 +98,12 @@ int main(int argc, char **argv)
 	starpu_vector_data_register(&C_handle, 0, (uintptr_t)C, VECTORSIZE, sizeof(unsigned));
 	starpu_vector_data_register(&D_handle, 0, (uintptr_t)D, VECTORSIZE, sizeof(unsigned));
 
-	#if 0
+#if 0
 	starpu_data_set_sequential_consistency_flag(A_handle, 0);
 	starpu_data_set_sequential_consistency_flag(B_handle, 0);
 	starpu_data_set_sequential_consistency_flag(C_handle, 0);
 	starpu_data_set_sequential_consistency_flag(D_handle, 0);
-	#endif
+#endif
 
 	/* 	f(Ar, Brw): sleep 
 	 *	g(Br; Crw); sleep, var = 42
@@ -94,29 +111,35 @@ int main(int argc, char **argv)
 	 */
 	struct starpu_task *task_f = starpu_task_create();
 	task_f->cl = &cl_f;
-	task_f->buffers[0].handle = A_handle;
-	task_f->buffers[0].mode = STARPU_R;
-	task_f->buffers[1].handle = B_handle;
-	task_f->buffers[1].mode = STARPU_RW;
-	starpu_task_submit(task_f);
+	task_f->handles[0] = A_handle;
+	task_f->handles[1] = B_handle;
+	ret = starpu_task_submit(task_f);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 	struct starpu_task *task_g = starpu_task_create();
 	task_g->cl = &cl_g;
-	task_g->buffers[0].handle = B_handle;
-	task_g->buffers[0].mode = STARPU_R;
-	task_g->buffers[1].handle = C_handle;
-	task_g->buffers[1].mode = STARPU_RW;
-	starpu_task_submit(task_g);
+	task_g->handles[0] = B_handle;
+	task_g->handles[1] = C_handle;
+	ret = starpu_task_submit(task_g);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 	struct starpu_task *task_h = starpu_task_create();
 	task_h->cl = &cl_h;
-	task_h->buffers[0].handle = C_handle;
-	task_h->buffers[0].mode = STARPU_R;
-	task_h->buffers[1].handle = D_handle;
-	task_h->buffers[1].mode = STARPU_RW;
-	starpu_task_submit(task_h);
+	task_h->handles[0] = C_handle;
+	task_h->handles[1] = D_handle;
+	ret = starpu_task_submit(task_h);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
-	starpu_task_wait_for_all();
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+
+	starpu_data_unregister(A_handle);
+	starpu_data_unregister(B_handle);
+	starpu_data_unregister(C_handle);
+	starpu_data_unregister(D_handle);
 
 	free(A);
 	free(B);
@@ -125,5 +148,16 @@ int main(int argc, char **argv)
 
 	starpu_shutdown();
 
-	return 0;
+	return EXIT_SUCCESS;
+
+enodev:
+	free(A);
+	free(B);
+	free(C);
+	free(D);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
 }

+ 46 - 30
tests/datawizard/data_invalidation.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -14,18 +15,23 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <config.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <errno.h>
 #include <starpu.h>
 #include <starpu_cuda.h>
 #include <stdlib.h>
+#include "../helper.h"
 
+#ifdef STARPU_SLOW_MACHINE
+#define NLOOPS		100
+#else
 #define NLOOPS		1000
+#endif
 #define VECTORSIZE	1024
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 
-static starpu_data_handle v_handle;
+static starpu_data_handle_t v_handle;
 
 /*
  *	Memset
@@ -34,6 +40,8 @@ static starpu_data_handle v_handle;
 #ifdef STARPU_USE_CUDA
 static void cuda_memset_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned length = STARPU_VECTOR_GET_NX(descr[0]);
 
@@ -44,19 +52,23 @@ static void cuda_memset_codelet(void *descr[], __attribute__ ((unused)) void *_a
 
 static void cpu_memset_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned length = STARPU_VECTOR_GET_NX(descr[0]);
 
 	memset(buf, 42, length);
 }
 
-static starpu_codelet memset_cl = {
+static struct starpu_codelet memset_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = cpu_memset_codelet,
+	.cpu_funcs = {cpu_memset_codelet, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = cuda_memset_codelet,
+	.cuda_funcs = {cuda_memset_codelet, NULL},
 #endif
-	.nbuffers = 1
+	.nbuffers = 1,
+	.modes = {STARPU_W}
 };
 
 /*
@@ -65,6 +77,8 @@ static starpu_codelet memset_cl = {
 
 static void cpu_check_content_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned length = STARPU_VECTOR_GET_NX(descr[0]);
 
@@ -79,10 +93,12 @@ static void cpu_check_content_codelet(void *descr[], __attribute__ ((unused)) vo
 	}
 }
 
-static starpu_codelet check_content_cl = {
+static struct starpu_codelet check_content_cl =
+{
 	.where = STARPU_CPU,
-	.cpu_func = cpu_check_content_codelet,
-	.nbuffers = 1
+	.cpu_funcs = {cpu_check_content_codelet, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_R}
 };
 
 
@@ -90,7 +106,9 @@ int main(int argc, char **argv)
 {
 	int ret;
 
-	starpu_init(NULL);
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	/* The buffer should never be explicitly allocated */
 	starpu_vector_data_register(&v_handle, (uint32_t)-1, (uintptr_t)NULL, VECTORSIZE, sizeof(char));
@@ -103,31 +121,27 @@ int main(int argc, char **argv)
 
 		memset_task = starpu_task_create();
 		memset_task->cl = &memset_cl;
-		memset_task->buffers[0].handle = v_handle;
-		memset_task->buffers[0].mode = STARPU_W;
+		memset_task->handles[0] = v_handle;
 		memset_task->detach = 0;
-	
+
 		ret = starpu_task_submit(memset_task);
-		if (ret == -ENODEV)
-				goto enodev;
-	
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
 		ret = starpu_task_wait(memset_task);
-		if (ret)
-			exit(-1);
-		
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait");
+
 		check_content_task = starpu_task_create();
 		check_content_task->cl = &check_content_cl;
-		check_content_task->buffers[0].handle = v_handle;
-		check_content_task->buffers[0].mode = STARPU_R;
+		check_content_task->handles[0] = v_handle;
 		check_content_task->detach = 0;
-	
+
 		ret = starpu_task_submit(check_content_task);
-		if (ret == -ENODEV)
-				goto enodev;
-	
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
 		ret = starpu_task_wait(check_content_task);
-		if (ret)
-			exit(-1);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait");
 
 		starpu_data_invalidate(v_handle);
 	}
@@ -137,11 +151,13 @@ int main(int argc, char **argv)
 
 	starpu_shutdown();
 
-	return 0;
+	return EXIT_SUCCESS;
 
 enodev:
+	starpu_data_unregister(v_handle);
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
-	return 77;
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
 }

+ 48 - 39
tests/datawizard/data_lookup.c

@@ -20,6 +20,7 @@
 #include <starpu.h>
 #include <stdlib.h>
 #include <sys/types.h>
+#include "../helper.h"
 
 static void task(void **buffers, void *args)
 {
@@ -27,17 +28,19 @@ static void task(void **buffers, void *args)
 	size_t size, i;
 
 	numbers = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
-	starpu_unpack_cl_args (args, &size);
+	starpu_codelet_unpack_args (args, &size);
 	for(i = 0; i < size; i++)
 	{
 		numbers[i] = i;
 	}
 }
 
-static starpu_codelet cl = {
+static struct starpu_codelet cl =
+{
 	.where = STARPU_CPU,
-	.cpu_func = task,
-	.nbuffers = 1
+	.cpu_funcs = {task, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_W}
 };
 
 static int test_lazy_allocation()
@@ -46,7 +49,7 @@ static int test_lazy_allocation()
 
 	size_t i;
 	void *pointer;
-	starpu_data_handle handle;
+	starpu_data_handle_t handle;
 	int ret;
 
 	/* Lazily-allocated vector.  */
@@ -58,28 +61,31 @@ static int test_lazy_allocation()
 				 STARPU_VALUE, &count, sizeof(size_t),
 				 0);
 	if (ret == -ENODEV) return ret;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+
 	/* yes, we do not perform the computation but we did detect that no one
 	 * could perform the kernel, so this is not an error from StarPU */
 
 	/* Acquire the handle, forcing a local allocation.  */
-	starpu_data_acquire(handle, STARPU_R);
+	ret = starpu_data_acquire(handle, STARPU_R);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
 
 	/* Make sure we have a local pointer to it.  */
 	pointer = starpu_handle_get_local_ptr(handle);
-	assert(pointer != NULL);
+	STARPU_ASSERT(pointer != NULL);
 	for(i = 0; i < count; i++)
 	{
 		float *numbers = (float *)pointer;
-		assert(numbers[i] == i);
+		STARPU_ASSERT(numbers[i] == i);
 	}
 
 	/* Make sure the pointer/handle mapping is up-to-date.  */
-	assert(starpu_data_lookup(pointer) == handle);
+	STARPU_ASSERT(starpu_data_lookup(pointer) == handle);
 
 	starpu_data_release(handle);
 	starpu_data_unregister(handle);
 
-	assert(starpu_data_lookup(pointer) == NULL);
+	STARPU_ASSERT(starpu_data_lookup(pointer) == NULL);
 	return 0;
 }
 
@@ -91,12 +97,12 @@ static int test_lazy_allocation()
 static void test_filters()
 {
 #define CHILDREN_COUNT 10
-	int err, i;
+	int ret, i;
 	int *ptr, *children_pointers[CHILDREN_COUNT];
-	starpu_data_handle handle;
+	starpu_data_handle_t handle;
 
-	err = starpu_malloc((void**)&ptr, VECTOR_SIZE * sizeof(*ptr));
-	assert(err == 0);
+	ret = starpu_malloc((void**)&ptr, VECTOR_SIZE * sizeof(*ptr));
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc");
 
 	starpu_vector_data_register(&handle, 0, (uintptr_t)ptr,
 				    VECTOR_SIZE, sizeof(*ptr));
@@ -107,18 +113,18 @@ static void test_filters()
 		.nchildren = CHILDREN_COUNT
 	};
 	starpu_data_partition(handle, &f);
-	assert(starpu_data_get_nb_children(handle) == CHILDREN_COUNT);
+	STARPU_ASSERT(starpu_data_get_nb_children(handle) == CHILDREN_COUNT);
 
 	for (i = 0; i < CHILDREN_COUNT; i++)
 	{
-                starpu_data_handle child;
+                starpu_data_handle_t child;
 
 		child = starpu_data_get_sub_data(handle, 1, i);
 		children_pointers[i] = (int *) starpu_handle_get_local_ptr(child);
-		assert(children_pointers[i] != NULL);
+		STARPU_ASSERT(children_pointers[i] != NULL);
 
 		/* Make sure we have a pointer -> handle mapping for CHILD.  */
-		assert(starpu_data_lookup(children_pointers[i]) == child);
+		STARPU_ASSERT(starpu_data_lookup(children_pointers[i]) == child);
 	}
 
 	starpu_data_unpartition(handle, 0);
@@ -127,11 +133,11 @@ static void test_filters()
 	{
 		if (children_pointers[i] != ptr)
 			/* Make sure the pointer -> handle mapping is gone.  */
-			assert(starpu_data_lookup(children_pointers[i]) == NULL);
+			STARPU_ASSERT(starpu_data_lookup(children_pointers[i]) == NULL);
 	}
 
 	/* Make sure the parent's mapping is back.  */
-	assert(starpu_data_lookup(ptr) == handle);
+	STARPU_ASSERT(starpu_data_lookup(ptr) == handle);
 
 	starpu_data_unregister(handle);
 	starpu_free(ptr);
@@ -141,20 +147,22 @@ static void test_filters()
 
 int main(int argc, char *argv[])
 {
-	int err;
+	int ret;
 	size_t i;
 	void *vectors[VECTOR_COUNT], *variables[VARIABLE_COUNT];
-	starpu_data_handle vector_handles[VECTOR_COUNT];
-	starpu_data_handle variable_handles[VARIABLE_COUNT];
+	starpu_data_handle_t vector_handles[VECTOR_COUNT];
+	starpu_data_handle_t variable_handles[VARIABLE_COUNT];
 
-	starpu_init(NULL);
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	/* Register data regions.  */
 
 	for(i = 0; i < VARIABLE_COUNT; i++)
 	{
-		err = starpu_malloc(&variables[i], sizeof(float));
-		assert(err == 0);
+		ret = starpu_malloc(&variables[i], sizeof(float));
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc");
 		starpu_variable_data_register(&variable_handles[i], 0,
 					      (uintptr_t)variables[i],
 					      sizeof(float));
@@ -162,8 +170,8 @@ int main(int argc, char *argv[])
 
 	for(i = 0; i < VECTOR_COUNT; i++)
 	{
-		err = starpu_malloc(&vectors[i], VECTOR_SIZE * sizeof(float));
-		assert(err == 0);
+		ret = starpu_malloc(&vectors[i], VECTOR_SIZE * sizeof(float));
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc");
 		starpu_vector_data_register(&vector_handles[i], 0,
 					    (uintptr_t)vectors[i],
 					    VECTOR_SIZE, sizeof(float));
@@ -173,18 +181,18 @@ int main(int argc, char *argv[])
 
 	for(i = 0; i < VARIABLE_COUNT; i++)
 	{
-		starpu_data_handle handle;
+		starpu_data_handle_t handle;
 
 		handle = starpu_data_lookup(variables[i]);
-		assert(handle == variable_handles[i]);
+		STARPU_ASSERT(handle == variable_handles[i]);
 	}
 
 	for(i = 0; i < VECTOR_COUNT; i++)
 	{
-		starpu_data_handle handle;
+		starpu_data_handle_t handle;
 
 		handle = starpu_data_lookup(vectors[i]);
-		assert(handle == vector_handles[i]);
+		STARPU_ASSERT(handle == vector_handles[i]);
 	}
 
 	/* Unregister them.  */
@@ -203,24 +211,24 @@ int main(int argc, char *argv[])
 
 	for(i = 0; i < VARIABLE_COUNT; i++)
 	{
-		starpu_data_handle handle;
+		starpu_data_handle_t handle;
 
 		handle = starpu_data_lookup(variables[i]);
-		assert(handle == NULL);
+		STARPU_ASSERT(handle == NULL);
 		starpu_free(variables[i]);
 	}
 
 	for(i = 0; i < VECTOR_COUNT; i++)
 	{
-		starpu_data_handle handle;
+		starpu_data_handle_t handle;
 
 		handle = starpu_data_lookup(vectors[i]);
-		assert(handle == NULL);
+		STARPU_ASSERT(handle == NULL);
 		starpu_free(vectors[i]);
 	}
 
-	err = test_lazy_allocation();
-	if (err == -ENODEV) goto enodev;
+	ret = test_lazy_allocation();
+	if (ret == -ENODEV) goto enodev;
 	test_filters();
 
 	starpu_shutdown();
@@ -231,5 +239,6 @@ enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
-	return 77;
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
 }

+ 35 - 18
tests/datawizard/dining_philosophers.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,28 +16,29 @@
  */
 
 #include <starpu.h>
+#include "../helper.h"
 
 /* number of philosophers */
 #define N	16
 
-starpu_data_handle fork_handles[N];
+starpu_data_handle_t fork_handles[N];
 unsigned forks[N];
 
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
-
 static void eat_kernel(void *descr[], void *arg)
 {
 }
 
-static starpu_codelet eating_cl = {
+static struct starpu_codelet eating_cl =
+{
+	.modes = { STARPU_RW, STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
-	.cuda_func = eat_kernel,
-	.cpu_func = eat_kernel,
-        .opencl_func = eat_kernel,
+	.cuda_funcs = {eat_kernel, NULL},
+	.cpu_funcs = {eat_kernel, NULL},
+        .opencl_funcs = {eat_kernel, NULL},
 	.nbuffers = 2
 };
 
-void submit_one_task(unsigned p)
+int submit_one_task(unsigned p)
 {
 	struct starpu_task *task = starpu_task_create();
 
@@ -46,18 +47,20 @@ void submit_one_task(unsigned p)
 	unsigned left = p;
 	unsigned right = (p+1)%N;
 
-	task->buffers[0].handle = fork_handles[left];
-	task->buffers[0].mode = STARPU_RW;
-	task->buffers[1].handle = fork_handles[right];
-	task->buffers[1].mode = STARPU_RW;
+	task->handles[0] = fork_handles[left];
+	task->handles[1] = fork_handles[right];
 
 	int ret = starpu_task_submit(task);
-	STARPU_ASSERT(!ret);
+	return ret;
 }
 
 int main(int argc, char **argv)
 {
-	starpu_init(NULL);
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	/* initialize the forks */
 	unsigned f;
@@ -75,10 +78,13 @@ int main(int argc, char **argv)
 	{
 		/* select one philosopher randomly */
 		unsigned philosopher = rand() % N;
-		submit_one_task(philosopher);
+		ret = submit_one_task(philosopher);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 
-	starpu_task_wait_for_all();
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
 
 	FPRINTF(stderr, "waiting done\n");
 	for (f = 0; f < N; f++)
@@ -88,5 +94,16 @@ int main(int argc, char **argv)
 
 	starpu_shutdown();
 
-	return 0;
+	return EXIT_SUCCESS;
+
+enodev:
+	for (f = 0; f < N; f++)
+	{
+		starpu_data_unregister(fork_handles[f]);
+	}
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
 }

+ 174 - 0
tests/datawizard/double_parameter.c

@@ -0,0 +1,174 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "../helper.h"
+
+static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attribute__ ((unused)))
+{
+}
+
+static struct starpu_codelet codelet_R_R =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_R, STARPU_R}
+};
+
+static struct starpu_codelet codelet_R_W =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+static struct starpu_codelet codelet_R_RW =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW}
+};
+
+static struct starpu_codelet codelet_W_R =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_W, STARPU_R}
+};
+
+static struct starpu_codelet codelet_W_W =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_W, STARPU_W}
+};
+
+static struct starpu_codelet codelet_W_RW =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_W, STARPU_RW}
+};
+
+static struct starpu_codelet codelet_RW_R =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
+};
+
+static struct starpu_codelet codelet_RW_W =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_W}
+};
+
+static struct starpu_codelet codelet_RW_RW =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW}
+};
+
+int main(int argc, char **argv)
+{
+	float foo = 0.0f;
+	starpu_data_handle_t handle;
+	int ret;
+	struct starpu_task *task;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_variable_data_register(&handle, 0, (uintptr_t)&foo, sizeof(foo));
+
+#define SUBMIT(mode0, mode1) \
+	{ \
+		task = starpu_task_create();	\
+		task->handles[0] = handle;	\
+		task->handles[1] = handle;		 \
+		enum starpu_access_mode smode0 = STARPU_##mode0;	\
+		enum starpu_access_mode smode1 = STARPU_##mode1;	\
+		if      (smode0 == STARPU_R && smode1 == STARPU_R)	\
+			task->cl = &codelet_R_R;			\
+		else if (smode0 == STARPU_R && smode1 == STARPU_W)	\
+			task->cl = &codelet_R_W;			\
+		else if (smode0 == STARPU_R && smode1 == STARPU_RW)	\
+			task->cl = &codelet_R_RW;			\
+		else if (smode0 == STARPU_W && smode1 == STARPU_R)	\
+			task->cl = &codelet_W_R;			\
+		else if (smode0 == STARPU_W && smode1 == STARPU_W)	\
+			task->cl = &codelet_W_W;			\
+		else if (smode0 == STARPU_W && smode1 == STARPU_RW)	\
+			task->cl = &codelet_W_RW;			\
+		else if (smode0 == STARPU_RW && smode1 == STARPU_R)	\
+			task->cl = &codelet_RW_R;			\
+		else if (smode0 == STARPU_RW && smode1 == STARPU_W)	\
+			task->cl = &codelet_RW_W;			\
+		else if (smode0 == STARPU_RW && smode1 == STARPU_RW)	\
+			task->cl = &codelet_RW_RW;			\
+									\
+		ret = starpu_task_submit(task);				\
+		if (ret == -ENODEV) goto enodev;			\
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");   \
+	}
+
+	SUBMIT(R,R);
+	SUBMIT(R,W);
+	SUBMIT(R,RW);
+	SUBMIT(W,R);
+	SUBMIT(W,W);
+	SUBMIT(W,RW);
+	SUBMIT(RW,R);
+	SUBMIT(RW,W);
+	SUBMIT(RW,RW);
+
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+	starpu_data_unregister(handle);
+	starpu_shutdown();
+
+        return EXIT_SUCCESS;
+
+enodev:
+	starpu_data_unregister(handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+}

+ 142 - 28
tests/datawizard/dsm_stress.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,6 +21,7 @@
 #include <starpu.h>
 #include <stdlib.h>
 #include <pthread.h>
+#include "../helper.h"
 
 #define N	10000
 
@@ -33,7 +34,7 @@ static unsigned finished = 0;
 
 static unsigned cnt = N;
 
-starpu_data_handle v_handle, v_handle2;
+starpu_data_handle_t v_handle, v_handle2;
 static unsigned *v;
 static unsigned *v2;
 
@@ -43,10 +44,10 @@ static void callback(void *arg)
 
 	if (res == 0)
 	{
-		pthread_mutex_lock(&mutex);
+		_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 		finished = 1;
-		pthread_cond_signal(&cond);
-		pthread_mutex_unlock(&mutex);
+		_STARPU_PTHREAD_COND_SIGNAL(&cond);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	}
 }
 
@@ -64,11 +65,12 @@ static void cpu_codelet_null(void *descr[], __attribute__ ((unused)) void *_args
 {
 }
 
-static starpu_access_mode select_random_mode(void)
+static enum starpu_access_mode select_random_mode(void)
 {
 	int r = rand();
 
-	switch (r % 3) {
+	switch (r % 3)
+	{
 		case 0:
 			return STARPU_R;
 		case 1:
@@ -79,22 +81,109 @@ static starpu_access_mode select_random_mode(void)
 	return STARPU_RW;
 }
 
+static struct starpu_codelet cl_r_r =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {cpu_codelet_null, NULL},
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_R}
+};
+
+static struct starpu_codelet cl_r_w =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {cpu_codelet_null, NULL},
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+static struct starpu_codelet cl_r_rw =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {cpu_codelet_null, NULL},
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW}
+};
+
+static struct starpu_codelet cl_w_r =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {cpu_codelet_null, NULL},
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_W, STARPU_R}
+};
+
+static struct starpu_codelet cl_w_w =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {cpu_codelet_null, NULL},
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_W, STARPU_W}
+};
+
+static struct starpu_codelet cl_w_rw =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {cpu_codelet_null, NULL},
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_W, STARPU_RW}
+};
 
-static starpu_codelet cl = {
+static struct starpu_codelet cl_rw_r =
+{
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
-	.cpu_func = cpu_codelet_null,
-	.cuda_func = cuda_codelet_null,
-        .opencl_func = opencl_codelet_null,
-	.nbuffers = 2
+	.cpu_funcs = {cpu_codelet_null, NULL},
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
+};
+
+static struct starpu_codelet cl_rw_w =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {cpu_codelet_null, NULL},
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_W}
+};
+
+static struct starpu_codelet cl_rw_rw =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {cpu_codelet_null, NULL},
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW}
 };
 
 
 int main(int argc, char **argv)
 {
-	starpu_init(NULL);
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	starpu_malloc((void **)&v, VECTORSIZE*sizeof(unsigned));
-	starpu_malloc((void **)&v2, VECTORSIZE*sizeof(unsigned));
+	ret = starpu_malloc((void **)&v, VECTORSIZE*sizeof(unsigned));
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc");
+	ret = starpu_malloc((void **)&v2, VECTORSIZE*sizeof(unsigned));
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc");
 
 	starpu_vector_data_register(&v_handle, 0, (uintptr_t)v, VECTORSIZE, sizeof(unsigned));
 	starpu_vector_data_register(&v_handle2, 0, (uintptr_t)v2, VECTORSIZE, sizeof(unsigned));
@@ -103,36 +192,61 @@ int main(int argc, char **argv)
 	for (iter = 0; iter < N; iter++)
 	{
 		struct starpu_task *task = starpu_task_create();
-		task->cl = &cl;
 
-		task->buffers[0].handle = v_handle;
-		task->buffers[0].mode = select_random_mode();
-
-		task->buffers[1].handle = v_handle2;
-		task->buffers[1].mode = select_random_mode();
+		task->handles[0] = v_handle;
+		task->handles[1] = v_handle2;
+
+		enum starpu_access_mode mode0 = select_random_mode();
+		enum starpu_access_mode mode1 = select_random_mode();
+
+		if (mode0 == STARPU_R && mode1 == STARPU_R)
+			task->cl = &cl_r_r;
+		else if (mode0 == STARPU_R && mode1 == STARPU_W)
+			task->cl = &cl_r_w;
+		else if (mode0 == STARPU_R && mode1 == STARPU_RW)
+			task->cl = &cl_r_rw;
+		else if (mode0 == STARPU_W && mode1 == STARPU_R)
+			task->cl = &cl_w_r;
+		else if (mode0 == STARPU_W && mode1 == STARPU_W)
+			task->cl = &cl_w_w;
+		else if (mode0 == STARPU_W && mode1 == STARPU_RW)
+			task->cl = &cl_w_rw;
+		else if (mode0 == STARPU_RW && mode1 == STARPU_R)
+			task->cl = &cl_rw_r;
+		else if (mode0 == STARPU_RW && mode1 == STARPU_W)
+			task->cl = &cl_rw_w;
+		else if (mode0 == STARPU_RW && mode1 == STARPU_RW)
+			task->cl = &cl_rw_rw;
 
 		task->callback_func = callback;
 		task->callback_arg = NULL;
 
 		int ret = starpu_task_submit(task);
-		if (ret == -ENODEV)
-			goto enodev;
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 
-	pthread_mutex_lock(&mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	if (!finished)
-		pthread_cond_wait(&cond, &mutex);
-	pthread_mutex_unlock(&mutex);
+		_STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
+	starpu_data_unregister(v_handle);
+	starpu_data_unregister(v_handle2);
 	starpu_free(v);
 	starpu_free(v2);
 	starpu_shutdown();
 
-	return 0;
+	return EXIT_SUCCESS;
 
 enodev:
+	starpu_data_unregister(v_handle);
+	starpu_data_unregister(v_handle2);
+	starpu_free(v);
+	starpu_free(v2);
+	starpu_shutdown();
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
-	return 77;
+	return STARPU_TEST_SKIPPED;
 }

+ 139 - 0
tests/datawizard/gpu_register.c

@@ -0,0 +1,139 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include <starpu_cuda.h>
+#include "../helper.h"
+#include "scal.h"
+
+int main(int argc, char **argv)
+{
+	int ret;
+#ifdef STARPU_USE_CUDA
+#if CUDART_VERSION >= 4000
+	unsigned *foo_gpu;
+	unsigned *foo;
+	starpu_data_handle_t handle;
+	int n, i, size, pieces;
+	int devid;
+	unsigned workerid;
+	int chosen = -1;
+	cudaError_t cures;
+#endif
+#endif
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_CUDA
+#if CUDART_VERSION >= 4000 /* We need thread-safety of CUDA */
+	/* TODO OpenCL, too */
+	for (workerid = 0; workerid < starpu_worker_get_count(); workerid++) {
+		if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER) {
+			chosen = workerid;
+			break;
+		}
+	}
+
+	if (chosen == -1)
+		return STARPU_TEST_SKIPPED;
+
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_load_opencl_from_file("tests/datawizard/scal_opencl.cl", &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+#endif
+
+	n = starpu_worker_get_count();
+	size = 10 * n;
+
+	devid = starpu_worker_get_devid(chosen);
+	cudaSetDevice(devid);
+	cudaMalloc((void**)&foo_gpu, size * sizeof(*foo_gpu));
+
+	foo = calloc(size, sizeof(*foo));
+	for (i = 0; i < size; i++)
+		foo[i] = i;
+
+	cures = cudaMemcpy(foo_gpu, foo, size * sizeof(*foo_gpu), cudaMemcpyHostToDevice);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	starpu_vector_data_register(&handle, starpu_worker_get_memory_node(chosen), (uintptr_t)foo_gpu, size, sizeof(*foo_gpu));
+
+	/* Broadcast the data to force in-place partitioning */
+	for (i = 0; i < n; i++)
+		starpu_data_prefetch_on_node(handle, starpu_worker_get_memory_node(i), 0);
+
+	/* Even with just one worker, split in at least two */
+	if (n == 1)
+		pieces = 2;
+	else
+		pieces = n;
+
+	struct starpu_data_filter f =
+	{
+		.filter_func = starpu_block_filter_func_vector,
+		.nchildren = pieces,
+	};
+
+	starpu_data_partition(handle, &f);
+
+	for (i = 0; i < pieces; i++) {
+		struct starpu_task *task = starpu_task_create();
+
+		task->handles[0] = starpu_data_get_sub_data(handle, 1, i);
+		task->cl = &scal_codelet;
+		task->execute_on_a_specific_worker = 1;
+		task->workerid = i%n;
+
+		ret = starpu_task_submit(task);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+
+	starpu_data_unpartition(handle, starpu_worker_get_memory_node(chosen));
+	starpu_data_unregister(handle);
+
+	cudaSetDevice(devid);
+	cures = cudaMemcpy(foo, foo_gpu, size * sizeof(*foo_gpu), cudaMemcpyDeviceToHost);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	starpu_shutdown();
+
+	for (i = 0; i < size; i++) {
+		if (foo[i] != i*2) {
+			fprintf(stderr,"value %d is %d instead of %d\n", i, foo[i], 2*i);
+			return EXIT_FAILURE;
+		}
+	}
+
+        return EXIT_SUCCESS;
+
+enodev:
+	starpu_data_unregister(handle);
+#endif
+#endif
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+}

+ 23 - 17
tests/datawizard/handle_to_pointer.c

@@ -19,6 +19,7 @@
 
 #include <starpu.h>
 #include <stdlib.h>
+#include "../helper.h"
 
 static void cpu_task(void **buffers, void *args)
 {
@@ -27,7 +28,7 @@ static void cpu_task(void **buffers, void *args)
 	size_t size;
 
 	numbers = (int *) STARPU_VECTOR_GET_PTR(buffers[0]);
-	starpu_unpack_cl_args (args, &size);
+	starpu_codelet_unpack_args (args, &size);
 
 	for(i = 0; i < size; i++)
 	{
@@ -43,7 +44,7 @@ static void cuda_task(void **buffers, void *args)
 	size_t size;
 
 	numbers = (int *)STARPU_VECTOR_GET_PTR(buffers[0]);
-	starpu_unpack_cl_args (args, &size);
+	starpu_codelet_unpack_args (args, &size);
 
 	for(i = 0; i < size; i++)
 	{
@@ -52,41 +53,44 @@ static void cuda_task(void **buffers, void *args)
 }
 #endif
 
-static starpu_codelet cl = {
+static struct starpu_codelet cl =
+{
 	.where = STARPU_CPU | STARPU_CUDA,
-	.cpu_func = cpu_task,
+	.cpu_funcs = {cpu_task, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = cuda_task,
+	.cuda_funcs = {cuda_task, NULL},
 #endif
-	.nbuffers = 1
+	.nbuffers = 1,
+	.modes = {STARPU_W}
 };
 
 int main(int argc, char *argv[])
 {
-	int err;
+	int err, ret;
 	size_t i;
 	int *pointer;
-	starpu_data_handle handle;
+	starpu_data_handle_t handle;
 	static const size_t count = 123;
 
-	starpu_init(NULL);
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 
 	err = starpu_malloc((void **)&pointer, count * sizeof(int));
-	assert((err == 0) && (pointer != NULL));
+	STARPU_ASSERT((err == 0) && (pointer != NULL));
 
 	starpu_variable_data_register(&handle, 0, (uintptr_t)pointer,
 				      sizeof(int));
-	assert(starpu_handle_to_pointer(handle, 0) == pointer);
+	STARPU_ASSERT(starpu_handle_to_pointer(handle, 0) == pointer);
 	starpu_data_unregister(handle);
 
 	starpu_vector_data_register(&handle, 0, (uintptr_t)pointer,
 				    count, sizeof(int));
-	assert(starpu_handle_to_pointer(handle, 0) == pointer);
+	STARPU_ASSERT(starpu_handle_to_pointer(handle, 0) == pointer);
 	starpu_data_unregister(handle);
 
 	starpu_matrix_data_register(&handle, 0, (uintptr_t)pointer, 0,
 				    count, 1, sizeof(int));
-	assert(starpu_handle_to_pointer(handle, 0) == pointer);
+	STARPU_ASSERT(starpu_handle_to_pointer(handle, 0) == pointer);
 	starpu_data_unregister(handle);
 
 	starpu_free(pointer);
@@ -95,24 +99,26 @@ int main(int argc, char *argv[])
 	/* Lazy allocation.  */
 	starpu_vector_data_register(&handle, -1, 0 /* NULL */,
 				    count, sizeof(int));
-	assert(starpu_handle_to_pointer(handle, 0) == NULL);
+	STARPU_ASSERT(starpu_handle_to_pointer(handle, 0) == NULL);
 
 	/* Pass the handle to a task.  */
-	starpu_insert_task(&cl,
+	err = starpu_insert_task(&cl,
 			   STARPU_W, handle,
 			   STARPU_VALUE, &count, sizeof(count),
 			   0);
+	if (err == -ENODEV)
+		return STARPU_TEST_SKIPPED;
 
 	/* Acquire the handle, forcing a local allocation.  */
 	starpu_data_acquire(handle, STARPU_R);
 
 	/* Make sure we have a local pointer to it.  */
 	pointer = (int *) starpu_handle_to_pointer(handle, 0);
-	assert(pointer != NULL);
+	STARPU_ASSERT(pointer != NULL);
 	for(i = 0; i < count; i++)
 	{
 		int *numbers = (int *)pointer;
-		assert(numbers[i] == i);
+		STARPU_ASSERT(numbers[i] == i);
 	}
 	starpu_data_release(handle);
 

+ 102 - 0
tests/datawizard/in_place_partition.c

@@ -0,0 +1,102 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include "../helper.h"
+#include "scal.h"
+
+int main(int argc, char **argv)
+{
+	unsigned *foo;
+	starpu_data_handle_t handle;
+	int ret;
+	int n, i, size;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_load_opencl_from_file("tests/datawizard/scal_opencl.cl", &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+#endif
+
+	n = starpu_worker_get_count();
+	if (n == 1)
+	{
+		starpu_shutdown();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	size = 10 * n;
+
+	foo = (unsigned *) calloc(size, sizeof(*foo));
+	for (i = 0; i < size; i++)
+		foo[i] = i;
+
+	starpu_vector_data_register(&handle, 0, (uintptr_t)foo, size, sizeof(*foo));
+
+	/* Broadcast the data to force in-place partitioning */
+	for (i = 0; i < n; i++)
+		starpu_data_prefetch_on_node(handle, starpu_worker_get_memory_node(i), 0);
+
+	struct starpu_data_filter f =
+	{
+		.filter_func = starpu_block_filter_func_vector,
+		.nchildren = n,
+	};
+
+	starpu_data_partition(handle, &f);
+
+	for (i = 0; i < f.nchildren; i++) {
+		struct starpu_task *task = starpu_task_create();
+
+		task->handles[0] = starpu_data_get_sub_data(handle, 1, i);
+		task->cl = &scal_codelet;
+		task->execute_on_a_specific_worker = 1;
+		task->workerid = i;
+
+		ret = starpu_task_submit(task);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+
+	starpu_data_unpartition(handle, 0);
+	starpu_data_unregister(handle);
+	starpu_shutdown();
+
+	ret = EXIT_SUCCESS;
+	for (i = 0; i < size; i++) {
+		if (foo[i] != i*2) {
+			FPRINTF(stderr,"value %d is %d instead of %d\n", i, foo[i], 2*i);
+			ret = EXIT_FAILURE;
+		}
+	}
+
+        return ret;
+
+enodev:
+	starpu_data_unregister(handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+}

+ 78 - 31
tests/datawizard/increment_redux.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,7 +15,9 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <config.h>
 #include <starpu.h>
+#include "../helper.h"
 
 #ifdef STARPU_USE_CUDA
 #include <starpu_cuda.h>
@@ -24,9 +26,10 @@
 #include <starpu_opencl.h>
 #endif
 
+#warning memory leak
 
 static unsigned var = 0;
-static starpu_data_handle handle;
+static starpu_data_handle_t handle;
 
 /*
  *	Reduction methods
@@ -35,6 +38,8 @@ static starpu_data_handle handle;
 #ifdef STARPU_USE_CUDA
 static void redux_cuda_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *src = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
@@ -53,6 +58,8 @@ static void redux_cuda_kernel(void *descr[], void *arg)
 
 static void neutral_cuda_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	/* This is a dummy technique of course */
@@ -65,6 +72,8 @@ static void neutral_cuda_kernel(void *descr[], void *arg)
 #ifdef STARPU_USE_OPENCL
 static void redux_opencl_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned h_dst, h_src;
 
 	cl_mem d_dst = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
@@ -79,18 +88,20 @@ static void redux_opencl_kernel(void *descr[], void *arg)
 
 	h_dst += h_src;
 
-	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL); 
+	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
 }
 
 static void neutral_opencl_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned h_dst = 0;
 	cl_mem d_dst = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	cl_command_queue queue;
 	starpu_opencl_get_current_queue(&queue);
 
-	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL); 
+	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
 }
 #endif
 
@@ -98,6 +109,8 @@ static void neutral_opencl_kernel(void *descr[], void *arg)
 
 static void redux_cpu_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *src = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
 	*dst = *dst + *src;
@@ -105,31 +118,35 @@ static void redux_cpu_kernel(void *descr[], void *arg)
 
 static void neutral_cpu_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	*dst = 0;
 }
 
-static starpu_codelet redux_cl = {
+static struct starpu_codelet redux_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 #ifdef STARPU_USE_CUDA
-	.cuda_func = redux_cuda_kernel,
+	.cuda_funcs = {redux_cuda_kernel, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = redux_opencl_kernel,
+	.opencl_funcs = {redux_opencl_kernel, NULL},
 #endif
-	.cpu_func = redux_cpu_kernel,
+	.cpu_funcs = {redux_cpu_kernel, NULL},
 	.nbuffers = 2
 };
 
-static starpu_codelet neutral_cl = {
+static struct starpu_codelet neutral_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
 #ifdef STARPU_USE_CUDA
-	.cuda_func = neutral_cuda_kernel,
+	.cuda_funcs = {neutral_cuda_kernel, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = neutral_opencl_kernel,
+	.opencl_funcs = {neutral_opencl_kernel, NULL},
 #endif
-	.cpu_func = neutral_cpu_kernel,
+	.cpu_funcs = {neutral_cpu_kernel, NULL},
 	.nbuffers = 1
 };
 
@@ -141,6 +158,8 @@ static starpu_codelet neutral_cl = {
 /* dummy OpenCL implementation */
 static void increment_opencl_kernel(void *descr[], void *cl_arg __attribute__((unused)))
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	cl_mem d_token = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned h_token;
 
@@ -149,7 +168,7 @@ static void increment_opencl_kernel(void *descr[], void *cl_arg __attribute__((u
 
 	clEnqueueReadBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
 	h_token++;
-	clEnqueueWriteBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL); 
+	clEnqueueWriteBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
 }
 #endif
 
@@ -157,6 +176,8 @@ static void increment_opencl_kernel(void *descr[], void *cl_arg __attribute__((u
 #ifdef STARPU_USE_CUDA
 static void increment_cuda_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned host_token;
 
@@ -173,25 +194,33 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 
 static void increment_cpu_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	*tokenptr = *tokenptr + 1;
 }
 
-static starpu_codelet increment_cl = {
+static struct starpu_codelet increment_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 #ifdef STARPU_USE_CUDA
-	.cuda_func = increment_cuda_kernel,
+	.cuda_funcs = {increment_cuda_kernel, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = increment_opencl_kernel,
+	.opencl_funcs = {increment_opencl_kernel, NULL},
 #endif
-	.cpu_func = increment_cpu_kernel,
-	.nbuffers = 1
+	.cpu_funcs = {increment_cpu_kernel, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_REDUX}
 };
 
 int main(int argc, char **argv)
 {
-	starpu_init(NULL);
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	starpu_variable_data_register(&handle, 0, (uintptr_t)&var, sizeof(unsigned));
 
@@ -208,26 +237,44 @@ int main(int argc, char **argv)
 		for (t = 0; t < ntasks; t++)
 		{
 			struct starpu_task *task = starpu_task_create();
-	
+
 			task->cl = &increment_cl;
-	
-			task->buffers[0].mode = STARPU_REDUX;
-			task->buffers[0].handle = handle;
-	
-			int ret = starpu_task_submit(task);
-			STARPU_ASSERT(!ret);
+			task->handles[0] = handle;
 
+			int ret = starpu_task_submit(task);
+			if (ret == -ENODEV) goto enodev;
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}
 
-		starpu_data_acquire(handle, STARPU_R);
-		STARPU_ASSERT(var == ntasks*(loop + 1));
+		ret = starpu_data_acquire(handle, STARPU_R);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+		if (var != ntasks * (loop+1))
+		{
+			starpu_data_release(handle);
+			starpu_data_unregister(handle);
+			goto err;
+		}
 		starpu_data_release(handle);
 	}
 
 	starpu_data_unregister(handle);
-	STARPU_ASSERT(var == ntasks*nloops);
-	
+	if (var != ntasks * nloops)
+		goto err;
+
+	starpu_shutdown();
+
+	return EXIT_SUCCESS;
+
+enodev:
+	starpu_data_unregister(handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+
+err:
 	starpu_shutdown();
+	STARPU_RETURN(EXIT_FAILURE);
 
-	return 0;
 }

+ 255 - 0
tests/datawizard/increment_redux_lazy.c

@@ -0,0 +1,255 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "../helper.h"
+
+#ifdef STARPU_USE_CUDA
+#include <starpu_cuda.h>
+#endif
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#endif
+
+#warning memory leak
+
+static starpu_data_handle_t handle;
+
+/*
+ *	Reduction methods
+ */
+
+#ifdef STARPU_USE_CUDA
+static void redux_cuda_kernel(void *descr[], void *arg)
+{
+	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned *src = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	unsigned host_dst, host_src;
+
+	/* This is a dummy technique of course */
+	cudaMemcpy(&host_src, src, sizeof(unsigned), cudaMemcpyDeviceToHost);
+	cudaMemcpy(&host_dst, dst, sizeof(unsigned), cudaMemcpyDeviceToHost);
+	cudaThreadSynchronize();
+
+	host_dst += host_src;
+
+	cudaMemcpy(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice);
+	cudaThreadSynchronize();
+}
+
+static void neutral_cuda_kernel(void *descr[], void *arg)
+{
+	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+
+	/* This is a dummy technique of course */
+	unsigned host_dst = 0;
+	cudaMemcpy(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice);
+	cudaThreadSynchronize();
+}
+#endif
+
+#ifdef STARPU_USE_OPENCL
+static void redux_opencl_kernel(void *descr[], void *arg)
+{
+	unsigned h_dst, h_src;
+
+	cl_mem d_dst = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
+	cl_mem d_src = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	cl_command_queue queue;
+	starpu_opencl_get_current_queue(&queue);
+
+	/* This is a dummy technique of course */
+	clEnqueueReadBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
+	clEnqueueReadBuffer(queue, d_src, CL_TRUE, 0, sizeof(unsigned), (void *)&h_src, 0, NULL, NULL);
+
+	h_dst += h_src;
+
+	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
+}
+
+static void neutral_opencl_kernel(void *descr[], void *arg)
+{
+	unsigned h_dst = 0;
+	cl_mem d_dst = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
+
+	cl_command_queue queue;
+	starpu_opencl_get_current_queue(&queue);
+
+	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
+}
+#endif
+
+
+
+static void redux_cpu_kernel(void *descr[], void *arg)
+{
+	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned *src = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
+	*dst = *dst + *src;
+}
+
+static void neutral_cpu_kernel(void *descr[], void *arg)
+{
+	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	*dst = 0;
+}
+
+static struct starpu_codelet redux_cl =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {redux_cuda_kernel, NULL},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {redux_opencl_kernel, NULL},
+#endif
+	.cpu_funcs = {redux_cpu_kernel, NULL},
+	.nbuffers = 2
+};
+
+static struct starpu_codelet neutral_cl =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {neutral_cuda_kernel, NULL},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {neutral_opencl_kernel, NULL},
+#endif
+	.cpu_funcs = {neutral_cpu_kernel, NULL},
+	.nbuffers = 1
+};
+
+/*
+ *	Increment codelet
+ */
+
+#ifdef STARPU_USE_OPENCL
+/* dummy OpenCL implementation */
+static void increment_opencl_kernel(void *descr[], void *cl_arg __attribute__((unused)))
+{
+	cl_mem d_token = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned h_token;
+
+	cl_command_queue queue;
+	starpu_opencl_get_current_queue(&queue);
+
+	clEnqueueReadBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
+	h_token++;
+	clEnqueueWriteBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
+}
+#endif
+
+
+#ifdef STARPU_USE_CUDA
+static void increment_cuda_kernel(void *descr[], void *arg)
+{
+	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned host_token;
+
+	/* This is a dummy technique of course */
+	cudaMemcpy(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost);
+	cudaThreadSynchronize();
+
+	host_token++;
+
+	cudaMemcpy(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice);
+	cudaThreadSynchronize();
+}
+#endif
+
+static void increment_cpu_kernel(void *descr[], void *arg)
+{
+	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	*tokenptr = *tokenptr + 1;
+}
+
+static struct starpu_codelet increment_cl =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {increment_cuda_kernel, NULL},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {increment_opencl_kernel, NULL},
+#endif
+	.cpu_funcs = {increment_cpu_kernel, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_REDUX}
+};
+
+int main(int argc, char **argv)
+{
+	int ret;
+	unsigned *var;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_variable_data_register(&handle, -1, (uintptr_t)NULL, sizeof(unsigned));
+
+	starpu_data_set_reduction_methods(handle, &redux_cl, &neutral_cl);
+
+	unsigned ntasks = 1024;
+	unsigned nloops = 16;
+
+	unsigned loop;
+	unsigned t;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		for (t = 0; t < ntasks; t++)
+		{
+			struct starpu_task *task = starpu_task_create();
+
+			task->cl = &increment_cl;
+			task->handles[0] = handle;
+
+			int ret = starpu_task_submit(task);
+			if (ret == -ENODEV) goto enodev;
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		}
+
+		ret = starpu_data_acquire(handle, STARPU_R);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+		var = (unsigned*) starpu_variable_get_local_ptr(handle);
+		STARPU_ASSERT(*var == ntasks*(loop + 1));
+		starpu_data_release(handle);
+	}
+
+	ret = starpu_data_acquire(handle, STARPU_R);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+	var = (unsigned*) starpu_variable_get_local_ptr(handle);
+	STARPU_ASSERT(*var == ntasks*nloops);
+	starpu_data_release(handle);
+	starpu_data_unregister(handle);
+
+	starpu_shutdown();
+
+	STARPU_RETURN(EXIT_SUCCESS);
+
+enodev:
+	starpu_data_unregister(handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	STARPU_RETURN(STARPU_TEST_SKIPPED);
+}

+ 102 - 30
tests/datawizard/increment_redux_v2.c

@@ -14,7 +14,9 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <config.h>
 #include <starpu.h>
+#include "../helper.h"
 
 #ifdef STARPU_USE_CUDA
 #include <starpu_cuda.h>
@@ -23,9 +25,10 @@
 #include <starpu_opencl.h>
 #endif
 
+#warning memory leak
 
 static unsigned var = 0;
-static starpu_data_handle handle;
+static starpu_data_handle_t handle;
 
 /*
  *	Reduction methods
@@ -34,6 +37,8 @@ static starpu_data_handle handle;
 #ifdef STARPU_USE_CUDA
 static void redux_cuda_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *src = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
@@ -52,6 +57,8 @@ static void redux_cuda_kernel(void *descr[], void *arg)
 
 static void neutral_cuda_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	/* This is a dummy technique of course */
@@ -64,6 +71,8 @@ static void neutral_cuda_kernel(void *descr[], void *arg)
 #ifdef STARPU_USE_OPENCL
 static void redux_opencl_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned h_dst, h_src;
 
 	cl_mem d_dst = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
@@ -78,18 +87,20 @@ static void redux_opencl_kernel(void *descr[], void *arg)
 
 	h_dst += h_src;
 
-	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL); 
+	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
 }
 
 static void neutral_opencl_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned h_dst = 0;
 	cl_mem d_dst = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	cl_command_queue queue;
 	starpu_opencl_get_current_queue(&queue);
 
-	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL); 
+	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
 }
 #endif
 
@@ -97,6 +108,8 @@ static void neutral_opencl_kernel(void *descr[], void *arg)
 
 static void redux_cpu_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *src = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
 	*dst = *dst + *src;
@@ -104,31 +117,35 @@ static void redux_cpu_kernel(void *descr[], void *arg)
 
 static void neutral_cpu_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	*dst = 0;
 }
 
-static starpu_codelet redux_cl = {
+static struct starpu_codelet redux_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 #ifdef STARPU_USE_CUDA
-	.cuda_func = redux_cuda_kernel,
+	.cuda_funcs = {redux_cuda_kernel, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = redux_opencl_kernel,
+	.opencl_funcs = {redux_opencl_kernel, NULL},
 #endif
-	.cpu_func = redux_cpu_kernel,
+	.cpu_funcs = {redux_cpu_kernel, NULL},
 	.nbuffers = 2
 };
 
-static starpu_codelet neutral_cl = {
+static struct starpu_codelet neutral_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
 #ifdef STARPU_USE_CUDA
-	.cuda_func = neutral_cuda_kernel,
+	.cuda_funcs = {neutral_cuda_kernel, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = neutral_opencl_kernel,
+	.opencl_funcs = {neutral_opencl_kernel, NULL},
 #endif
-	.cpu_func = neutral_cpu_kernel,
+	.cpu_funcs = {neutral_cpu_kernel, NULL},
 	.nbuffers = 1
 };
 
@@ -140,6 +157,8 @@ static starpu_codelet neutral_cl = {
 /* dummy OpenCL implementation */
 static void increment_opencl_kernel(void *descr[], void *cl_arg __attribute__((unused)))
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	cl_mem d_token = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned h_token;
 
@@ -148,7 +167,7 @@ static void increment_opencl_kernel(void *descr[], void *cl_arg __attribute__((u
 
 	clEnqueueReadBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
 	h_token++;
-	clEnqueueWriteBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL); 
+	clEnqueueWriteBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
 }
 #endif
 
@@ -156,6 +175,8 @@ static void increment_opencl_kernel(void *descr[], void *cl_arg __attribute__((u
 #ifdef STARPU_USE_CUDA
 static void increment_cuda_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned host_token;
 
@@ -172,25 +193,47 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 
 static void increment_cpu_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	*tokenptr = *tokenptr + 1;
 }
 
-static starpu_codelet increment_cl = {
+static struct starpu_codelet increment_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 #ifdef STARPU_USE_CUDA
-	.cuda_func = increment_cuda_kernel,
+	.cuda_funcs = {increment_cuda_kernel, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = increment_opencl_kernel,
+	.opencl_funcs = {increment_opencl_kernel, NULL},
 #endif
-	.cpu_func = increment_cpu_kernel,
-	.nbuffers = 1
+	.cpu_funcs = {increment_cpu_kernel, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+static struct starpu_codelet increment_cl_redux =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {increment_cuda_kernel, NULL},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {increment_opencl_kernel, NULL},
+#endif
+	.cpu_funcs = {increment_cpu_kernel, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_REDUX}
 };
 
 int main(int argc, char **argv)
 {
-	starpu_init(NULL);
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	starpu_variable_data_register(&handle, 0, (uintptr_t)&var, sizeof(unsigned));
 
@@ -207,26 +250,55 @@ int main(int argc, char **argv)
 		for (t = 0; t < ntasks; t++)
 		{
 			struct starpu_task *task = starpu_task_create();
-	
-			task->cl = &increment_cl;
-	
-			task->buffers[0].mode = (t % 10 == 0)?STARPU_RW:STARPU_REDUX;
-			task->buffers[0].handle = handle;
-	
-			int ret = starpu_task_submit(task);
-			STARPU_ASSERT(!ret);
 
+			if (t % 10 == 0)
+			{
+				task->cl = &increment_cl;
+			}
+			else
+			{
+				task->cl = &increment_cl_redux;
+			}
+			task->handles[0] = handle;
+
+			int ret = starpu_task_submit(task);
+			if (ret == -ENODEV) goto enodev;
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}
 
-		starpu_data_acquire(handle, STARPU_R);
-		STARPU_ASSERT(var == ntasks*(loop + 1));
+		ret = starpu_data_acquire(handle, STARPU_R);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+		if (var != ntasks *(loop+1))
+		{
+			_STARPU_DEBUG("%d != %d\n", var, ntasks*(loop+1));
+			starpu_data_release(handle);
+			starpu_data_unregister(handle);
+			goto err;
+		}
 		starpu_data_release(handle);
 	}
 
 	starpu_data_unregister(handle);
-	STARPU_ASSERT(var == ntasks*nloops);
+	if (var != ntasks *nloops)
+	{
+		_STARPU_DEBUG("%d != %d\n", var, ntasks*nloops);
+		goto err;
+	}
 	
+
 	starpu_shutdown();
 
-	return 0;
+	return EXIT_SUCCESS;
+
+enodev:
+	starpu_data_unregister(handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+
+err:
+	starpu_shutdown();
+	STARPU_RETURN(EXIT_FAILURE);
 }
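Note (editorial): the hunks above switch this test to explicit STARPU_RW/STARPU_REDUX access modes, but they do not show where the reduction codelets are attached to the data handle. In StarPU 1.0 that wiring is normally done right after registering the variable, with starpu_data_set_reduction_methods(); a minimal sketch using the codelets defined above (the call site itself is an assumption, not part of this diff):

	/* Sketch, not part of this diff: attach the reduction and
	 * initialization codelets to 'handle' (registered in main()),
	 * so that STARPU_REDUX accesses made through increment_cl_redux
	 * are combined with redux_cl and initialized with neutral_cl. */
	starpu_data_set_reduction_methods(handle, &redux_cl, &neutral_cl);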

+ 70 - 0
tests/datawizard/interfaces/bcsr/bcsr_cuda.cu

@@ -0,0 +1,70 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+#include <starpu_cuda.h>
+#include "../test_interfaces.h"
+
+extern struct test_config bcsr_config;
+
+__global__ void bcsr_cuda(int *nzval, uint32_t nnz, int *err, int factor)
+{
+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
+
+	if (i >= nnz)
+		return;
+
+	if (nzval[i] != i*factor)
+		*err = 1;
+	else
+		nzval[i] = -nzval[i];
+}
+
+extern "C" void test_bcsr_cuda_func(void *buffers[], void *args)
+{
+	int factor;
+	int *ret;
+	int *val;
+	cudaError_t error;
+	uint32_t nnz = STARPU_BCSR_GET_NNZ(buffers[0]);
+	unsigned threads_per_block = 64;
+	unsigned nblocks = (nnz + threads_per_block-1) / threads_per_block;
+
+	factor = *(int *) args;
+	//val = (int *) starpu_bcsr_get_local_nzval((starpu_data_handle_t)buffers[0]);
+	val = (int *) STARPU_BCSR_GET_NZVAL(buffers[0]);
+
+	error = cudaMalloc(&ret, sizeof(int));
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+	error = cudaMemcpy(ret,
+			   &bcsr_config.copy_failed,
+			   sizeof(int),
+			   cudaMemcpyHostToDevice);
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+        bcsr_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>
+		(val, nnz, ret, factor);
+
+	error = cudaMemcpy(&bcsr_config.copy_failed,
+			   ret,
+			   sizeof(int),
+			   cudaMemcpyDeviceToHost);
+	
+	cudaFree(ret);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}

+ 198 - 0
tests/datawizard/interfaces/bcsr/bcsr_interface.c

@@ -0,0 +1,198 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <config.h>
+#include <starpu.h>
+#include "../test_interfaces.h"
+#include "../../../helper.h"
+
+/*
+ * XXX : These values should not be changed. If you really understand all that
+ * BCSR stuff, feel free to write a better example :)
+ */
+
+/* Size of the matrix */
+#define WIDTH          4
+#define HEIGHT         4
+#define SIZE           (WIDTH * HEIGHT)
+
+/* Size of the blocks */
+#define R              2
+#define C              2
+#define BLOCK_SIZE     (R*C)
+
+/* The matrix is simply 0 1 2... There are SIZE-1 non zero values... */
+#define NNZ            (SIZE-1)
+
+/* ... and SIZE/BLOCK_SIZE non zero blocks */
+#define NNZ_BLOCKS     (SIZE/BLOCK_SIZE)
+
+
+#ifdef STARPU_USE_CPU
+static void test_bcsr_cpu_func(void *buffers[], void *args);
+#endif /* !STARPU_USE_CPU */
+#ifdef STARPU_USE_CUDA
+extern void test_bcsr_cuda_func(void *buffers[], void *_args);
+#endif
+#ifdef STARPU_USE_OPENCL
+extern void test_bcsr_opencl_func(void *buffers[], void *args);
+#endif
+
+
+static int nzval[NNZ];
+static int nzval2[NNZ];
+
+static uint32_t colind[NNZ_BLOCKS];
+static uint32_t colind2[NNZ_BLOCKS];
+
+static uint32_t rowptr[1+WIDTH/R];
+static uint32_t rowptr2[1+WIDTH/R];
+
+static starpu_data_handle_t bcsr_handle;
+static starpu_data_handle_t bcsr2_handle;
+
+
+struct test_config bcsr_config =
+{
+#ifdef STARPU_USE_CPU
+	.cpu_func      = test_bcsr_cpu_func,
+#endif /* !STARPU_USE_CPU */
+#ifdef STARPU_USE_CUDA
+	.cuda_func     = test_bcsr_cuda_func,
+#endif /* !STARPU_USE_CUDA */
+#ifdef STARPU_USE_OPENCL
+	.opencl_func   = test_bcsr_opencl_func,
+#endif /* !STARPU_USE_OPENCL */
+	.handle        = &bcsr_handle,
+	.dummy_handle  = &bcsr2_handle,
+	.copy_failed   = 0,
+	.name          = "bcsr_interface"
+};
+
+static void
+register_data(void)
+{
+	int i;
+
+	for (i = 0; i < NNZ; i++)
+		nzval[i] = i;
+
+	colind[0] = 0;
+	colind[1] = 2;
+	colind[2] = 0;
+	colind[3] = 2;
+
+	rowptr[0] = 0;
+	rowptr[1] = 2;
+	rowptr[2] = 4;
+	
+	starpu_bcsr_data_register(&bcsr_handle,
+				  0,
+				  NNZ_BLOCKS,
+				  HEIGHT/R,
+				  (uintptr_t) nzval,
+				  colind,
+				  rowptr,
+				  0,
+				  R,
+				  C,
+				  sizeof(nzval[0]));
+
+	starpu_bcsr_data_register(&bcsr2_handle,
+				  0,
+				  NNZ_BLOCKS,
+				  HEIGHT/R,
+				  (uintptr_t) nzval2,
+				  colind2,
+				  rowptr2,
+				  0,
+				  R,
+				  C,
+				  sizeof(nzval2[0]));
+}
+
+static void
+unregister_data(void)
+{
+	starpu_data_unregister(bcsr_handle);
+	starpu_data_unregister(bcsr2_handle);
+}
+
+static void
+test_bcsr_cpu_func(void *buffers[], void *args)
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	int *val;
+	int factor;
+	int i;
+
+	uint32_t nnz = STARPU_BCSR_GET_NNZ(buffers[0]);
+	val = (int *) STARPU_BCSR_GET_NZVAL(buffers[0]);
+	factor = *(int *) args;
+
+	for (i = 0; i < nnz; i++)
+	{
+		if (val[i] != i * factor)
+		{
+			bcsr_config.copy_failed = 1;
+			return;
+		}
+		val[i] *= -1;
+	}
+
+	/* Check colind */
+	uint32_t *col = STARPU_BCSR_GET_COLIND(buffers[0]);
+	for (i = 0; i < NNZ_BLOCKS; i++)
+		if (col[i] != colind[i])
+			bcsr_config.copy_failed = 1;
+
+	/* Check rowptr */
+	uint32_t *row = STARPU_BCSR_GET_ROWPTR(buffers[0]);
+	for (i = 0; i < 1 + WIDTH/R; i++)
+		if (row[i] != rowptr[i])
+			bcsr_config.copy_failed = 1;
+}
+
+int
+main(void)
+{
+	data_interface_test_summary *summary;
+	struct starpu_conf conf =
+	{
+		.ncpus   = -1,
+		.ncuda   = 2,
+		.nopencl = 1
+	};
+
+	if (starpu_init(&conf) == -ENODEV)
+		return STARPU_TEST_SKIPPED;
+
+	register_data();
+
+	summary = run_tests(&bcsr_config);
+	if (!summary)
+		exit(EXIT_FAILURE);
+
+	unregister_data();
+
+	starpu_shutdown();
+
+	data_interface_test_summary_print(stderr, summary);
+
+	return data_interface_test_summary_success(summary);
+}
+
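Note (editorial): bcsr_interface.c only declares the per-device test functions and fills in the test_config structure; the codelet that dispatches to them is built by the shared driver behind ../test_interfaces.h, which is not part of this excerpt. For reference, grouping these functions in the codelet style used earlier in this commit would look roughly like the sketch below (bcsr_test_cl and its access mode are illustrative, not taken from the diff):

	/* Illustrative only: the real codelet is created by the generic
	 * interface test driver, not by this file. */
	static struct starpu_codelet bcsr_test_cl =
	{
		.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
	#ifdef STARPU_USE_CPU
		.cpu_funcs = {test_bcsr_cpu_func, NULL},
	#endif
	#ifdef STARPU_USE_CUDA
		.cuda_funcs = {test_bcsr_cuda_func, NULL},
	#endif
	#ifdef STARPU_USE_OPENCL
		.opencl_funcs = {test_bcsr_opencl_func, NULL},
	#endif
		.nbuffers = 1,
		.modes = {STARPU_RW}
	};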

+ 130 - 0
tests/datawizard/interfaces/bcsr/bcsr_opencl.c

@@ -0,0 +1,130 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <config.h>
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include "../test_interfaces.h"
+
+#define KERNEL_LOCATION "tests/datawizard/interfaces/bcsr/bcsr_opencl_kernel.cl"
+extern struct test_config bcsr_config;
+static struct starpu_opencl_program opencl_program;
+
+void
+test_bcsr_opencl_func(void *buffers[], void *args)
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	int id, devid, ret;
+	int factor = *(int *) args;
+
+        cl_int             err;
+	cl_kernel          kernel;
+	cl_command_queue   queue;
+	cl_event           event;
+
+	ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION, &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+
+	uint32_t nnz = STARPU_BCSR_GET_NNZ(buffers[0]);
+	cl_mem nzval = (cl_mem)STARPU_BCSR_GET_NZVAL(buffers[0]);
+
+	cl_context context;
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+	starpu_opencl_get_context(devid, &context);
+
+	cl_mem fail = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+		sizeof(int), &bcsr_config.copy_failed, &err);
+
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+
+	err = starpu_opencl_load_kernel(&kernel,
+					&queue,
+					&opencl_program,
+					"test_bcsr_opencl",
+					devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	int nargs;
+	nargs = starpu_opencl_set_kernel_args(&err, &kernel,
+					      sizeof(nzval), &nzval,
+					      sizeof(nnz), &nnz,
+					      sizeof(fail), &fail,
+					      sizeof(factor), &factor,
+					      0);
+
+	if (nargs != 4)
+	{
+		fprintf(stderr, "Failed to set argument #%d\n", err);
+		STARPU_OPENCL_REPORT_ERROR(err);
+	}
+			
+	{
+		size_t global = nnz;
+		size_t local;
+                size_t s;
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+
+                err = clGetKernelWorkGroupInfo (kernel,
+						device,
+						CL_KERNEL_WORK_GROUP_SIZE,
+						sizeof(local),
+						&local,
+						&s);
+                if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+
+                if (local > global)
+			local = global;
+
+		err = clEnqueueNDRangeKernel(queue,
+					kernel,
+					1,
+					NULL,
+					&global,
+					&local,
+					0,
+					NULL,
+					&event);
+
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	err = clEnqueueReadBuffer(queue,
+				  fail,
+				  CL_TRUE,
+				  0, 
+				  sizeof(int),
+				  &bcsr_config.copy_failed,
+				  0,
+				  NULL,
+				  NULL);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+        starpu_opencl_unload_opencl(&opencl_program);
+}

+ 29 - 0
tests/datawizard/interfaces/bcsr/bcsr_opencl_kernel.cl

@@ -0,0 +1,29 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+__kernel void test_bcsr_opencl(__global int *val,
+			       unsigned int nx,
+			       __global int *err,
+			       int factor)
+{
+        const int i = get_global_id(0);
+        if (i >=  nx)
+		return;
+
+	if (val[i] != i * factor)
+		*err = 1;
+	else
+		val[i] = - val[i];
+}

+ 80 - 0
tests/datawizard/interfaces/block/block_cuda.cu

@@ -0,0 +1,80 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+#include <starpu_cuda.h>
+#include "../test_interfaces.h"
+
+extern struct test_config block_config;
+
+static __global__ void block_cuda(int *block,
+				  int nx, int ny, int nz,
+				  unsigned ldy, unsigned ldz,
+				  float factor, int *err)
+{
+        int i, j, k;
+	int val = 0;
+
+        for (k = 0; k < nz ;k++)
+	{
+                for (j = 0; j < ny ;j++)
+		{
+                        for(i = 0; i < nx ;i++)
+			{
+				if (block[(k*ldz)+(j*ldy)+i] != factor * val)
+				{
+					*err = 1;
+					return;
+				}
+				else
+				{
+					block[(k*ldz)+(j*ldy)+i] *= -1;
+					val++;
+				}
+			}
+                }
+        }
+}
+
+extern "C" void test_block_cuda_func(void *buffers[], void *args)
+{
+	cudaError_t error;
+	int *ret;
+
+	error = cudaMalloc(&ret, sizeof(int));
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+	error = cudaMemcpy(ret, &block_config.copy_failed, sizeof(int), cudaMemcpyHostToDevice);
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+	int nx = STARPU_BLOCK_GET_NX(buffers[0]);
+	int ny = STARPU_BLOCK_GET_NY(buffers[0]);
+	int nz = STARPU_BLOCK_GET_NZ(buffers[0]);
+        unsigned ldy = STARPU_BLOCK_GET_LDY(buffers[0]);
+        unsigned ldz = STARPU_BLOCK_GET_LDZ(buffers[0]);
+	int *block = (int *) STARPU_BLOCK_GET_PTR(buffers[0]);
+	int factor = *(int*) args;
+
+        block_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>
+		(block, nx, ny, nz, ldy, ldz, factor, ret);
+	error = cudaMemcpy(&block_config.copy_failed, ret, sizeof(int), cudaMemcpyDeviceToHost);
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+	cudaFree(ret);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}

+ 163 - 0
tests/datawizard/interfaces/block/block_interface.c

@@ -0,0 +1,163 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <config.h>
+#include <starpu.h>
+#include "../test_interfaces.h"
+#include "../../../helper.h"
+
+#define NX 16
+#define NY NX
+#define NZ NX
+
+/* Prototypes */
+static void register_data(void);
+static void unregister_data(void);
+static void test_block_cpu_func(void *buffers[], void *args);
+#ifdef STARPU_USE_CUDA
+extern void test_block_cuda_func(void *buffers[], void *_args);
+#endif
+#ifdef STARPU_USE_OPENCL
+extern void test_block_opencl_func(void *buffers[], void *args);
+#endif
+
+
+static starpu_data_handle_t block_handle;
+static starpu_data_handle_t block2_handle;
+
+struct test_config block_config =
+{
+	.cpu_func      = test_block_cpu_func,
+#ifdef STARPU_USE_CUDA
+	.cuda_func     = test_block_cuda_func,
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_func   = test_block_opencl_func,
+#endif
+	.handle        = &block_handle,
+	.dummy_handle  = &block2_handle,
+	.copy_failed   = 0,
+	.name          = "block_interface"
+};
+
+static int block[NX*NY*NZ];
+static int block2[NX*NY*NZ];
+
+static void
+register_data(void)
+{
+	/* Initializing data */
+	int val = 0;
+	int i, j, k;
+	for (k = 0; k < NZ; k++)
+		for (j = 0; j < NY; j++)
+			for (i = 0; i < NX; i++)
+                                block[(k*NX*NY)+(j*NX)+i] = val++;
+
+	/* Registering data */
+	starpu_block_data_register(&block_handle,
+                                    0,
+                                    (uintptr_t)block,
+				    NX,
+				    NX * NY,
+				    NX,
+				    NY,
+				    NZ,
+				    sizeof(block[0]));
+	starpu_block_data_register(&block2_handle,
+                                    0,
+                                    (uintptr_t)block2,
+				    NX,
+				    NX * NY,
+				    NX,
+				    NY,
+				    NZ,
+				    sizeof(block2[0]));
+}
+
+static void
+unregister_data(void)
+{
+	starpu_data_unregister(block_handle);
+	starpu_data_unregister(block2_handle);
+}
+
+static void test_block_cpu_func(void *buffers[], void *args)
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	int factor = *(int*)args;
+	int nx = STARPU_BLOCK_GET_NX(buffers[0]);
+	int ny = STARPU_BLOCK_GET_NY(buffers[0]);
+	int nz = STARPU_BLOCK_GET_NZ(buffers[0]);
+        unsigned ldy = STARPU_BLOCK_GET_LDY(buffers[0]);
+        unsigned ldz = STARPU_BLOCK_GET_LDZ(buffers[0]);
+	int *block = (int *) STARPU_BLOCK_GET_PTR(buffers[0]);
+	unsigned int i, j, k;
+	int val = 0;
+	block_config.copy_failed = 0;
+	for (k = 0; k < nz; k++)
+	{
+		for (j = 0; j < ny; j++)
+		{
+			for (i = 0; i < nx; i++)
+			{
+                                if (block[(k*ldz)+(j*ldy)+i] != factor * val)
+				{
+					block_config.copy_failed = 1;
+					return;
+				}
+				else
+				{
+					block[(k*ldz)+(j*ldy)+i] *= -1;
+					val++;
+				}
+			}
+		}
+	}
+}
+
+int
+main(void)
+{
+	data_interface_test_summary *summary;
+	struct starpu_conf conf =
+	{
+		.ncpus   = -1,
+		.ncuda   = 2,
+		.nopencl = 1
+	};
+
+	if (starpu_init(&conf) == -ENODEV)
+		goto enodev;
+
+	register_data();
+
+	summary = run_tests(&block_config);
+	if (!summary)
+		exit(EXIT_FAILURE);
+
+	unregister_data();
+
+	starpu_shutdown();
+
+	data_interface_test_summary_print(stderr, summary);
+
+	return data_interface_test_summary_success(summary);
+
+enodev:
+	return STARPU_TEST_SKIPPED;
+}
+

+ 120 - 0
tests/datawizard/interfaces/block/block_opencl.c

@@ -0,0 +1,120 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <config.h>
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include "../test_interfaces.h"
+
+#define KERNEL_LOCATION "tests/datawizard/interfaces/block/block_opencl_kernel.cl"
+extern struct test_config block_config;
+static struct starpu_opencl_program opencl_program;
+
+void
+test_block_opencl_func(void *buffers[], void *args)
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	int id, devid, ret;
+	int factor = *(int *) args;
+
+        cl_int             err;
+	cl_kernel          kernel;
+	cl_command_queue   queue;
+	cl_event           event;
+
+	ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION, &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+
+	int nx = STARPU_BLOCK_GET_NX(buffers[0]);
+	int ny = STARPU_BLOCK_GET_NY(buffers[0]);
+	int nz = STARPU_BLOCK_GET_NZ(buffers[0]);
+        unsigned ldy = STARPU_BLOCK_GET_LDY(buffers[0]);
+        unsigned ldz = STARPU_BLOCK_GET_LDZ(buffers[0]);
+	cl_mem block = (cl_mem) STARPU_BLOCK_GET_DEV_HANDLE(buffers[0]);
+
+	cl_context context;
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+	starpu_opencl_get_context(devid, &context);
+
+	cl_mem fail = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+		sizeof(int), &block_config.copy_failed, &err);
+
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+
+	err = starpu_opencl_load_kernel(&kernel,
+					&queue,
+					&opencl_program,
+					"block_opencl",
+					devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	int nargs;
+	nargs = starpu_opencl_set_kernel_args(&err, &kernel,
+					      sizeof(block), &block,
+					      sizeof(nx), &nx,
+					      sizeof(ny), &ny,
+					      sizeof(nz), &nz,
+					      sizeof(ldy), &ldy,
+					      sizeof(ldz), &ldz,
+					      sizeof(factor), &factor,
+					      sizeof(fail), &fail,
+					      0);
+
+	if (nargs != 8)
+	{
+		fprintf(stderr, "Failed to set argument #%d\n", nargs);
+		STARPU_OPENCL_REPORT_ERROR(err);
+	}
+			
+	{
+		size_t global = nx * ny * nz;
+		err = clEnqueueNDRangeKernel(queue,
+					     kernel,
+					     1,
+					     NULL,
+					     &global,
+					     NULL,
+					     0,
+					     NULL,
+					     &event);
+
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	err = clEnqueueReadBuffer(queue,
+				  fail,
+				  CL_TRUE,
+				  0, 
+				  sizeof(int),
+				  &block_config.copy_failed,
+				  0,
+				  NULL,
+				  NULL);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+        starpu_opencl_unload_opencl(&opencl_program);
+}

+ 46 - 0
tests/datawizard/interfaces/block/block_opencl_kernel.cl

@@ -0,0 +1,46 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+__kernel void block_opencl(__global int *block,
+			   int nx, int ny, int nz,
+			   int ldy, int ldz,
+			   int factor, __global int *err)
+{
+        const int id = get_global_id(0);
+	if (id > 0)
+		return;
+
+	unsigned int i, j, k;
+	int val = 0;
+	for (k = 0; k < nz; k++)
+	{
+		for (j = 0; j < ny; j++)
+		{
+			for (i = 0; i < nx; i++)
+			{
+                                if (block[(k*ldz)+(j*ldy)+i] != factor * val)
+				{
+					*err = 1;
+					return;
+				}
+				else
+				{
+					block[(k*ldz)+(j*ldy)+i] *= -1;
+					val++;
+				}
+			}
+		}
+	}
+}

+ 106 - 0
tests/datawizard/interfaces/copy_interfaces.c

@@ -0,0 +1,106 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "../../helper.h"
+#include <datawizard/coherency.h>
+
+static int check_copy(starpu_data_handle_t handle, char *header)
+{
+	void *old_interface, *new_interface;
+	starpu_data_handle_t new_handle;
+	int ret=0;
+
+	starpu_data_register_same(&new_handle, handle);
+
+	if (!getenv("STARPU_SSILENT") && new_handle->ops->display)
+	{
+		fprintf(stderr, "%s: ", header);
+		new_handle->ops->display(new_handle, stderr);
+		fprintf(stderr, "\n");
+	}
+
+	old_interface = starpu_data_get_interface_on_node(handle, 0);
+	new_interface = starpu_data_get_interface_on_node(new_handle, 0);
+
+	if (new_handle->ops->compare(old_interface, new_interface) == 0)
+	{
+		FPRINTF(stderr, "Error when copying %s data\n", header);
+		assert(0);
+		ret = 1;
+	}
+	starpu_data_unregister(handle);
+	starpu_data_unregister(new_handle);
+	return ret;
+}
+
+int main(int argc, char **argv)
+{
+	int ret;
+	starpu_data_handle_t handle;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	{
+		int x=42;
+		starpu_variable_data_register(&handle, 0, (uintptr_t)&x, sizeof(x));
+		ret = check_copy(handle, "variable");
+	}
+
+	if (ret == 0)
+	{
+		int xx[] = {12, 23, 45};
+		starpu_vector_data_register(&handle, 0, (uintptr_t)xx, 3, sizeof(xx[0]));
+		ret = check_copy(handle, "vector");
+	}
+
+	if (ret == 0)
+	{
+		int NX=3;
+		int NY=2;
+		int matrix[NX][NY];
+		starpu_matrix_data_register(&handle, 0, (uintptr_t)matrix, NX, NX, NY, sizeof(matrix[0]));
+		ret = check_copy(handle, "matrix");
+	}
+
+	if (ret == 0)
+	{
+		int NX=3;
+		int NY=2;
+		int NZ=4;
+		int block[NX*NY*NZ];
+		starpu_block_data_register(&handle, 0, (uintptr_t)block, NX, NX*NY, NX, NY, NZ, sizeof(block[0]));
+		ret = check_copy(handle, "block");
+	}
+
+	if (ret == 0)
+	{
+		uint32_t nnz = 2;
+		unsigned nrow = 5;
+		float nzvalA[20];
+		uint32_t colind[1];
+		uint32_t rowptr[2];
+		starpu_csr_data_register(&handle, 0, nnz, nrow, (uintptr_t)nzvalA, colind, rowptr, 0, sizeof(float));
+		ret = check_copy(handle, "csr");
+	}
+
+	starpu_shutdown();
+	return ret;
+}
+
+
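Note (editorial): check_copy() relies on starpu_data_register_same(), which registers a new handle carrying the same interface parameters as an existing one, and on the interface's compare() method to verify that those parameters match. A minimal stand-alone sketch of the same call outside the test harness (variable names are illustrative):

	#include <starpu.h>

	int main(void)
	{
		starpu_data_handle_t vec, same;
		int data[128];

		if (starpu_init(NULL) != 0)
			return 1;

		starpu_vector_data_register(&vec, 0, (uintptr_t)data, 128, sizeof(data[0]));
		/* 'same' gets the same interface parameters (length, element
		 * size) as 'vec'; check_copy() above compares them with the
		 * interface's compare() method. */
		starpu_data_register_same(&same, vec);

		starpu_data_unregister(same);
		starpu_data_unregister(vec);
		starpu_shutdown();
		return 0;
	}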

+ 68 - 0
tests/datawizard/interfaces/csr/csr_cuda.cu

@@ -0,0 +1,68 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+#include <starpu_cuda.h>
+#include "../test_interfaces.h"
+
+extern struct test_config csr_config;
+
+__global__ void csr_cuda(int *nzval, uint32_t nnz, int *err, int factor)
+{
+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
+
+	if (i >= nnz)
+		return;
+
+	if (nzval[i] != (i+1)*factor)
+		*err = 1;
+	else
+		nzval[i] = -nzval[i];
+}
+
+extern "C" void test_csr_cuda_func(void *buffers[], void *args)
+{
+	int factor;
+	int *ret;
+	int *val;
+	cudaError_t error;
+	uint32_t nnz = STARPU_CSR_GET_NNZ(buffers[0]);
+	unsigned threads_per_block = 64;
+	unsigned nblocks = (nnz + threads_per_block-1) / threads_per_block;
+
+	factor = *(int *) args;
+	val = (int *) STARPU_CSR_GET_NZVAL(buffers[0]);
+
+	error = cudaMalloc(&ret, sizeof(int));
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+	error = cudaMemcpy(ret,
+			   &csr_config.copy_failed,
+			   sizeof(int),
+			   cudaMemcpyHostToDevice);
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+        csr_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>> (val, nnz, ret, factor);
+
+	error = cudaMemcpy(&csr_config.copy_failed,
+			   ret,
+			   sizeof(int),
+			   cudaMemcpyDeviceToHost);
+	
+	cudaFree(ret);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}

+ 170 - 0
tests/datawizard/interfaces/csr/csr_interface.c

@@ -0,0 +1,170 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <config.h>
+#include <starpu.h>
+#include "../test_interfaces.h"
+#include "../../../helper.h"
+
+#define WIDTH  8
+#define HEIGHT 4
+#define SIZE   (WIDTH * HEIGHT)
+#define NNZ    (SIZE-1)
+
+#ifdef STARPU_USE_CPU
+static void test_csr_cpu_func(void *buffers[], void *args);
+#endif /* !STARPU_USE_CPU */
+#ifdef STARPU_USE_CUDA
+extern void test_csr_cuda_func(void *buffers[], void *_args);
+#endif
+#ifdef STARPU_USE_OPENCL
+extern void test_csr_opencl_func(void *buffers[], void *args);
+#endif
+
+
+static int nzval[NNZ];
+static int nzval2[NNZ];
+
+static uint32_t colind[NNZ];
+static uint32_t colind2[NNZ];
+
+static uint32_t rowptr[HEIGHT+1];
+static uint32_t rowptr2[HEIGHT+1];
+
+static starpu_data_handle_t csr_handle;
+static starpu_data_handle_t csr2_handle;
+
+struct test_config csr_config =
+{
+#ifdef STARPU_USE_CPU
+	.cpu_func      = test_csr_cpu_func,
+#endif /* ! STARPU_USE_CPU */
+#ifdef STARPU_USE_CUDA
+	.cuda_func     = test_csr_cuda_func,
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_func   = test_csr_opencl_func,
+#endif
+	.handle        = &csr_handle,
+	.dummy_handle  = &csr2_handle,
+	.copy_failed   = 0,
+	.name          = "csr_interface"
+};
+
+static void
+register_data(void)
+{
+	int i;
+	for (i = 1; i < SIZE; i++)
+	{
+		nzval[i-1] = i;
+		nzval2[i-1] = 42;
+
+		colind[i-1] = i % WIDTH;
+		colind2[i-1] = colind[i];
+	}
+
+	rowptr[0] = 1;
+	rowptr2[0] = 1;
+	for (i = 1; i < HEIGHT; i++)
+	{
+		rowptr[i] = i * WIDTH;
+		rowptr2[i] = rowptr[i];
+	}
+	rowptr[HEIGHT] = NNZ + 1;
+	rowptr2[HEIGHT] = rowptr[HEIGHT];
+
+	starpu_csr_data_register(&csr_handle,
+				 0,
+				 NNZ,
+				 HEIGHT,
+				 (uintptr_t) nzval,
+				 colind,
+				 rowptr,
+				 0,
+				 sizeof(nzval[0]));
+	starpu_csr_data_register(&csr2_handle,
+				 0,
+				 NNZ,
+				 HEIGHT,
+				 (uintptr_t) nzval2,
+				 colind2,
+				 rowptr2,
+				 0,
+				 sizeof(nzval2[0]));
+}
+
+static void
+unregister_data(void)
+{
+	starpu_data_unregister(csr_handle);
+	starpu_data_unregister(csr2_handle);
+}
+
+static void
+test_csr_cpu_func(void *buffers[], void *args)
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	int *val;
+	int factor;
+	int i;
+
+	uint32_t nnz = STARPU_CSR_GET_NNZ(buffers[0]);
+	val = (int *) STARPU_CSR_GET_NZVAL(buffers[0]);
+	factor = *(int *) args;
+
+	for (i = 0; i < nnz; i++)
+	{
+		if (val[i] != (i+1) * factor)
+		{
+			csr_config.copy_failed = 1;
+			return;
+		}
+		val[i] *= -1;
+	}
+}
+
+int
+main(void)
+{
+	data_interface_test_summary *summary;
+	struct starpu_conf conf =
+	{
+		.ncpus   = -1,
+		.ncuda   = 2,
+		.nopencl = 1
+	};
+
+	if (starpu_init(&conf) == -ENODEV)
+		goto enodev;
+
+	register_data();
+
+	summary = run_tests(&csr_config);
+	if (!summary)
+		exit(EXIT_FAILURE);
+
+	unregister_data();
+
+	starpu_shutdown();
+
+	data_interface_test_summary_print(stderr, summary);
+
+	return data_interface_test_summary_success(summary);
+
+enodev:
+	return STARPU_TEST_SKIPPED;
+}

+ 130 - 0
tests/datawizard/interfaces/csr/csr_opencl.c

@@ -0,0 +1,130 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <config.h>
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include "../test_interfaces.h"
+
+#define KERNEL_LOCATION "tests/datawizard/interfaces/csr/csr_opencl_kernel.cl"
+extern struct test_config csr_config;
+static struct starpu_opencl_program opencl_program;
+
+void
+test_csr_opencl_func(void *buffers[], void *args)
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	int id, devid, ret;
+	int factor = *(int *) args;
+
+        cl_int             err;
+	cl_kernel          kernel;
+	cl_command_queue   queue;
+	cl_event           event;
+
+	ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION, &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+
+	uint32_t nnz = STARPU_CSR_GET_NNZ(buffers[0]);
+	cl_mem nzval = (cl_mem)STARPU_CSR_GET_NZVAL(buffers[0]);
+
+	cl_context context;
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+	starpu_opencl_get_context(devid, &context);
+
+	cl_mem fail = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+		sizeof(int), &csr_config.copy_failed, &err);
+
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+
+	err = starpu_opencl_load_kernel(&kernel,
+					&queue,
+					&opencl_program,
+					"test_csr_opencl",
+					devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	int nargs;
+	nargs = starpu_opencl_set_kernel_args(&err, &kernel,
+					      sizeof(nzval), &nzval,
+					      sizeof(nnz), &nnz,
+					      sizeof(fail), &fail,
+					      sizeof(factor), &factor,
+					      0);
+
+	if (nargs != 4)
+	{
+		fprintf(stderr, "Failed to set argument #%d\n", err);
+		STARPU_OPENCL_REPORT_ERROR(err);
+	}
+			
+	{
+		size_t global = nnz;
+		size_t local;
+                size_t s;
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+
+                err = clGetKernelWorkGroupInfo (kernel,
+						device,
+						CL_KERNEL_WORK_GROUP_SIZE,
+						sizeof(local),
+						&local,
+						&s);
+                if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+
+                if (local > global)
+			local = global;
+
+		err = clEnqueueNDRangeKernel(queue,
+					kernel,
+					1,
+					NULL,
+					&global,
+					&local,
+					0,
+					NULL,
+					&event);
+
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	err = clEnqueueReadBuffer(queue,
+				  fail,
+				  CL_TRUE,
+				  0, 
+				  sizeof(int),
+				  &csr_config.copy_failed,
+				  0,
+				  NULL,
+				  NULL);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+        starpu_opencl_unload_opencl(&opencl_program);
+}

+ 29 - 0
tests/datawizard/interfaces/csr/csr_opencl_kernel.cl

@@ -0,0 +1,29 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+__kernel void test_csr_opencl(__global int *val,
+			      unsigned int nx,
+			      __global int *err,
+			      int factor)
+{
+        const int i = get_global_id(0);
+        if (i >=  nx)
+		return;
+
+	if (val[i] != (i+1) * factor)
+		*err = 1;
+	else
+		val[i] = - val[i];
+}

+ 71 - 0
tests/datawizard/interfaces/matrix/matrix_cuda.cu

@@ -0,0 +1,71 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+#include <starpu_cuda.h>
+#include "../test_interfaces.h"
+
+extern struct test_config matrix_config;
+
+__global__ void matrix_cuda(int *val, unsigned n, int *err, int factor)
+{
+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
+
+	if (i >= n)
+		return;
+
+	if (val[i] != i*factor)
+		*err = 1;
+	else
+		val[i] = -val[i];
+}
+
+extern "C" void test_matrix_cuda_func(void *buffers[], void *args)
+{
+	int factor;
+	int *ret;
+	int *val;
+	cudaError_t error;
+	unsigned int nx, ny, n;
+
+	nx = STARPU_MATRIX_GET_NX(buffers[0]);
+	ny = STARPU_MATRIX_GET_NY(buffers[0]);
+	n = nx * ny;
+	unsigned threads_per_block = 64;
+	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
+	factor = *(int *) args;
+	val = (int *) STARPU_MATRIX_GET_PTR(buffers[0]);
+
+	error = cudaMalloc(&ret, sizeof(int));
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+	error = cudaMemcpy(ret,
+			   &matrix_config.copy_failed,
+			   sizeof(int),
+			   cudaMemcpyHostToDevice);
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+        matrix_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(val, n, ret, factor);
+
+	error = cudaMemcpy(&matrix_config.copy_failed,
+			   ret,
+			   sizeof(int),
+			   cudaMemcpyDeviceToHost);
+	
+	cudaFree(ret);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}

+ 145 - 0
tests/datawizard/interfaces/matrix/matrix_interface.c

@@ -0,0 +1,145 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <config.h>
+#include <starpu.h>
+#include "../test_interfaces.h"
+#include "../../../helper.h"
+
+#define WIDTH  16
+#define HEIGHT 16
+
+#ifdef STARPU_USE_CPU
+static void test_matrix_cpu_func(void *buffers[], void *args);
+#endif /* !STARPU_USE_CPU */
+#ifdef STARPU_USE_CUDA
+extern void test_matrix_cuda_func(void *buffers[], void *_args);
+#endif
+#ifdef STARPU_USE_OPENCL
+extern void test_matrix_opencl_func(void *buffers[], void *args);
+#endif
+
+
+static starpu_data_handle_t matrix_handle;
+static starpu_data_handle_t matrix2_handle;
+
+struct test_config matrix_config =
+{
+#ifdef STARPU_USE_CPU
+	.cpu_func      = test_matrix_cpu_func,
+#endif /* ! STARPU_USE_CPU */
+#ifdef STARPU_USE_CUDA
+	.cuda_func     = test_matrix_cuda_func,
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_func   = test_matrix_opencl_func,
+#endif
+	.handle        = &matrix_handle,
+	.dummy_handle  = &matrix2_handle,
+	.copy_failed   = 0,
+	.name          = "matrix_interface"
+};
+
+static int matrix[WIDTH * HEIGHT];
+static int matrix2[WIDTH * HEIGHT];
+
+static void
+register_data(void)
+{
+	int i;
+	int size = WIDTH * HEIGHT;
+	for (i = 0; i < size; i++)
+		matrix[i] = i;
+
+	starpu_matrix_data_register(&matrix_handle,
+				    0,
+				    (uintptr_t) matrix,
+				    WIDTH, /* ld */
+				    WIDTH,
+				    HEIGHT,
+				    sizeof(matrix[0]));
+	starpu_matrix_data_register(&matrix2_handle,
+				    0,
+				    (uintptr_t) matrix2,
+				    WIDTH, /* ld */
+				    WIDTH,
+				    HEIGHT,
+				    sizeof(matrix[0]));
+}
+
+static void
+unregister_data(void)
+{
+	starpu_data_unregister(matrix_handle);
+	starpu_data_unregister(matrix2_handle);
+}
+
+static void
+test_matrix_cpu_func(void *buffers[], void *args)
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	int *val;
+	int factor;
+	int i;
+	unsigned int nx, ny;
+
+	nx = STARPU_MATRIX_GET_NX(buffers[0]);
+	ny = STARPU_MATRIX_GET_NY(buffers[0]);
+	val = (int *) STARPU_MATRIX_GET_PTR(buffers[0]);
+	factor = *(int *) args;
+
+	for (i = 0; i < nx*ny; i++)
+	{
+		if (val[i] != i * factor)
+		{
+			matrix_config.copy_failed = 1;
+			return;
+		}
+		val[i] *= -1;
+	}
+}
+
+int
+main(void)
+{
+	data_interface_test_summary *summary;
+	struct starpu_conf conf =
+	{
+		.ncpus   = -1,
+		.ncuda   = 2,
+		.nopencl = 1
+	};
+
+	if (starpu_init(&conf) == -ENODEV)
+		goto enodev;
+
+	register_data();
+
+	summary = run_tests(&matrix_config);
+	if (!summary)
+		exit(EXIT_FAILURE);
+
+	unregister_data();
+
+	starpu_shutdown();
+
+	data_interface_test_summary_print(stderr, summary);
+
+	return data_interface_test_summary_success(summary);
+
+enodev:
+	return STARPU_TEST_SKIPPED;
+}

+ 129 - 0
tests/datawizard/interfaces/matrix/matrix_opencl.c

@@ -0,0 +1,129 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <config.h>
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include "../test_interfaces.h"
+
+#define KERNEL_LOCATION "tests/datawizard/interfaces/matrix/matrix_opencl_kernel.cl"
+
+extern struct test_config matrix_config;
+static struct starpu_opencl_program matrix_program;
+
+void test_matrix_opencl_func(void *buffers[], void *args)
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	int id, devid, factor, ret;
+	unsigned int n;
+
+        cl_int             err;
+	cl_kernel          kernel;
+	cl_command_queue   queue;
+	cl_event           event;
+	cl_context         context;
+	cl_mem             val, fail;
+
+	ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION,
+						  &matrix_program,
+						  NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+
+	factor = *(int *)args;
+	n = STARPU_MATRIX_GET_NX(buffers[0]);
+	n*= STARPU_MATRIX_GET_NY(buffers[0]);
+	val = (cl_mem)STARPU_MATRIX_GET_DEV_HANDLE(buffers[0]);
+
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+	starpu_opencl_get_context(devid, &context);
+
+	err = starpu_opencl_load_kernel(&kernel,
+					&queue,
+					&matrix_program,
+					"matrix_opencl",
+					devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	fail = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+		sizeof(int), &matrix_config.copy_failed, &err);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	/* Setting args */
+	int nargs;
+	nargs = starpu_opencl_set_kernel_args(&err, &kernel,
+					sizeof(val), &val,
+					sizeof(n), &n,
+					sizeof(fail), &fail,
+					sizeof(factor), &factor,
+					0);
+	if (nargs != 4)
+		STARPU_OPENCL_REPORT_ERROR(err);
+	{
+		size_t global=n;
+		size_t local;
+                size_t s;
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+
+                err = clGetKernelWorkGroupInfo (kernel,
+						device,
+						CL_KERNEL_WORK_GROUP_SIZE,
+						sizeof(local),
+						&local,
+						&s);
+                if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+
+                if (local > global)
+			local = global;
+
+		err = clEnqueueNDRangeKernel(queue,
+					kernel,
+					1,
+					NULL,
+					&global,
+					&local,
+					0,
+					NULL,
+					&event);
+
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	err = clEnqueueReadBuffer(queue,
+				  fail,
+				  CL_TRUE,
+				  0, 
+				  sizeof(int),
+				  &matrix_config.copy_failed,
+				  0,
+				  NULL,
+				  NULL);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+        starpu_opencl_unload_opencl(&matrix_program);
+}
+

+ 31 - 0
tests/datawizard/interfaces/matrix/matrix_opencl_kernel.cl

@@ -0,0 +1,31 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+__kernel void matrix_opencl(__global int *val,
+				 unsigned int nx,
+				 __global int *err,
+				 int factor)
+{
+        const int i = get_global_id(0);
+	if (i >= nx)
+		return;
+
+	if (val[i] != i * factor)
+		*err = i;
+	else
+		val[i] *= -1;
+}
+

+ 0 - 0
tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets.c


Some files were not shown because too many files changed in this diff.