Andra Hugo, 13 years ago
Commit 38e00e9d11
100 changed files with 7350 additions and 1006 deletions
  1. .gitignore (+63 -1)
  2. AUTHORS (+2 -0)
  3. ChangeLog (+85 -1)
  4. Makefile.am (+22 -8)
  5. README (+17 -1)
  6. README.dev (+43 -159)
  7. STARPU-VERSION (+21 -0)
  8. TODO (+9 -0)
  9. acinclude.m4 (+0 -95)
  10. configure.ac (+326 -107)
  11. libstarpu.pc.in (+2 -2)
  12. starpu-1.0.pc.in (+35 -0)
  13. starpu-top/StarPU-Top-common.pri (+1 -1)
  14. starpu-top/StarPU-Top-qwt-system.pri (+0 -2)
  15. starpu-top/StarPU-Top-qwt-system.pri.in (+2 -0)
  16. starpu-top/aboutdialog.ui (+1 -1)
  17. starpu-top/communicationmanager.cpp (+3 -3)
  18. starpu-top/communicationmanager.h (+3 -3)
  19. starpu-top/communicationthread.cpp (+3 -3)
  20. starpu-top/configurationmanager.h (+1 -1)
  21. starpu-top/dataaggregatorwidget.h (+1 -1)
  22. starpu-top/datawidget.h (+1 -1)
  23. starpu-top/extradist (+2 -4)
  24. starpu-top/ganttwidget.cpp (+9 -9)
  25. starpu-top/ganttwidget.h (+6 -6)
  26. starpu-top/images/starpu_top.png (+0 -0)
  27. starpu-top/interactivewidget.h (+1 -1)
  28. starpu-top/mainwindow.cpp (+11 -11)
  29. starpu-top/mainwindow.h (+4 -4)
  30. starpu-top/mainwindow.ui (+3 -3)
  31. starpu-top/preferencesdialog.h (+1 -1)
  32. starpu-top/resources.qrc (+1 -1)
  33. starpu-top/sessionsetupmanager.h (+1 -1)
  34. starpu-top/starputoptypes.h (+7 -7)
  35. starpu-top/taskmanager.cpp (+7 -7)
  36. starpu-top/taskmanager.h (+3 -3)
  37. starpufft/.gitignore (+1 -0)
  38. starpufft/Makefile.am (+97 -0)
  39. starpufft/cuda_kernels.cu (+19 -0)
  40. starpufft/cudaf_kernels.cu (+19 -0)
  41. starpufft/cudax_kernels.cu (+156 -0)
  42. starpufft/cudax_kernels.h (+23 -0)
  43. starpufft/double.h (+51 -0)
  44. starpufft/examples/test.c (+19 -0)
  45. starpufft/examples/test_threads.c (+19 -0)
  46. starpufft/examples/testf.c (+19 -0)
  47. starpufft/examples/testf_threads.c (+19 -0)
  48. starpufft/examples/testx.c (+283 -0)
  49. starpufft/examples/testx_threads.c (+113 -0)
  50. starpufft/float.h (+51 -0)
  51. starpufft/libstarpufft.pc.in (+27 -0)
  52. starpufft/starpufft-1.0.pc.in (+27 -0)
  53. starpufft/starpufft.c (+19 -0)
  54. starpufft/starpufft.h (+60 -0)
  55. starpufft/starpufft_common.c (+21 -0)
  56. starpufft/starpufftf.c (+19 -0)
  57. starpufft/starpufftx.c (+454 -0)
  58. starpufft/starpufftx1d.c (+847 -0)
  59. starpufft/starpufftx2d.c (+850 -0)
  60. tests/Makefile.am (+272 -32)
  61. tests/cholesky/prio.r (+23 -3)
  62. tests/cholesky/sched.r (+23 -3)
  63. tests/core/multithreaded_init.c (+0 -65)
  64. tests/core/task_wait_api.c (+0 -121)
  65. tests/datawizard/acquire_cb.c (+8 -5)
  66. tests/datawizard/acquire_cb_insert.c (+43 -25)
  67. tests/datawizard/acquire_release.c (+46 -18)
  68. tests/datawizard/acquire_release2.c (+42 -16)
  69. tests/datawizard/copy.c (+34 -30)
  70. tests/datawizard/critical_section_with_void_interface.c (+24 -15)
  71. tests/datawizard/data_implicit_deps.c (+66 -32)
  72. tests/datawizard/data_invalidation.c (+46 -30)
  73. tests/datawizard/data_lookup.c (+48 -39)
  74. tests/datawizard/dining_philosophers.c (+35 -18)
  75. tests/datawizard/double_parameter.c (+174 -0)
  76. tests/datawizard/dsm_stress.c (+142 -28)
  77. tests/datawizard/gpu_register.c (+139 -0)
  78. tests/datawizard/handle_to_pointer.c (+23 -17)
  79. tests/datawizard/in_place_partition.c (+102 -0)
  80. tests/datawizard/increment_redux.c (+78 -31)
  81. tests/datawizard/increment_redux_lazy.c (+255 -0)
  82. tests/datawizard/increment_redux_v2.c (+102 -30)
  83. tests/datawizard/interfaces/bcsr/bcsr_cuda.cu (+70 -0)
  84. tests/datawizard/interfaces/bcsr/bcsr_interface.c (+198 -0)
  85. tests/datawizard/interfaces/bcsr/bcsr_opencl.c (+130 -0)
  86. tests/datawizard/interfaces/bcsr/bcsr_opencl_kernel.cl (+29 -0)
  87. tests/datawizard/interfaces/block/block_cuda.cu (+80 -0)
  88. tests/datawizard/interfaces/block/block_interface.c (+163 -0)
  89. tests/datawizard/interfaces/block/block_opencl.c (+120 -0)
  90. tests/datawizard/interfaces/block/block_opencl_kernel.cl (+46 -0)
  91. tests/datawizard/interfaces/copy_interfaces.c (+106 -0)
  92. tests/datawizard/interfaces/csr/csr_cuda.cu (+68 -0)
  93. tests/datawizard/interfaces/csr/csr_interface.c (+170 -0)
  94. tests/datawizard/interfaces/csr/csr_opencl.c (+130 -0)
  95. tests/datawizard/interfaces/csr/csr_opencl_kernel.cl (+29 -0)
  96. tests/datawizard/interfaces/matrix/matrix_cuda.cu (+71 -0)
  97. tests/datawizard/interfaces/matrix/matrix_interface.c (+145 -0)
  98. tests/datawizard/interfaces/matrix/matrix_opencl.c (+129 -0)
  99. tests/datawizard/interfaces/matrix/matrix_opencl_kernel.cl (+31 -0)
  100. tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets.c (+0 -0)

+ 63 - 1
.gitignore

@@ -3,7 +3,6 @@
 /config.status
 /autom4te.cache
 /libtool
-/libstarpu.pc
 /aclocal.m4
 /build-aux
 /GPATH
@@ -186,3 +185,66 @@ starpu.log
 /gcc-plugin/tests/lib-user
 /gcc-plugin/examples/matrix-mult
 /gcc-plugin/src/c-expr.c
+/gcc-plugin/tests/heap-allocated
+/gcc-plugin/tests/output-pointer
+/gcc-plugin/examples/vector_scal/vector_scal
+/doc/starpu.info-1
+/doc/starpu.info-2
+/examples/axpy/axpy
+/examples/basic_examples/mult_impl
+/examples/basic_examples/multiformat
+/examples/cg/cg
+/examples/cholesky/cholesky_grain_tag
+/examples/cholesky/cholesky_implicit
+/examples/cholesky/cholesky_tag
+/examples/cholesky/cholesky_tile_tag
+/examples/cpp/incrementer_cpp
+/examples/filters/custom_mf/custom_mf_filter
+/examples/filters/multiformat/multiformat_filter
+/examples/heat/heat
+/examples/lu/lu_example_double
+/examples/lu/lu_example_float
+/examples/lu/lu_implicit_example_double
+/examples/lu/lu_implicit_example_float
+/examples/mult/dgemm
+/examples/mult/sgemm
+/mpi/starpumpi-1.0.pc
+/socl/socl-1.0.pc
+/starpufft/starpufft-1.0.pc
+/tests/core/deprecated
+/tests/core/deprecated_buffer
+/tests/core/deprecated_func
+/tests/core/multiformat_data_release
+/tests/core/multiformat_handle_conversion
+/tests/core/starpu_init
+/tests/core/starpu_task_bundle
+/tests/core/starpu_worker_exists
+/tests/datawizard/copy
+/tests/datawizard/double_parameter
+/tests/datawizard/gpu_register
+/tests/datawizard/in_place_partition
+/tests/datawizard/increment_redux_lazy
+/tests/datawizard/interfaces/bcsr/bcsr_interface
+/tests/datawizard/interfaces/block/block_interface
+/tests/datawizard/interfaces/csr/csr_interface
+/tests/datawizard/interfaces/matrix/matrix_interface
+/tests/datawizard/interfaces/multiformat/advanced/multiformat_cuda_opencl
+/tests/datawizard/interfaces/multiformat/advanced/multiformat_data_release
+/tests/datawizard/interfaces/multiformat/advanced/multiformat_handle_conversion
+/tests/datawizard/interfaces/multiformat/advanced/multiformat_worker
+/tests/datawizard/interfaces/multiformat/advanced/same_handle
+/tests/datawizard/interfaces/multiformat/multiformat_interface
+/tests/datawizard/interfaces/test_interfaces
+/tests/datawizard/interfaces/test_vector_interface
+/tests/datawizard/interfaces/variable/variable_interface
+/tests/datawizard/interfaces/vector/test_vector_interface
+/tests/datawizard/interfaces/void/void_interface
+/tests/datawizard/partition_lazy
+/tests/loader
+/tests/starpu_machine_display
+/tools/starpu_calibrate_bus.1
+/tools/starpu_machine_display.1
+/tools/starpu_perfmodel_display.1
+/tools/starpu_perfmodel_plot.1
+/starpu-1.0.pc
+/gcc-plugin/examples/cholesky/cholesky

+ 2 - 0
AUTHORS

@@ -12,3 +12,5 @@ Jean-Marie Couteyen <jm.couteyen@gmail.com>
 Anthony Roy <theanthony33@gmail.com>
 David Gómez <david_gomez1380@yahoo.com.mx>
 Nguyen Quôc Dinh <nguyen.quocdinh@gmail.com>
+Antoine Lucas <antoine.lucas.33@gmail.com>
+Pierre André Wacrenier <wacrenier@labri.fr>

+ 85 - 1
ChangeLog

@@ -1,3 +1,87 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+StarPU 1.0 (svn revision xxxx)
+==============================================
+The extensions-again release
+
+  * struct starpu_data_interface_ops --- operations on a data
+        interface --- define a new function pointer allocate_new_data
+        which creates a new data interface of the given type based on
+        an existing handle
+  * Make environment variables take precedence over the configuration
+        passed to starpu_init()
+  * Add man pages for some of the tools
+  * Add reduction mode to starpu_mpi_insert_task
+  * Add C++ application example in examples/cpp/
+  * Increase default value for STARPU_MAXCPUS -- Maximum number of
+        CPUs supported -- to 64.
+  * Libtool interface versioning has been included in libraries names
+        (libstarpu-1.0.so, libstarpumpi-1.0.so,
+        libstarpufft-1.0.so, libsocl-1.0.so)
+  * Enable by default the SOCL extension.
+  * Enable by default the GCC plug-in extension.
+  * Add a field named magic to struct starpu_task which is set when
+        initialising the task. starpu_task_submit will fail if the
+        field does not have the right value. This will hence avoid
+        submitting tasks which have not been properly initialised.
+  * Make where field for struct starpu_codelet optional. When unset, its
+	value will be automatically set based on the availability of the
+	different XXX_funcs fields of the codelet.
+  * Add a hook function pre_exec_hook in struct starpu_sched_policy.
+        The function is meant to be called in drivers. Schedulers
+        can use it to be notified when a task is about being computed.
+  * Define access modes for data handles into starpu_codelet and no longer
+	in starpu_task. Hence mark (struct starpu_task).buffers as
+	deprecated, and add (struct starpu_task).handles and (struct
+	starpu_codelet).modes
+  * Install headers under $includedir/starpu/1.0.
+  * Deprecate cost_model, and introduce cost_function, which is provided
+	with the whole task structure, the target arch and implementation
+	number
+  * Permit the application to provide its own size base for performance
+	models
+  * Fields xxx_func of struct starpu_codelet are made deprecated. One
+	should use instead fields xxx_funcs.
+  * Applications can provide several implementations of a codelet for the
+	same architecture.
+  * A new multi-format interface permits to use different binary formats
+	on CPUs & GPUs, the conversion functions being provided by the
+	application and called by StarPU as needed (and as less as
+	possible).
+  * Add a gcc plugin to extend the C interface with pragmas which allows to
+	easily define codelets and issue tasks.
+  * Add codelet execution time statistics plot.
+  * Add bus speed in starpu_machine_display.
+  * Add a StarPU-Top feedback and steering interface.
+  * Documentation improvement.
+  * Add a STARPU_DATA_ACQUIRE_CB which permits to inline the code to be
+	done.
+  * Permit to specify MPI tags for more efficient starpu_mpi_insert_task
+  * Add SOCL, an OpenCL interface on top of StarPU.
+  * Add gdb functions.
+  * Add complex support to LU example.
+  * Add an OpenMP fork-join example.
+  * Permit to use the same data several times in write mode in the
+	parameters of the same task.
+  * Some types were renamed for consistency. The tools/dev/rename.sh
+	script can be used to port code using former names. You can also
+	choose to include starpu_deprecated_api.h (after starpu.h) to keep
+	using the old types.
+
 StarPU 0.9 (svn revision 3721)
 ==============================================
 The extensions release
@@ -58,7 +142,7 @@ The asynchronous heterogeneous multi-accelerator release
     - Implement starpu_worker_get_count
     - Implement starpu_display_codelet_stats
     - Implement starpu_data_prefetch_on_node
-    - Expose the starpu_data_set_wb_mask function
+    - Expose the starpu_data_set_wt_mask function
   * Support nvidia (heterogeneous) multi-GPU
   * Add the data request mechanism
     - All data transfers use data requests now
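Note: the ChangeLog entries above describe the 1.0 task API change (access modes declared in struct starpu_codelet, task->buffers deprecated in favour of task->handles, and the xxx_funcs arrays replacing xxx_func). The following minimal sketch is not part of this commit; it assumes the 1.0 API as described and shows what a ported codelet and task submission look like:

#include <starpu.h>

/* CPU kernel with the 1.0 prototype: the data buffers plus an optional cl_arg. */
static void scal_cpu(void *buffers[], void *cl_arg)
{
	struct starpu_vector_interface *vector = buffers[0];
	float factor = *(float *) cl_arg;
	float *val = (float *) STARPU_VECTOR_GET_PTR(vector);
	unsigned n = STARPU_VECTOR_GET_NX(vector);
	unsigned i;
	for (i = 0; i < n; i++)
		val[i] *= factor;
}

/* 1.0-style codelet: cpu_funcs[] instead of the deprecated cpu_func, and the
 * access mode declared here rather than in the task.  The where field is left
 * unset; per the ChangeLog it is deduced from the filled-in *_funcs fields. */
static struct starpu_codelet scal_cl =
{
	.cpu_funcs = { scal_cpu, NULL },
	.nbuffers = 1,
	.modes = { STARPU_RW },
};

static int submit_scal(starpu_data_handle_t vector_handle, float *factor)
{
	struct starpu_task *task = starpu_task_create();
	task->cl = &scal_cl;
	task->handles[0] = vector_handle; /* replaces the deprecated task->buffers[0].handle */
	task->cl_arg = factor;
	task->cl_arg_size = sizeof(*factor);
	return starpu_task_submit(task);
}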

+ 22 - 8
Makefile.am

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2011  Université de Bordeaux 1
-# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -34,7 +34,6 @@ if COND_OPT
 SUBDIRS += tests/opt examples/opt
 endif
 
-
 if BUILD_GCC_PLUGIN
 SUBDIRS += gcc-plugin
 endif
@@ -43,12 +42,16 @@ if BUILD_SCHED_CTX_HYPERVISOR
 SUBDIRS += sched_ctx_hypervisor
 endif
 
+if BUILD_STARPUFFT
+SUBDIRS += starpufft
+endif
+
 pkgconfigdir = $(libdir)/pkgconfig
-pkgconfig_DATA = libstarpu.pc
+pkgconfig_DATA = libstarpu.pc starpu-1.0.pc
 
-include_HEADERS = 				\
+versincludedir = $(includedir)/starpu/$(STARPU_EFFECTIVE_VERSION)
+versinclude_HEADERS = 				\
 	include/starpu.h			\
-	include/starpu_config.h			\
 	include/starpu_data_filters.h		\
 	include/starpu_data_interfaces.h	\
 	include/starpu_task.h			\
@@ -57,13 +60,19 @@ include_HEADERS = 				\
 	include/starpu_data.h			\
 	include/starpu_perfmodel.h		\
 	include/starpu_util.h			\
+	include/starpu_fxt.h			\
 	include/starpu_cuda.h			\
 	include/starpu_opencl.h			\
 	include/starpu_expert.h			\
 	include/starpu_profiling.h		\
 	include/starpu_bound.h			\
 	include/starpu_scheduler.h		\
-	include/starpu_top.h
+	include/starpu_top.h			\
+	include/starpu_deprecated_api.h         \
+	include/starpu_hash.h
+
+nodist_versinclude_HEADERS = 			\
+	include/starpu_config.h
 
 if BUILD_STARPU_TOP
 all-local:
@@ -86,6 +95,11 @@ else
 txtdir = ${docdir}
 endif
 txt_DATA = AUTHORS COPYING.LGPL README
-EXTRA_DIST = AUTHORS COPYING.LGPL README
+EXTRA_DIST = AUTHORS COPYING.LGPL README STARPU-VERSION
 
 include starpu-top/extradist
+
+showcheck:
+	for i in $(SUBDIRS) ; do \
+		make -C $$i showcheck ; \
+	done

+ 17 - 1
README

@@ -1,3 +1,19 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
 ++=================++
 || I. Introduction ||
 ++=================++
@@ -134,7 +150,7 @@ Some examples ready to run are installed into $prefix/lib/starpu/{examples,mpi}
 ++==============++
 
 To upgrade your source code from older version (there were quite a few
-renamings), use the tools/rename.sh script
+renamings), use the tools/dev/rename.sh script
 
 ++===============++
 || VIII. Contact ||

+ 43 - 159
README.dev

@@ -1,169 +1,53 @@
-Installing StarPU on windows
-----------------------------
-
-If you are building from a tarball downloaded from the website, you can skip the
-cygwin part.
-
-1. Install cygwin
-
-http://cygwin.com/install.html
-
-Make sure the following packages are available:
-- (Devel)/subversion
-- (Devel)/libtool
-- (Devel)/gcc
-- (Devel)/make
-- your favorite editor (vi, emacs, ...)
-- (Devel)/gdb
-- (Archive)/zip
-- (Devel)/pkg-config
-
-2. Install mingw
-
-http://sourceforge.net/projects/mingw/
-
-3. Install hwloc (not mandatory)
-
-http://www.open-mpi.org/projects/hwloc
-
-4. Install Microsoft Visual C++ Studio Express
-
-   http://www.microsoft.com/express/Downloads
-
-   Add in your path the following directories.
-   (adjusting where necessary for the Installation location according to VC
-    version and on 64 and 32bit Windows versions)
-
-   On cygwin, with Visual C++ 2010 e.g.;
-
-   export PATH="/cygdrive/c/Program Files (x86)/Microsoft Visual Studio 10.0/Common7/IDE":$PATH
-   export PATH="/cygdrive/c/Program Files (x86)/Microsoft Visual Studio 10.0/VC/bin":$PATH
-
-   On MingW, with Visual C++ 2010, e.g.;
-
-   export PATH="/c/Program Files (x86)/Microsoft Visual Studio 10.0/Common7/IDE":$PATH
-   export PATH="/c/Program Files (x86)/Microsoft Visual Studio 10.0/VC/bin":$PATH
-
-   Try to call <lib.exe> and <link.exe> without any option to make sure these
-   dump their help output, else no .def or .lib file will be produced.
-
-5. Install GPU Drivers (not mandatory)
-
-  5.1 Install Cuda
-
-      http://developer.nvidia.com/object/cuda_3_2_downloads.html
-
-      You need to install at least the CUDA toolkit.
-
-      libtool is not able to find the libraries automatically, you
-      need to make some copies:
-
-      copy c:\cuda\lib\cuda.lib c:\cuda\lib\libcuda.lib
-      copy c:\cuda\lib\cudart.lib c:\cuda\lib\libcudart.lib
-      copy c:\cuda\lib\cublas.lib c:\cuda\lib\libcublas.lib
-      copy c:\cuda\lib\cufft.lib c:\cuda\lib\libcufft.lib
-      copy c:\cuda\lib\OpenCL.lib c:\cuda\lib\libOpenCL.lib
-
-      (and if the version of your CUDA driver is >= 3.2)
-
-      copy c:\cuda\lib\curand.lib c:\cuda\lib\libcurand.lib
-
-      Add the CUDA bin directory in your path
-
-      export PATH=/cygdrive/c/CUDA/bin:$PATH
-
-      Since we tell nvcc to build CUDA code with gcc instead of Visual studio,
-      a fix is needed: c:\cuda\include\host_defines.h has a bogus CUDARTAPI
-      definition which makes linking fail completely. Replace the first
-      occurence of
-
-      #define CUDARTAPI
-
-      with
-
-      #ifdef _WIN32
-      #define CUDARTAPI __stdcall
-      #else
-      #define CUDARTAPI
-      #endif
-
-      While at it, you can also comment the __cdecl definition to avoid spurious
-      warnings.
-
-
-  5.2 Install OpenCL
-
-      http://developer.nvidia.com/object/opencl-download.html
-
-      You need to download the NVIDIA Drivers for your version of
-      Windows. Executing the file will extract all files in a given
-      directory. The the driver installation will start, it will fail
-      if no compatibles drivers can be found on your system.
-
-      Anyway, you should copy the *.dl_ files from the directory
-      (extraction path) in the bin directory of the CUDA installation
-      directory (the directory should be v3.2/bin/)
-
-  5.3 Install MsCompress
-
-      http://gnuwin32.sourceforge.net/packages/mscompress.htm
-
-      Go in the CUDA bin directory, uncompress .dl_ files and rename
-      them in .dll files
-
-      cp /cygdrive/c/NVIDIA/DisplayDriver/190.89/International/*.dl_ .
-      for i in *.dl_ ; do /cygdrive/c/Program\ Files/GnuWin32/bin/msexpand.exe  $i ; mv ${i%_} ${i%_}l ; done
-
-If you are building from a tarball downloaded from the website, you can skip the
-autogen.sh part.
-
-6. Start autogen.sh from cygwin
-
-   cd starpu-trunk
-   ./autogen.sh
-
-7. Start a MinGW shell
-
-   /cygdrive/c/MinGW/msys/1.0/bin/sh.exe --login -i
-
-8. Configure, make, install from MinGW
-
-   If you have a non-english version of windows, use
-
-     export LANG=C
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+Contents
+========
+
+- Developer Warnings
+- Naming Conventions
+- Coding Style
+
+Developer Warnings
+------------------
 
-   else libtool has troubles parsing the translated output of the toolchain.
+They are enabled only if the STARPU_DEVEL environment variable is
+defined to a non-empty value, when calling configure.
 
-   cd starpu-trunk
-   mkdir build
-   cd build
-   ../configure --prefix=$PWD/target --disable-default-drand48 \
-        --with-hwloc=<HWLOC installation directory> \
-        --with-cuda-dir=<CUDA installation directory> \
-        --with-cuda-lib-dir=<CUDA installation directory>/lib/Win32 \
-	--with-opencl-dir=<CUDA installation directory>
-   make
-   make install
+
 
-   Also convert a couple of files to CRLF:
+Naming Conventions
+------------------
 
-   sed -e 's/$/'$'\015'/ < README > $PWD/target/README.txt
-   sed -e 's/$/'$'\015'/ < AUTHORS > $PWD/target/AUTHORS.txt
-   sed -e 's/$/'$'\015'/ < COPYING.LGPL > $PWD/target/COPYING.LGPL.txt
+* Prefix names of public objects (types, functions, etc.) with "starpu"
 
-9. If you want your StarPU installation to be standalone, you need to
-   copy the DLL files from hwloc, Cuda, and OpenCL into the StarPU
-   installation bin directory, as well as MinGW/bin/libpthread*dll
+* Prefix names of internal objects (types, functions, etc.) with "_starpu"
 
-   cp <CUDA directory>/bin/*dll target/bin
-   cp <HWLOC directory>/bin/*dll target/bin
-   cp /cygdrive/c/MinGW/bin/libpthread*dll target/bin
+* Names for qualified types (struct, union, enum) do not end with _t, _s or similar.
+  Use _t only for typedef types, such as opaque public types, e.g
+       typedef struct _starpu_data_state* starpu_data_handle_t;
+  or
+       typedef uint64_t starpu_tag_t;
 
-   and set the StarPU bin directory in your path.
+* When a variable can only take a finite set of values, use an enum
+  type instead of defining macros for each of the values.
 
-   export PATH=<StarPU installation directory>/bin:$PATH
+
 
+Coding Style
+------------
 
-Developers warning
-------------------
-They are only enabled if the STARPU_DEVEL environment is defined to a non-empty value.
+* Curly braces always go on a new line
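
To make the conventions above concrete, here is a small illustration (not from this commit; the internal names are hypothetical):

/* Public, opaque type: "starpu" prefix, and "_t" only because it is a typedef
 * (this line is the example quoted in the conventions above). */
typedef struct _starpu_data_state *starpu_data_handle_t;

/* Internal type: "_starpu" prefix, and no _t/_s suffix on the struct tag itself. */
struct _starpu_worker_set
{
	int nworkers;
};

/* A finite set of values is expressed as an enum rather than as macros. */
enum _starpu_worker_status
{
	STATUS_SLEEPING,
	STATUS_EXECUTING
};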

+ 21 - 0
STARPU-VERSION

@@ -0,0 +1,21 @@
+# -*- sh -*-
+
+# Versioning (SONAMEs) for StarPU libraries.
+
+# Libtool interface versioning (info "(libtool) Versioning").
+LIBSTARPU_INTERFACE_CURRENT=0	# increment upon ABI change
+LIBSTARPU_INTERFACE_REVISION=0	# increment upon implementation change
+LIBSTARPU_INTERFACE_AGE=0	# set to CURRENT - PREVIOUS interface
+STARPU_EFFECTIVE_VERSION=1.0
+
+LIBSTARPUFFT_INTERFACE_CURRENT=0	# increment upon ABI change
+LIBSTARPUFFT_INTERFACE_REVISION=0	# increment upon implementation change
+LIBSTARPUFFT_INTERFACE_AGE=0		# set to CURRENT - PREVIOUS interface
+
+LIBSTARPUMPI_INTERFACE_CURRENT=0	# increment upon ABI change
+LIBSTARPUMPI_INTERFACE_REVISION=0	# increment upon implementation change
+LIBSTARPUMPI_INTERFACE_AGE=0		# set to CURRENT - PREVIOUS interface
+
+LIBSOCL_INTERFACE_CURRENT=0	# increment upon ABI change
+LIBSOCL_INTERFACE_REVISION=0	# increment upon implementation change
+LIBSOCL_INTERFACE_AGE=0		# set to CURRENT - PREVIOUS interface

+ 9 - 0
TODO

@@ -0,0 +1,9 @@
+
+Moving access modes for data handles from struct starpu_task to struct starpu_codelet
+=====================================================================================
+
+TODO list
+
+- Make struct starpu_buffer_descr private (or not, as it can still be used in tests and examples)
+
+- When cost_model is provided, but not cost_function, need to rebuild a struct starpu_buffer_descr

+ 0 - 95
acinclude.m4

@@ -1,95 +0,0 @@
-dnl Copyright (C) Free Software Foundation, Inc.
-dnl
-dnl This program is free software; you can redistribute it and/or modify
-dnl it under the terms of the GNU General Public License as published by
-dnl the Free Software Foundation; either version 2 of the License, or
-dnl (at your option) any later version.
-dnl 
-dnl This program is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-dnl GNU General Public License for more details.
-dnl 
-dnl You should have received a copy of the GNU General Public License
-dnl along with this program; if not, write to the Free Software
-dnl Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
-dnl
-dnl This test is taken from libgfortran
-
-dnl Check whether the target supports __sync_val_compare_and_swap.
-AC_DEFUN([STARPU_CHECK_SYNC_VAL_COMPARE_AND_SWAP], [
-  AC_CACHE_CHECK([whether the target supports __sync_val_compare_and_swap],
-		 ac_cv_have_sync_val_compare_and_swap, [
-  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
-			[bar = __sync_val_compare_and_swap(&foo, 0, 1);])],
-			[ac_cv_have_sync_val_compare_and_swap=yes],
-			[ac_cv_have_sync_val_compare_and_swap=no])])
-  if test $ac_cv_have_sync_val_compare_and_swap = yes; then
-    AC_DEFINE(STARPU_HAVE_SYNC_VAL_COMPARE_AND_SWAP, 1,
-	      [Define to 1 if the target supports __sync_val_compare_and_swap])
-  fi])
-
-dnl Check whether the target supports __sync_bool_compare_and_swap.
-AC_DEFUN([STARPU_CHECK_SYNC_BOOL_COMPARE_AND_SWAP], [
-  AC_CACHE_CHECK([whether the target supports __sync_bool_compare_and_swap],
-		 ac_cv_have_sync_bool_compare_and_swap, [
-  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
-			[bar = __sync_bool_compare_and_swap(&foo, 0, 1);])],
-			[ac_cv_have_sync_bool_compare_and_swap=yes],
-			[ac_cv_have_sync_bool_compare_and_swap=no])])
-  if test $ac_cv_have_sync_bool_compare_and_swap = yes; then
-    AC_DEFINE(STARPU_HAVE_SYNC_BOOL_COMPARE_AND_SWAP, 1,
-	      [Define to 1 if the target supports __sync_bool_compare_and_swap])
-  fi])
-
-dnl Check whether the target supports __sync_fetch_and_add.
-AC_DEFUN([STARPU_CHECK_SYNC_FETCH_AND_ADD], [
-  AC_CACHE_CHECK([whether the target supports __sync_fetch_and_add],
-		 ac_cv_have_sync_fetch_and_add, [
-  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
-			[bar = __sync_fetch_and_add(&foo, 1);])],
-			[ac_cv_have_sync_fetch_and_add=yes],
-			[ac_cv_have_sync_fetch_and_add=no])])
-  if test $ac_cv_have_sync_fetch_and_add = yes; then
-    AC_DEFINE(STARPU_HAVE_SYNC_FETCH_AND_ADD, 1,
-	      [Define to 1 if the target supports __sync_fetch_and_add])
-  fi])
-
-dnl Check whether the target supports __sync_fetch_and_or.
-AC_DEFUN([STARPU_CHECK_SYNC_FETCH_AND_OR], [
-  AC_CACHE_CHECK([whether the target supports __sync_fetch_and_or],
-		 ac_cv_have_sync_fetch_and_or, [
-  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
-			[bar = __sync_fetch_and_or(&foo, 1);])],
-			[ac_cv_have_sync_fetch_and_or=yes],
-			[ac_cv_have_sync_fetch_and_or=no])])
-  if test $ac_cv_have_sync_fetch_and_or = yes; then
-    AC_DEFINE(STARPU_HAVE_SYNC_FETCH_AND_OR, 1,
-	      [Define to 1 if the target supports __sync_fetch_and_or])
-  fi])
-
-dnl Check whether the target supports __sync_lock_test_and_set.
-AC_DEFUN([STARPU_CHECK_SYNC_LOCK_TEST_AND_SET], [
-  AC_CACHE_CHECK([whether the target supports __sync_lock_test_and_set],
-		 ac_cv_have_sync_lock_test_and_set, [
-  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
-			[bar = __sync_lock_test_and_set(&foo, 1);])],
-			[ac_cv_have_sync_lock_test_and_set=yes],
-			[ac_cv_have_sync_lock_test_and_set=no])])
-  if test $ac_cv_have_sync_lock_test_and_set = yes; then
-    AC_DEFINE(STARPU_HAVE_SYNC_LOCK_TEST_AND_SET, 1,
-	      [Define to 1 if the target supports __sync_lock_test_and_set])
-  fi])
-
-dnl Check whether the target supports __sync_synchronize.
-AC_DEFUN([STARPU_CHECK_SYNC_SYNCHRONIZE], [
-  AC_CACHE_CHECK([whether the target supports __sync_synchronize],
-		 ac_cv_have_sync_synchronize, [
-  AC_LINK_IFELSE([AC_LANG_PROGRAM(,
-			[__sync_synchronize();])],
-			[ac_cv_have_sync_synchronize=yes],
-			[ac_cv_have_sync_synchronize=no])])
-  if test $ac_cv_have_sync_synchronize = yes; then
-    AC_DEFINE(STARPU_HAVE_SYNC_SYNCHRONIZE, 1,
-	      [Define to 1 if the target supports __sync_synchronize])
-  fi])

+ 326 - 107
configure.ac

@@ -1,9 +1,9 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
-# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 # Copyright (C) 2011  Télécom-SudParis
-# Copyright (C) 2011  INRIA
+# Copyright (C) 2011, 2012  Institut National de Recherche en Informatique et Automatique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -16,20 +16,51 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
-AC_INIT([StarPU],0.9.2, [starpu-bugs@lists.gforge.inria.fr], starpu)
+AC_INIT([StarPU],1.0.0rc2, [starpu-devel@lists.gforge.inria.fr], starpu)
 AC_CONFIG_SRCDIR(include/starpu.h)
 AC_CONFIG_AUX_DIR([build-aux])
+
+dnl Versioning.
+
+STARPU_MAJOR_VERSION="`echo $PACKAGE_VERSION | cut -d . -f 1`"
+STARPU_MINOR_VERSION="`echo $PACKAGE_VERSION | cut -d . -f 2`"
+AC_SUBST([STARPU_MAJOR_VERSION])
+AC_SUBST([STARPU_MINOR_VERSION])
+AC_SUBST([STARPU_EFFECTIVE_VERSION])
+AC_DEFINE_UNQUOTED([STARPU_MAJOR_VERSION], [$STARPU_MAJOR_VERSION],
+  [Major version number of StarPU.])
+AC_DEFINE_UNQUOTED([STARPU_MINOR_VERSION], [$STARPU_MINOR_VERSION],
+  [Major version number of StarPU.])
+
+. "$srcdir/STARPU-VERSION"
+AC_SUBST([LIBSTARPU_INTERFACE_CURRENT])
+AC_SUBST([LIBSTARPU_INTERFACE_REVISION])
+AC_SUBST([LIBSTARPU_INTERFACE_AGE])
+AC_SUBST([LIBSTARPUMPI_INTERFACE_CURRENT])
+AC_SUBST([LIBSTARPUMPI_INTERFACE_REVISION])
+AC_SUBST([LIBSTARPUMPI_INTERFACE_AGE])
+AC_SUBST([LIBSTARPUFFT_INTERFACE_CURRENT])
+AC_SUBST([LIBSTARPUFFT_INTERFACE_REVISION])
+AC_SUBST([LIBSTARPUFFT_INTERFACE_AGE])
+AC_SUBST([LIBSOCL_INTERFACE_CURRENT])
+AC_SUBST([LIBSOCL_INTERFACE_REVISION])
+AC_SUBST([LIBSOCL_INTERFACE_AGE])
+
 AC_CANONICAL_SYSTEM
 
 dnl Automake 1.11 introduced `silent-rules' and `color-tests'.  Use them
 dnl when they're available.
 m4_ifdef([AM_SILENT_RULES],
-  [AM_INIT_AUTOMAKE([1.11 -Wall -Werror foreign silent-rules color-tests])],
+  [AM_INIT_AUTOMAKE([1.11 -Wall -Werror foreign silent-rules color-tests parallel-tests])],
   [AM_INIT_AUTOMAKE([1.10 -Wall -Werror foreign])])
 
+m4_ifdef([AM_SILENT_RULES],
+  [AM_SILENT_RULES(yes)])
+
 AC_PREREQ(2.60)
 
 AC_PROG_CC
+AC_PROG_CXX
 AC_PROG_CPP
 AC_PROG_SED
 AC_PROG_LN_S
@@ -61,13 +92,18 @@ AM_CONDITIONAL([STARPU_HAVE_WINDOWS], [test "x$starpu_windows" = "xyes"])
 # on Darwin, GCC targets i386 by default, so we don't have atomic ops
 AC_CHECK_SIZEOF([void *])
 SIZEOF_VOID_P=$ac_cv_sizeof_void_p
-if test x$SIZEOF_VOID_P = x4; then
-	case "$target" in
-	i386-*darwin*) CFLAGS+=" -march=i686 " ;;
-	esac
-fi
-
-
+case $SIZEOF_VOID_P in
+	4)
+		case "$target" in
+		i386-*darwin*) CFLAGS+=" -march=i686 " ;;
+		esac
+		STARPU_MS_LIB_ARCH=X86
+		;;
+	8)
+		STARPU_MS_LIB_ARCH=X64
+		;;
+esac
+AC_SUBST(STARPU_MS_LIB_ARCH)
 
 # This will be useful for program which use CUDA (and .cubin files) which need
 # some path to the CUDA code at runtime.
@@ -122,8 +158,14 @@ else
   AC_DEFINE([starpu_erand48_r(xsubi, buffer, result)],[do {*(result) = ((double)(rand()) / RAND_MAX);} while (0);],[erand48_r equivalent function])
 fi
 
+# Some systems do not define strerror_r
+AC_CHECK_FUNC([strerror_r], [AC_DEFINE([STARPU_HAVE_STRERROR_R], [1], [Define to 1 if the function strerro_r is available.])])
+
+# Some systems do not define unsetenv
+AC_CHECK_FUNC([unsetenv], [AC_DEFINE([STARPU_HAVE_UNSETENV], [1], [Define to 1 if the function unsetenv is available.])])
+
 # Define slow machine
-AC_ARG_ENABLE(slow-machine, [AS_HELP_STRING([--disable-slow-machine],
+AC_ARG_ENABLE(slow-machine, [AS_HELP_STRING([--enable-slow-machine],
 				   [Lower default values for the testcases run by make check])],
 				   enable_slow_machine=$enableval, enable_slow_machine=false)
 if  test x$enable_slow_machine = xyes; then
@@ -132,6 +174,8 @@ fi
 
 AC_CHECK_HEADERS([malloc.h], [AC_DEFINE([STARPU_HAVE_MALLOC_H], [1], [Define to 1 if you have the <malloc.h> header file.])])
 
+AC_CHECK_HEADERS([valgrind/valgrind.h], [AC_DEFINE([STARPU_HAVE_VALGRIND_H], [1], [Define to 1 if you have the <valgrind/valgrind.h> header file.])])
+
 # This defines HAVE_SYNC_VAL_COMPARE_AND_SWAP
 STARPU_CHECK_SYNC_VAL_COMPARE_AND_SWAP
 
@@ -198,7 +242,7 @@ AM_CONDITIONAL([STARPU_USE_SCHED_CTX_HYPERVISOR], [test "x$build_sched_ctx_hyper
 AC_MSG_CHECKING(maximum number of CPUs)
 AC_ARG_ENABLE(maxcpus, [AS_HELP_STRING([--enable-maxcpus=<number>],
 			[maximum number of CPUs])],
-			maxcpus=$enableval, maxcpus=16)
+			maxcpus=$enableval, maxcpus=64)
 AC_MSG_RESULT($maxcpus)
 AC_DEFINE_UNQUOTED(STARPU_MAXCPUS, [$maxcpus], [Maximum number of CPUs supported])
 
@@ -312,7 +356,7 @@ AC_DEFUN([STARPU_CHECK_CUDA_RUNTIME],
     __cuda_include_dir=$2
     __cuda_lib_dir=$3
 
-    if test "$__cuda_dir" != "no" ; then
+    if test "$__cuda_dir" != "no" -a "$__cuda_dir" != "" ; then
 	AC_MSG_CHECKING(whether CUDA RT is available in $__cuda_dir)
     else
 	AC_MSG_CHECKING(whether CUDA RT is available)
@@ -349,8 +393,8 @@ AC_DEFUN([STARPU_CHECK_CUDA_RUNTIME],
         if test "$have_valid_cuda" = "no" ; then
             if test "$3" = "no" -a "$__cuda_dir" != "no" ; then
                 __cuda_lib_dir="$__cuda_dir/lib64"
+		LDFLAGS="${SAVED_LDFLAGS} -L$__cuda_lib_dir"
 	        STARPU_CUDA_LDFLAGS="${SAVED_STARPU_CUDA_LDFLAGS} -L$__cuda_lib_dir"
-	        LDFLAGS="${SAVED_LDFLAGS} -L$__cuda_lib_dir"
 	        AC_HAVE_LIBRARY([cudart],[have_valid_cuda=yes],[have_valid_cuda=no])
                 unset ac_cv_lib_cudart_main
             fi
@@ -359,6 +403,7 @@ AC_DEFUN([STARPU_CHECK_CUDA_RUNTIME],
 
     if test "$have_valid_cuda" = "yes" ; then
         STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lcudart"
+	LDFLAGS="${SAVED_LDFLAGS} ${STARPU_CUDA_LDFLAGS}"
 	# we also check that CUBLAS is available
 	AC_HAVE_LIBRARY([cublas],[have_valid_cuda=yes],[have_valid_cuda=no])
         unset ac_cv_lib_cublas_main
@@ -379,7 +424,7 @@ AC_DEFUN([STARPU_CHECK_CUDA_RUNTIME],
 if test x$enable_cuda = xyes -o x$enable_cuda = xmaybe; then
     STARPU_CHECK_CUDA($cuda_dir, $cuda_lib_dir)
     if test "$have_valid_cuda" = "no" ; then
-        for f in "/usr/local/cuda" "/c/cuda" "/cygdrive/c/cuda" "/opt/cuda" "$CUDA_INC_PATH" "$CUDA_INSTALL_PATH"; do
+        for f in "/usr/local/cuda" "/c/cuda" "/cygdrive/c/cuda" "/opt/cuda" "$CUDA_INC_PATH/.." "$CUDA_INSTALL_PATH" "$CUDA_TOOLKIT"; do
             STARPU_CHECK_CUDA($f, "no")
             if test "$have_valid_cuda" = "yes" ; then
                 break
@@ -390,7 +435,7 @@ if test x$enable_cuda = xyes -o x$enable_cuda = xmaybe; then
     if test "$have_valid_cuda" = "yes" ; then
         STARPU_CHECK_CUDA_RUNTIME($cuda_dir, $cuda_include_dir, $cuda_lib_dir)
         if test "$have_valid_cuda" = "no" ; then
-            for f in "/usr/local/cuda" "/c/cuda" "/cygdrive/c/cuda" "/opt/cuda" "$CUDA_INC_PATH" "$CUDA_INSTALL_PATH"; do
+            for f in "/usr/local/cuda" "/c/cuda" "/cygdrive/c/cuda" "/opt/cuda" "$CUDA_INC_PATH/.." "$CUDA_INSTALL_PATH" "$CUDA_TOOLKIT"; do
                 STARPU_CHECK_CUDA_RUNTIME($f, "no", "no")
                 if test "$have_valid_cuda" = "yes" ; then
                     break
@@ -399,8 +444,24 @@ if test x$enable_cuda = xyes -o x$enable_cuda = xmaybe; then
         fi
     fi
 
+    # Check cuda is compatible with the C compiler
+    AC_MSG_CHECKING(whether CUDA is working)
+    if test "$have_valid_cuda" = "yes" ; then
+        SAVED_CPPFLAGS="${CPPFLAGS}"
+        CPPFLAGS="${CPPFLAGS} ${STARPU_CUDA_CPPFLAGS}"
+	AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
+		[[#include <cuda.h>]],
+		[[]]
+		),
+	    [have_valid_cuda="yes"],
+	    [have_valid_cuda="no"]
+	])
+        CPPFLAGS="${SAVED_CPPFLAGS}"
+    fi
+    AC_MSG_RESULT($have_valid_cuda)
+
     # in case CUDA was explicitely required, but is not available, this is an error
-    if test x$enable_cuda = xyes -a x$have_valid_cuda = no; then
+    if test x$enable_cuda = xyes -a x$have_valid_cuda = xno; then
 	AC_MSG_ERROR([cannot find CUDA])
     fi
     # now we enable CUDA if and only if a proper setup is available
@@ -609,21 +670,28 @@ AC_ARG_WITH(opencl-lib-dir,
 		enable_opencl=yes
 	], [opencl_lib_dir=no])
 
-if test x$enable_opencl = xyes -o x$enable_opencl = xmaybe; then
-    	STARPU_CHECK_OPENCL($opencl_dir, $opencl_include_dir, $opencl_lib_dir)
-        if test "$have_valid_opencl" = "no" ; then
-            for f in "/usr/local/cuda" "/c/cuda" "/cygdrive/c/cuda" "/opt/cuda" "$CUDA_INC_PATH" "$CUDA_INSTALL_PATH" ; do
-                if test -n $f ; then
-    	            STARPU_CHECK_OPENCL($f, "no", "no")
-                    if test "$have_valid_opencl" = "yes" ; then
-                        break
-                    fi
-                fi
-            done
-        fi
+AC_DEFUN([STARPU_LOOK_FOR_OPENCL],
+[
+    	if test "x$has_opencl_being_checked" != "xyes" ; then
+    	    STARPU_CHECK_OPENCL($opencl_dir, $opencl_include_dir, $opencl_lib_dir)
+	    if test "$have_valid_opencl" = "no" ; then
+            	for f in "/usr/local/cuda" "/c/cuda" "/cygdrive/c/cuda" "/opt/cuda" "$CUDA_INC_PATH/.." "$CUDA_INSTALL_PATH" "$CUDA_TOOLKIT"; do
+		    if test -n $f ; then
+    			STARPU_CHECK_OPENCL($f, "no", "no")
+			if test "$have_valid_opencl" = "yes" ; then
+			    break
+			fi
+		    fi
+		done
+	    fi
+	    has_opencl_being_checked=yes
+	fi
+])
 
+if test x$enable_opencl = xyes -o x$enable_opencl = xmaybe; then
+	STARPU_LOOK_FOR_OPENCL()
 	# in case OpenCL was explicitely required, but is not available, this is an error
-	if test x$enable_opencl = xyes -a x$have_valid_opencl = no; then
+	if test x$enable_opencl = xyes -a x$have_valid_opencl = xno; then
 	    AC_MSG_ERROR([cannot find OpenCL])
 	fi
 
@@ -684,7 +752,7 @@ if test x$enable_gordon = xyes -o x$enable_gordon = xmaybe; then
 	# AC_CHECK_FUNC(gordon_init, [gordon], [have_valid_gordon=no])
 
 	# in case Gordon was explicitely required, but is not available, this is an error
-	if test x$enable_gordon = xyes -a x$have_valid_gordon = no; then
+	if test x$enable_gordon = xyes -a x$have_valid_gordon = xno; then
 		AC_MSG_ERROR([cannot find Gordon])
 	fi
 
@@ -727,6 +795,7 @@ AC_MSG_RESULT($enable_debug)
 
 if test x$enable_debug = xyes; then
 	CFLAGS="$CFLAGS -O0"
+	AC_DEFINE(STARPU_SPINLOCK_CHECK, [1], [check spinlock use])
 else
 	CFLAGS="$CFLAGS -O3"
 fi
@@ -741,6 +810,14 @@ if test x$enable_fast = xyes; then
 	AC_DEFINE(STARPU_NO_ASSERT, [1], [disable assertions])
 fi
 
+AC_MSG_CHECKING(whether memory status should be displayed)
+AC_ARG_ENABLE(memory-status, [AS_HELP_STRING([--enable-memory-status],
+			     [display memory status at the end of execution])],
+			     enable_memory_status=$enableval, enable_memory_status=no)
+AC_MSG_RESULT($enable_memory_status)
+if test x$enable_memory_status = xyes; then
+        AC_DEFINE(STARPU_MEMORY_STATUS, [1], [display memory status])
+fi
 
 
 AC_MSG_CHECKING(whether debug messages should be displayed)
@@ -927,7 +1004,7 @@ AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of worker
 AC_MSG_CHECKING(maximum number of implementations)
 AC_ARG_ENABLE(maximplementations, [AS_HELP_STRING([--enable-maximplementations=<number>],
 		[maximum number of implementations])],
-		maximplementations=$enableval, maximplementations=1)
+		maximplementations=$enableval, maximplementations=4)
 AC_MSG_RESULT($maximplementations)
 AC_DEFINE_UNQUOTED(STARPU_MAXIMPLEMENTATIONS, [$maximplementations],
 		[maximum number of implementations])
@@ -1031,45 +1108,63 @@ fi
 #                                                                             #
 ###############################################################################
 
-build_starpu_top=no
-AC_PATH_PROGS([QMAKE], [qmake-qt4 qmake], [not-found])
-if test x$QMAKE != xnot-found; then
-	QMAKE_VERSION=`$QMAKE --version 2>&1 | head -n 1 | cut -d '.' -f 1 | cut -d ' ' -f 3`
-	if test $QMAKE_VERSION -ge 2 ; then
-		PKG_CHECK_EXISTS([QtGui QtNetwork QtOpenGL QtSql], [
-			QT_MAJVERSION=`$PKG_CONFIG --modversion QtGui | cut -d '.' -f 1`
-			QT_MINVERSION=`$PKG_CONFIG --modversion QtGui | cut -d '.' -f 2`
-			if test $QT_MAJVERSION -gt 4 -o \( $QT_MAJVERSION -eq 4 -a $QT_MINVERSION -ge 7 \) ; then
-				build_starpu_top=yes
-			fi
-			QWT_PRI=embed
-			AC_ARG_WITH(qwt-include-dir,
-				[AS_HELP_STRING([--with-qwt-include-dir=<path>],
-				[specify installed libqwt include path])],
-				[
-					STARPU_QWT_CPPFLAGS="-I$withval"
-					AC_SUBST(STARPU_QWT_CPPFLAGS)
-					QWT_PRI=system
-				])
-			AC_ARG_WITH(qwt-lib-dir,
-				[AS_HELP_STRING([--with-qwt-lib-dir=<path>],
-				[specify installed libqwt library path])],
-				[
-					STARPU_QWT_LDFLAGS="-L$withval"
-					QWT_PRI=system
-				])
-			AC_ARG_WITH(qwt-lib,
-				[AS_HELP_STRING([--with-qwt-lib=<path>],
-				[specify installed libqwt library name])],
-				[
-					STARPU_QWT_LDFLAGS="${STARPU_QWT_LDFLAGS} -l$withval"
-					QWT_PRI=system
-				])
-			AC_SUBST(QWT_PRI)
-		])
+AC_ARG_ENABLE([starpu-top],
+  [AS_HELP_STRING([--disable-starpu-top],
+    [build StarPU-Top])],
+  [enable_starpu_top="no"],
+  [enable_starpu_top="maybe"])
+
+# Check whether StarPU-Top can be built
+AC_MSG_CHECKING(for StarPU-Top)
+
+if test "x$enable_starpu_top" = "xmaybe" ; then
+	can_build_starpu_top=no
+	AC_PATH_PROGS([QMAKE], [qmake-qt4 qmake], [not-found])
+	if test x$QMAKE != xnot-found; then
+		QMAKE_VERSION=`$QMAKE --version 2>&1 | head -n 1 | cut -d '.' -f 1 | cut -d ' ' -f 3`
+		if test $QMAKE_VERSION -ge 2 ; then
+			PKG_CHECK_EXISTS([QtGui QtNetwork QtOpenGL QtSql], [
+				QT_MAJVERSION=`$PKG_CONFIG --modversion QtGui | cut -d '.' -f 1`
+				QT_MINVERSION=`$PKG_CONFIG --modversion QtGui | cut -d '.' -f 2`
+				if test $QT_MAJVERSION -gt 4 -o \( $QT_MAJVERSION -eq 4 -a $QT_MINVERSION -ge 7 \) ; then
+					can_build_starpu_top=yes
+				fi
+				QWT_PRI=embed
+				AC_ARG_WITH(qwt-include-dir,
+					[AS_HELP_STRING([--with-qwt-include-dir=<path>],
+					[specify installed libqwt include path])],
+					[
+						STARPU_QWT_INCLUDE="$withval"
+						AC_SUBST(STARPU_QWT_INCLUDE)
+						QWT_PRI=system
+					])
+				AC_ARG_WITH(qwt-lib-dir,
+					[AS_HELP_STRING([--with-qwt-lib-dir=<path>],
+					[specify installed libqwt library path])],
+					[
+						STARPU_QWT_LDFLAGS="-L$withval"
+						QWT_PRI=system
+					])
+				AC_ARG_WITH(qwt-lib,
+					[AS_HELP_STRING([--with-qwt-lib=<name>],
+					[specify installed libqwt library name])],
+					[
+						STARPU_QWT_LDFLAGS="${STARPU_QWT_LDFLAGS} -l$withval"
+						QWT_PRI=system
+					])
+				AC_SUBST(STARPU_QWT_LDFLAGS)
+				AC_SUBST(QWT_PRI)
+			])
+		fi
 	fi
 fi
 
+if test "x$enable_starpu_top" = "xmaybe" ; then
+  build_starpu_top=$can_build_starpu_top
+else
+  build_starpu_top=no
+fi
+
 AM_CONDITIONAL(BUILD_STARPU_TOP, test x$build_starpu_top = xyes)
 
 ###############################################################################
@@ -1088,7 +1183,7 @@ AC_DEFUN([IS_SUPPORTED_CFLAG],
 	AC_MSG_CHECKING([whether compiler support $1])
 
 	SAVED_CFLAGS="$CFLAGS"
-	CFLAGS="$1 -we10006"
+	CFLAGS="$1" # -we10006"
 
 	AC_COMPILE_IFELSE(
 		AC_LANG_PROGRAM(
@@ -1117,6 +1212,11 @@ if test "x$STARPU_DEVEL" != x; then
 	AC_DEFINE(STARPU_DEVEL, [1], [enable developer warnings])
 fi
 
+# Same value as Automake's, for use in other places.
+pkglibdir="\${libdir}/$PACKAGE"
+AC_SUBST([pkglibdir])
+
+
 ###############################################################################
 #                                                                             #
 #                               GCC extensions                                #
@@ -1127,61 +1227,114 @@ AC_ARG_ENABLE([gcc-extensions],
   [AS_HELP_STRING([--enable-gcc-extensions],
     [build the GCC plug-in that provides C language extensions (experimental)])],
   [enable_gcc_plugin="$enableval"],
-  [enable_gcc_plugin="no"])
+  [enable_gcc_plugin="maybe"])
 
-if test "x$enable_gcc_plugin" = "xyes"; then
-   STARPU_GCC_PLUGIN_SUPPORT
+if test "x$enable_gcc_plugin" = "xyes" -o "x$enable_gcc_plugin" = "xmaybe" ; then
+    STARPU_GCC_PLUGIN_SUPPORT
 
-   if test "x$ac_cv_have_gcc_plugins" != "xyes"; then
-     AC_MSG_ERROR([This compiler lacks GCC plug-in support.])
-   fi
+    if test "x$ac_cv_have_gcc_plugins" = "xno" ; then
+        if test "x$enable_gcc_plugin" = "xyes" ; then
+    	    # Since this was explicitly asked for, error out.
+            AC_MSG_ERROR([This compiler lacks GCC plug-in support.])
+	else
+	    AC_MSG_WARN([GCC plug-ins not supported; StarPU's GCC plug-in will not be built])
+        fi
+    else
+        # What GCC version are we using?
+        STARPU_GCC_VERSION
+
+        # The `.so' itself cannot be called `starpu-gcc.so' (because
+	# `-fplugin-arg-' option names and such must match the `.so'
+	# name), so use a meaningful directory name.
+	gccplugindir="\${pkglibdir}/${STARPU_EFFECTIVE_VERSION}/gcc/${STARPU_GCC_VERSION_MAJOR}.${STARPU_GCC_VERSION_MINOR}"
+	AC_SUBST([gccplugindir])
+
+	# Lines to be inserted in the `.pc' file.
+	GCC_PLUGIN_DIR_PKGCONFIG="gccplugindir=$gccplugindir"
+	GCC_PLUGIN_PKGCONFIG="gccplugin=\${gccplugindir}/starpu.so"
+	AC_SUBST([GCC_PLUGIN_DIR_PKGCONFIG])
+	AC_SUBST([GCC_PLUGIN_PKGCONFIG])
+    fi
+fi
 
-   build_gcc_plugin="yes"
 
-   # GNU Guile 1.8/2.0 is used to run the test suite.
-   AC_PATH_PROG([GUILE], [guile])
-   if test "x$GUILE" != "x"; then
-      run_gcc_plugin_test_suite="yes"
-   else
-      run_gcc_plugin_test_suite="no"
-   fi
+if test "x$ac_cv_have_gcc_plugins" = "xyes" ; then
+    build_gcc_plugin="yes"
+
+    # GNU Guile 1.8/2.0 is used to run the test suite.
+    AC_PATH_PROG([GUILE], [guile])
+    if test "x$GUILE" != "x"; then
+        if test "x$enable_cpu" = "xyes"; then
+	   run_gcc_plugin_test_suite="yes"
+	else
+	   AC_MSG_WARN([CPU back-end disabled; GCC plug-in test suite will not be run])
+	   run_gcc_plugin_test_suite="no"
+	fi
+    else
+	run_gcc_plugin_test_suite="no"
+    fi
 else
-   build_gcc_plugin="no"
-   run_gcc_plugin_test_suite="no"
+    build_gcc_plugin="no"
+    run_gcc_plugin_test_suite="no"
 fi
 
 # Bison is used to generate the C expression parser.  The generated
 # parser is part of the distribution, though.
-AC_PROG_YACC
+AM_MISSING_PROG([YACC], [bison])
 
 AM_CONDITIONAL([BUILD_GCC_PLUGIN], [test "x$build_gcc_plugin" = "xyes"])
 AM_CONDITIONAL([HAVE_GUILE], [test "x$GUILE" != "x"])
 
 ###############################################################################
 #                                                                             #
-#                               OpenCL interface                              #
+#                               SOCL interface                                #
 #                                                                             #
 ###############################################################################
 
 AC_ARG_ENABLE([socl],
   [AS_HELP_STRING([--enable-socl],
-    [build the OpenCL interface (SOCL)])],
+    [build the OpenCL interface (experimental)])],
   [enable_socl="$enableval"],
-  [enable_socl="no"])
+  [enable_socl="maybe"])
 
-if test "x$enable_socl" = "xyes"; then
-   STARPU_SOCL_SUPPORT
-   build_socl="yes"
+AC_MSG_CHECKING(for SOCL)
+
+if test "x$enable_socl" = "xyes" -o "x$enable_socl" = "xmaybe" ; then
+    if test "$have_valid_opencl" = "no" ; then
+	STARPU_LOOK_FOR_OPENCL()
+    fi
+fi
+
+# in case SOCL was explicitely required, but is not available, this is an error
+if test "x$enable_socl" = "xyes" -a "$have_valid_opencl" = "no" ; then
+    AC_MSG_ERROR([SOCL cannot be enabled without OpenCL])
+fi
+
+# now we enable SOCL if and only if a proper setup is available
+if test "x$enable_socl" = "xyes" -o "x$enable_socl" = "xmaybe" ; then
+   build_socl=$have_valid_opencl
 else
-   build_socl="no"
-   run_socl_test_suite="no"
+   build_socl=no
 fi
 
+AC_MSG_RESULT($build_socl)
 AM_CONDITIONAL([BUILD_SOCL], [test "x$build_socl" = "xyes"])
 AM_CONDITIONAL([STARPU_USE_SOCL], [test "x$build_socl" = "xyes"])
 
 ###############################################################################
 #                                                                             #
+#                                 Debugging                                   #
+#                                                                             #
+###############################################################################
+
+AC_PATH_PROG([GDB], [gdb], [not-found])
+if test "x$GDB" != "xnot-found"; then
+   AC_DEFINE_UNQUOTED([STARPU_GDB_PATH], ["$GDB"],
+     [Path to the GNU debugger.])
+fi
+
+###############################################################################
+#                                                                             #
 #                                  Examples                                   #
 #                                                                             #
 ###############################################################################
@@ -1203,10 +1356,10 @@ AC_SUBST(STARPU_OPENGL_RENDER, $enable_opengl_render)
 AC_MSG_RESULT($enable_opengl_render)
 
 AC_PATH_XTRA
-if test "x$x_includes" != "xNONE"; then
+if test "x$no_x" != "xyes"; then
 	AC_DEFINE(STARPU_HAVE_X11, [1], [enable X11])
 fi
-AM_CONDITIONAL([HAVE_X11], [test "x$x_includes" != "xNONE"])
+AM_CONDITIONAL([HAVE_X11], [test "x$no_x" != "xyes"])
 
 # In case there are BLAS kernels that are used by the example applications
 # we may specify which library to use. Note that this is not used for StarPU
@@ -1330,6 +1483,11 @@ AC_SUBST(BLAS_LIB,$blas_lib)
 have_fftw=no
 have_fftwf=no
 have_fftwl=no
+fft_support=no
+
+AC_ARG_ENABLE(starpufft, [AS_HELP_STRING([--disable-starpufft],
+			[Disable build of StarPU-FFT])],
+			enable_starpufft=$enableval,enable_starpufft=yes)
 
 PKG_CHECK_MODULES([FFTW],  [fftw3],  [
   AC_DEFINE([STARPU_HAVE_FFTW], [1], [Define to 1 if you have the libfftw3 library.])
@@ -1337,7 +1495,7 @@ PKG_CHECK_MODULES([FFTW],  [fftw3],  [
   have_fftw=yes
 ], [:])
 AM_CONDITIONAL(STARPU_HAVE_FFTW, [test x$have_fftw = xyes])
- 
+
 PKG_CHECK_MODULES([FFTWF], [fftw3f], [
   AC_DEFINE([STARPU_HAVE_FFTWF], [1], [Define to 1 if you have the libfftw3f library.])
   AC_SUBST([STARPU_HAVE_FFTWF], [1])
@@ -1352,6 +1510,11 @@ PKG_CHECK_MODULES([FFTWL], [fftw3l], [
 ], [:])
 AM_CONDITIONAL(STARPU_HAVE_FFTWL, [test x$have_fftwl = xyes])
 
+if test x$enable_starpufft = xyes -a \( \( x$enable_cpu = xyes -a x$have_fftw = xyes -a x$have_fftwf = xyes \) -o x$have_cufftdoublecomplex = xyes \); then
+   fft_support=yes
+fi
+AM_CONDITIONAL(BUILD_STARPUFFT, [test x$fft_support = xyes])
+
 ##########################################
 # hwloc                                  #
 ##########################################
@@ -1407,10 +1570,56 @@ AC_ARG_ENABLE(optional_tests, [AS_HELP_STRING([--optional-tests],
 AC_MSG_RESULT($want_optional_tests)
 AM_CONDITIONAL([COND_OPT], [test "$want_optional_tests" = yes])
 
+# Check if icc is available
+AC_CHECK_PROGS([ICC], [icc])
+
+# If cuda and icc are both available, check they are compatible
+if test "$enable_cuda" = "yes" -a "$ICC" != ""; then
+   AC_MSG_CHECKING(whether CUDA and ICC are compatible)
+   OLD_CC="$CC"
+   CC="$ICC"
+   AC_COMPILE_IFELSE(
+       AC_LANG_PROGRAM(
+	   [[#include <cuda.h>]],
+	   [[]]
+	   ),
+       AC_MSG_RESULT(yes),
+       [ICC=""
+           AC_MSG_RESULT(no)]
+   )
+   CC="$OLD_CC"
+fi
+
+# Disable ICC on windows
+if test "x$ICC" != "x" -a "$starpu_windows" = "yes" ; then
+    ICC=""
+fi
+if test "x$ICC" != "x"; then
+  AC_DEFINE(STARPU_HAVE_ICC, [], [Define this if icc is available])
+fi
+AM_CONDITIONAL([STARPU_HAVE_ICC], [test "x$ICC" != "x"])
+
+# Do not generate manpages for the tools if we do not have help2man
+AC_CHECK_PROGS([HELP2MAN], [help2man])
+# Disable on windows
+if test "$starpu_windows" = "yes" ; then
+    HELP2MAN=""
+fi
+AM_CONDITIONAL([STARPU_HAVE_HELP2MAN], [test "x$HELP2MAN" != "x"])
+
+AC_CHECK_MEMBER([struct cudaDeviceProp.pciDomainID],
+  AC_DEFINE([STARPU_HAVE_DOMAINID],[1],[Define to 1 if CUDA device properties include DomainID]),
+  , [[#include <cuda_runtime_api.h>]])
+
+AC_CHECK_MEMBER([struct cudaDeviceProp.pciBusID],
+  AC_DEFINE([STARPU_HAVE_BUSID],[1],[Define to 1 if CUDA device properties include BusID]),
+  , [[#include <cuda_runtime_api.h>]])
+
 # File configuration
 AC_CONFIG_COMMANDS([executable-scripts], [
   chmod +x tests/regression/regression.sh
   chmod +x gcc-plugin/tests/run-test
+  chmod +x tools/starpu_workers_activity
 ])
 
 AC_CONFIG_FILES(tests/regression/regression.sh tests/regression/profiles tests/regression/profiles.build.only)
@@ -1420,19 +1629,27 @@ AC_OUTPUT([
 	Makefile
 	src/Makefile
 	tools/Makefile
+	tools/starpu_workers_activity
 	socl/Makefile
 	socl/src/Makefile
+	socl/examples/Makefile
+        socl/socl-1.0.pc
 	libstarpu.pc
+	starpu-1.0.pc
+	mpi/libstarpumpi.pc
+	mpi/starpumpi-1.0.pc
+	starpufft/Makefile
+	starpufft/libstarpufft.pc
+	starpufft/starpufft-1.0.pc
 	examples/Makefile
         examples/opt/Makefile
-	examples/starpufft/Makefile
 	examples/stencil/Makefile
-	examples/socl/Makefile
 	tests/Makefile
         tests/opt/Makefile
 	doc/Makefile
 	mpi/Makefile
 	starpu-top/StarPU-Top.pro
+	starpu-top/StarPU-Top-qwt-system.pri
         gcc-plugin/Makefile
 	gcc-plugin/src/Makefile
 	gcc-plugin/tests/Makefile
@@ -1450,9 +1667,6 @@ AC_MSG_NOTICE([
 	OpenCL enabled: $enable_opencl
 	Cell   enabled: $enable_gordon
 
-	GCC plug-in: $build_gcc_plugin
-	GCC plug-in test suite: $run_gcc_plugin_test_suite
-
 	Compile-time limits
 	(change these with --enable-maxcpus, --enable-maxcudadev,
 	--enable-maxopencldev, --enable-maxbuffers)
@@ -1466,12 +1680,17 @@ AC_MSG_NOTICE([
 	GPU-GPU transfers: $have_cuda_memcpy_peer
 	Allocation cache:  $enable_allocation_cache
 
-	MPI enabled:   $use_mpi
-	SOCL enabled:  $build_socl
 	Magma enabled: $have_magma
 	BLAS library:  $blas_lib
 	hwloc:         $have_valid_hwloc
-
 	FxT trace enabled: $use_fxt
 	StarPU-Top:        $build_starpu_top
+
+	StarPU Extensions:
+	       MPI enabled:   $use_mpi
+	       MPI test suite: $running_mpi_check
+	       FFT Support: $fft_support
+	       GCC plug-in: $build_gcc_plugin
+	       GCC plug-in test suite: $run_gcc_plugin_test_suite
+	       SOCL enabled:  $build_socl
 ])
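
Among the configure changes above, --enable-maximplementations now defaults to 4, matching the ChangeLog entry that lets an application register several implementations of a codelet for the same architecture. A hedged sketch of what that looks like with the 1.0 API (not code from this commit):

#include <starpu.h>

static void kernel_ref(void *buffers[], void *cl_arg)
{
	/* reference implementation */
	(void) buffers; (void) cl_arg;
}

static void kernel_opt(void *buffers[], void *cl_arg)
{
	/* hypothetical optimised variant of the same kernel */
	(void) buffers; (void) cl_arg;
}

/* Up to STARPU_MAXIMPLEMENTATIONS variants per architecture (default raised
 * to 4 by this commit); the scheduler chooses among them, e.g. guided by
 * performance models. */
static struct starpu_codelet cl =
{
	.cpu_funcs = { kernel_ref, kernel_opt, NULL },
	.nbuffers = 1,
	.modes = { STARPU_RW },
};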

+ 2 - 2
libstarpu.pc.in

@@ -6,8 +6,8 @@ includedir=@includedir@
 Name: starpu
 Description: offers support for heterogeneous multicore architecture
 Version: @PACKAGE_VERSION@
-Cflags: -I${includedir} @STARPU_CUDA_CPPFLAGS@
-Libs: -L${libdir} -lstarpu @STARPU_CUDA_LDFLAGS@ @STARPU_OPENCL_LDFLAGS@ @STARPU_SCHED_CTX_HYPERVISOR@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@ -DSTARPU_USE_DEPRECATED_API
+Libs: -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_LDFLAGS@ @STARPU_OPENCL_LDFLAGS@ @STARPU_SCHED_CTX_HYPERVISOR@
 Libs.private: @LDFLAGS@ @LIBS@
 Requires: @HWLOC_REQUIRES@
 Requires.private: @GORDON_REQUIRES@

+ 35 - 0
starpu-1.0.pc.in

@@ -0,0 +1,35 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+pkglibdir=@pkglibdir@
+includedir=@includedir@
+
+# When the GCC plug-in is available, the following lines indicate
+# where it is installed.
+@GCC_PLUGIN_DIR_PKGCONFIG@
+@GCC_PLUGIN_PKGCONFIG@
+
+Name: starpu
+Description: offers support for heterogeneous multicore architecture
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@
+Libs: -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_LDFLAGS@ @STARPU_OPENCL_LDFLAGS@ @STARPU_SCHED_CTX_HYPERVISOR@
+Libs.private: @LDFLAGS@ @LIBS@
+Requires: @HWLOC_REQUIRES@
+Requires.private: @GORDON_REQUIRES@

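Both pkg-config files now point at the versioned include directory and the versioned library name, so a client of this layout builds with `cc hello_starpu.c $(pkg-config --cflags --libs starpu-1.0)`; the legacy `libstarpu` module additionally injects -DSTARPU_USE_DEPRECATED_API for old code. A minimal, hedged C sketch of such a client follows; the file name hello_starpu.c is illustrative, and only starpu_init(), starpu_shutdown() and STARPU_CHECK_RETURN_VALUE(), all used elsewhere in this commit, are assumed.

/* hello_starpu.c -- build with:
 *   cc hello_starpu.c $(pkg-config --cflags --libs starpu-1.0)
 */
#include <starpu.h>

int main(void)
{
	int ret = starpu_init(NULL);	/* default configuration */
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

	/* ... submit tasks here ... */

	starpu_shutdown();		/* terminate the workers */
	return 0;
}
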
+ 1 - 1
starpu-top/StarPU-Top-common.pri

@@ -28,7 +28,7 @@ SOURCES += $$SRCDIR/main.cpp \
     $$SRCDIR/aboutdialog.cpp
 HEADERS += $$SRCDIR/mainwindow.h \
 #STARPU-TOP
-    $$SRCDIR/starputoptypes.h \
+    $$SRCDIR/starpu_top_types.h \
     $$SRCDIR/widgetwindowsmanager.h \
     $$SRCDIR/configurationmanager.h \
     $$SRCDIR/communicationthread.h \

+ 0 - 2
starpu-top/StarPU-Top-qwt-system.pri

@@ -1,2 +0,0 @@
-LIBS += -lqwt-qt4
-INCLUDEPATH += /usr/include/qwt-qt4

+ 2 - 0
starpu-top/StarPU-Top-qwt-system.pri.in

@@ -0,0 +1,2 @@
+LIBS += @STARPU_QWT_LDFLAGS@
+INCLUDEPATH += @STARPU_QWT_INCLUDE@

+ 1 - 1
starpu-top/aboutdialog.ui

@@ -112,7 +112,7 @@
       <string/>
      </property>
      <property name="pixmap">
-      <pixmap resource="resources.qrc">:/images/starputop.png</pixmap>
+      <pixmap resource="resources.qrc">:/images/starpu_top.png</pixmap>
      </property>
      <property name="scaledContents">
       <bool>true</bool>

+ 3 - 3
starpu-top/communicationmanager.cpp

@@ -70,7 +70,7 @@ void CommunicationManager::initializeSession()
 {
     _dataDescriptions = new QList<DataDescription*> ();
     _paramDescriptions = new QList<ParamDescription*> ();
-    _serverDevices = new QList<StarputopDevice> ;
+    _serverDevices = new QList<starpu_top_device> ;
 
     _serverInfoMsgCount = 0;
     _state = COM_STATE_INIT;
@@ -665,7 +665,7 @@ void CommunicationManager::parseInitDevMessage(QString messageString)
         Q_ASSERT_X(ok == true, "CommunicationManager::parseInitDevMessage()",
                    "Bogus message received in INIT DEV");
 
-        StarputopDeviceType deviceType;
+        starpu_top_device_type deviceType;
 
         Q_ASSERT_X(
                 deviceTypeString.compare(
@@ -701,7 +701,7 @@ void CommunicationManager::parseInitDevMessage(QString messageString)
             deviceType = SERVERDEVICE_GORDON;
         }
 
-        StarputopDevice device;
+        starpu_top_device device;
         device.id = deviceId;
         device.type = deviceType;
         device.name = deviceNameString;

+ 3 - 3
starpu-top/communicationmanager.h

@@ -27,7 +27,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #define COMMUNICATIONMANAGER_H
 
 #include <QTcpSocket>
-#include "starputoptypes.h"
+#include "starpu_top_types.h"
 
 class CommunicationManager : public QTcpSocket
 { /* Receives protocol messages from server, parses them
@@ -54,7 +54,7 @@ private:
     qlonglong _serverTimestamp;
     QList<DataDescription*> *_dataDescriptions;
     QList<ParamDescription*> *_paramDescriptions;
-    QList<StarputopDevice> *_serverDevices;
+    QList<starpu_top_device> *_serverDevices;
     // Communication states
     CommunicationState _state;
     bool _initServerInfoCompleted;
@@ -125,7 +125,7 @@ signals:
     void serverInitCompleted(QString serverID,
                              QList<DataDescription*> *dataDescriptions,
                              QList<ParamDescription*> *paramDescriptions,
-                             QList<StarputopDevice> *serverDevices);
+                             QList<starpu_top_device> *serverDevices);
     // Notify GUI with a protocol message
     // Protocol error
     void protocolError(QString errorMessage);

+ 3 - 3
starpu-top/communicationthread.cpp

@@ -30,7 +30,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include "configurationmanager.h"
 #include "mainwindow.h"
 #include "communicationmanager.h"
-#include "starputoptypes.h"
+#include "starpu_top_types.h"
 
 const int MAX_CONNECTION_ATTEMPTS = 10;
 
@@ -103,12 +103,12 @@ void CommunicationThread::createNewCommunicationManager(void)
                      SIGNAL(serverInitCompleted(QString,
                                                 QList<DataDescription*>*,
                                                 QList<ParamDescription*>*,
-                                                QList<StarputopDevice>*)),
+                                                QList<starpu_top_device>*)),
                      _mainWindow, SLOT(initClient(
                              QString,
                              QList<DataDescription*>*,
                              QList<ParamDescription*>*,
-                             QList<StarputopDevice>*)));
+                             QList<starpu_top_device>*)));
     // Output data
     QObject::connect(_mainWindow, SIGNAL(clientLaunched()),
                      _communicationManager, SLOT(sendGoMessage()));

+ 1 - 1
starpu-top/configurationmanager.h

@@ -29,7 +29,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include <QSettings>
 
 static const QString CONFIG_FILE_DIR = ".";
-static const QString CONFIG_FILE_NAME = "starputop.cfg";
+static const QString CONFIG_FILE_NAME = "starpu_top.cfg";
 
 class ConfigurationManager
 { /* Contains and manages all the application settings

+ 1 - 1
starpu-top/dataaggregatorwidget.h

@@ -34,7 +34,7 @@ class QwtPlot;
 
 #include <QHash>
 #include <QAction>
-#include "starputoptypes.h"
+#include "starpu_top_types.h"
 #include "abstractwidgetwindow.h"
 
 class DataAggregatorWidget : public AbstractWidgetWindow

+ 1 - 1
starpu-top/datawidget.h

@@ -31,7 +31,7 @@ class WidgetWindowsManager;
 class QwtPlotCurve;
 class QwtPlot;
 
-#include "starputoptypes.h"
+#include "starpu_top_types.h"
 #include "abstractwidgetwindow.h"
 
 class DataWidget : public AbstractWidgetWindow

+ 2 - 4
starpu-top/extradist

@@ -9,9 +9,8 @@ EXTRA_DIST	+=	\
                 starpu-top/abstractwidgetwindow.cpp     \
                 starpu-top/communicationthread.h        \
                 starpu-top/configurationmanager.cpp     \
-                starpu-top/starputoptypes.h             \
+                starpu-top/starpu_top_types.h             \
                 starpu-top/mainwindow.ui                \
-                starpu-top/debug                        \
                 starpu-top/mainwindow.cpp               \
                 starpu-top/sessionsetupmanager.cpp      \
                 starpu-top/resources.qrc                \
@@ -19,7 +18,7 @@ EXTRA_DIST	+=	\
                 starpu-top/images/connect.png           \
                 starpu-top/images/debugon.png           \
                 starpu-top/images/help.png              \
-                starpu-top/images/starputop.png         \
+                starpu-top/images/starpu_top.png         \
                 starpu-top/images/widget.png            \
                 starpu-top/images/lock.png              \
                 starpu-top/images/about.png             \
@@ -45,7 +44,6 @@ EXTRA_DIST	+=	\
                 starpu-top/debugconsole.ui                      \
                 starpu-top/dataaggregatorwidget.cpp             \
                 starpu-top/datawidget.cpp                       \
-                starpu-top/release                              \
                 starpu-top/datawidget.h                         \
                 starpu-top/debugconsole.cpp                     \
                 starpu-top/ganttwidget.h                        \

+ 9 - 9
starpu-top/ganttwidget.cpp

@@ -469,7 +469,7 @@ void GanttWidget::drawFromTime(QPainter *painter, qlonglong timestamp)
         borneBefore = 0;
     }
     _tasks = _taskManager->tasks(borneBefore, _timePresent);
-    foreach(StarputopTask t, _tasks)
+    foreach(starpu_top_task t, _tasks)
     {
 	drawWorkPU(painter,t);
     }
@@ -526,7 +526,7 @@ void GanttWidget::drawIdlePU(QPainter *painter)
 }
 
 /* draw forecasted working time for each processor */
-void GanttWidget::drawPrevWorkPU(QPainter *painter, StarputopTask t)
+void GanttWidget::drawPrevWorkPU(QPainter *painter, starpu_top_task t)
 {
     int starty = HEIGHT_TIME_AXIS + MARGIN;
     int widthAllowed = size().width() - WIDTH_PROGRAM - MARGIN
@@ -623,7 +623,7 @@ void GanttWidget::drawPrevWorkPU(QPainter *painter, StarputopTask t)
 we don't have to test whether they are displayable. We just have to calculate
  which part of time is displayable.
  The task t has its begin or its end between time Before and timePresent */
-void GanttWidget::drawWorkPU(QPainter *painter, StarputopTask t)
+void GanttWidget::drawWorkPU(QPainter *painter, starpu_top_task t)
 {
     int starty = HEIGHT_TIME_AXIS + MARGIN;
     int widthAllowed = size().width() - WIDTH_PROGRAM - MARGIN
@@ -783,12 +783,12 @@ void GanttWidget::countPUs()
     _numPUs = length;
     delete _PUsByDevice;
     delete _PUsByPos;
-    _PUsByDevice = new StarputopDevice[length];
-    _PUsByPos = new StarputopDevice[length];
+    _PUsByDevice = new starpu_top_device[length];
+    _PUsByPos = new starpu_top_device[length];
     int pos = 0;
 
     /* CPUs */
-    foreach(StarputopDevice sD,*_mainWindow->serverDevices())
+    foreach(starpu_top_device sD,*_mainWindow->serverDevices())
     {
 	if(sD.type == 0)
 	{
@@ -806,7 +806,7 @@ void GanttWidget::countPUs()
     }
 
     /* GPUs */
-    foreach (StarputopDevice sD , *_mainWindow->serverDevices())
+    foreach (starpu_top_device sD , *_mainWindow->serverDevices())
     {
 	if(sD.type == 1 || sD.type == 2)
 	{
@@ -855,7 +855,7 @@ void GanttWidget::paint(QPainter *painter, QPaintEvent *event)
             }
 
             _tasks = _taskManager->tasks(borneBefore, _timePresent);
-            foreach (StarputopTask t, _tasks)
+            foreach (starpu_top_task t, _tasks)
             {
                 drawWorkPU(painter,t);
             }
@@ -863,7 +863,7 @@ void GanttWidget::paint(QPainter *painter, QPaintEvent *event)
             /* Future past */
             qlonglong borneAfter = _timePresent + _timeAfter;
             _tasks = _taskManager->prevTasks(_timePresent, borneAfter);
-            foreach		(StarputopTask t, _tasks)
+            foreach		(starpu_top_task t, _tasks)
             {
                 drawPrevWorkPU(painter,t);
             }

+ 6 - 6
starpu-top/ganttwidget.h

@@ -31,7 +31,7 @@ class TaskManager;
 
 #include <QGLWidget>
 #include <QPainter>
-#include "starputoptypes.h"
+#include "starpu_top_types.h"
 
 class GanttWidget : public QGLWidget
 {
@@ -58,9 +58,9 @@ protected:
     void drawTime(QPainter *painter);
     void drawProgram(QPainter *painter);
     void resizeGL (int width,int height);
-    void drawWorkPU(QPainter *painter, StarputopTask t);
+    void drawWorkPU(QPainter *painter, starpu_top_task t);
     void drawIdlePU(QPainter *painter);
-    void drawPrevWorkPU(QPainter *painter, StarputopTask t);
+    void drawPrevWorkPU(QPainter *painter, starpu_top_task t);
     void defaultScreen(QPainter *painter);
     void drawPresentLine(QPainter *painter);
     int computeTimeInterval(int timeTotal);
@@ -84,7 +84,7 @@ private:
     qreal _coordxPresentLine;
     int _numPUs;
     bool _wasRunning;
-    QList<StarputopTask> _tasks;
+    QList<starpu_top_task> _tasks;
     int _timeTotal;
     int _timeAfter;
     int _timeBefore;
@@ -92,8 +92,8 @@ private:
     QTimer *_timer;
     qlonglong _timePresent;
     qlonglong _timeToShow;
-    StarputopDevice *_PUsByDevice;
-    StarputopDevice *_PUsByPos;
+    starpu_top_device *_PUsByDevice;
+    starpu_top_device *_PUsByPos;
     int _numCPUs;
     int _numGPUs;
     bool _initCompleted;

starpu-top/images/starputop.png → starpu-top/images/starpu_top.png


+ 1 - 1
starpu-top/interactivewidget.h

@@ -30,7 +30,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include <QCloseEvent>
 #include <QLabel>
 #include <QHBoxLayout>
-#include "starputoptypes.h"
+#include "starpu_top_types.h"
 
 class MainWindow;
 

+ 11 - 11
starpu-top/mainwindow.cpp

@@ -61,7 +61,7 @@ MainWindow::MainWindow(QWidget *parent) :
     _dataAggregatorWidgets = new QList<QPointer<DataAggregatorWidget> > ();
     _dataDescriptions = new QList<DataDescription*> ();
     _paramDescriptions = new QList<ParamDescription*> ();
-    _serverDevices = new QList<StarputopDevice> ();
+    _serverDevices = new QList<starpu_top_device> ();
     _nbDataWidgets = _nbInteractiveWidgets = _nbDataAggregatorWidgets = 0;
 
     // Init managers
@@ -97,18 +97,18 @@ MainWindow::MainWindow(QWidget *parent) :
     QObject::connect(settingsAction, SIGNAL(triggered()), this,
                      SLOT(on_actionPreferences_triggered()));
     connectButton->addAction(settingsAction);
-    ui->menuStarputop->addAction(_actionConnect);
+    ui->menu_starpu_top->addAction(_actionConnect);
     // Action launch
     _actionLaunch = ui->mainToolBar->addAction(QIcon(":/images/play.png"),
                                                tr("Launch StarPU"));
     _actionLaunch->setIconText("Launch StarPU");
     _actionLaunch->setToolTip("Launch StarPU");
     _actionLaunch->setShortcut(QKeySequence("Ctrl+L"));
-    ui->menuStarputop->addAction(_actionLaunch);
+    ui->menu_starpu_top->addAction(_actionLaunch);
     QObject::connect(_actionLaunch, SIGNAL(triggered()), this,
                      SLOT(on_actionLaunch_StarPU_triggered()));
     ui->mainToolBar->addSeparator();
-    ui->menuStarputop->addSeparator();
+    ui->menu_starpu_top->addSeparator();
     // Action debug
     _actionDebug = ui->mainToolBar->addAction(QIcon(":/images/debugon.png"),
                                               tr("Enable debug"));
@@ -116,7 +116,7 @@ MainWindow::MainWindow(QWidget *parent) :
     _actionDebug->setToolTip("Enable debug");
     _actionDebug->setShortcut(QKeySequence("Ctrl+D"));
     _actionDebug->setCheckable(true);
-    ui->menuStarputop->addAction(_actionDebug);
+    ui->menu_starpu_top->addAction(_actionDebug);
     QObject::connect(_actionDebug, SIGNAL(toggled(bool)),
                      this, SLOT(on_actionDebug_triggered(bool)));
     // Action save session setup
@@ -125,7 +125,7 @@ MainWindow::MainWindow(QWidget *parent) :
     _actionSaveSessionSetup->setIconText("Save session setup");
     _actionSaveSessionSetup->setToolTip("Save session setup");
     _actionSaveSessionSetup->setShortcut(QKeySequence("Ctrl+S"));
-    ui->menuStarputop->addAction(_actionSaveSessionSetup);
+    ui->menu_starpu_top->addAction(_actionSaveSessionSetup);
     QObject::connect(_actionSaveSessionSetup, SIGNAL(triggered()), this,
                      SLOT(on_actionSaveSessionSetup_triggered()));
     // Action add data aggregator widget
@@ -135,13 +135,13 @@ MainWindow::MainWindow(QWidget *parent) :
     _actionAddDataAggregatorWidget->setIconText("Add data aggregator widget");
     _actionAddDataAggregatorWidget->setToolTip("Add data aggregator widget");
     _actionAddDataAggregatorWidget->setShortcut(QKeySequence("Ctrl+G"));
-    ui->menuStarputop->addAction(_actionAddDataAggregatorWidget);
+    ui->menu_starpu_top->addAction(_actionAddDataAggregatorWidget);
     QObject::connect(_actionAddDataAggregatorWidget, SIGNAL(triggered()), this,
                      SLOT(on_actionAddDataAggregatorWidget_triggered()));
     ui->mainToolBar->addSeparator();
-    ui->menuStarputop->addSeparator();
+    ui->menu_starpu_top->addSeparator();
     // Action quit
-    QAction *actionQuit = ui->menuStarputop->addAction(
+    QAction *actionQuit = ui->menu_starpu_top->addAction(
             QIcon(":/images/quit.png"), tr("Quit"));
     actionQuit->setIconText("Quit");
     actionQuit->setToolTip("Quit");
@@ -540,7 +540,7 @@ void MainWindow::synchronizeSessionTime(qlonglong serverTimestamp)
 void MainWindow::initClient(QString serverID,
                             QList<DataDescription*> *dataDescriptions,
                             QList<ParamDescription*> *paramDescriptions,
-                            QList<StarputopDevice> *serverDevices)
+                            QList<starpu_top_device> *serverDevices)
 {
     _serverID = serverID;
     _dataDescriptions = dataDescriptions;
@@ -1213,7 +1213,7 @@ ParamDescription *MainWindow::paramDescriptionFromId(int paramId)
     return 0;
 }
 
-const QList<StarputopDevice> *MainWindow::serverDevices() const
+const QList<starpu_top_device> *MainWindow::serverDevices() const
 {
     return _serverDevices;
 }

+ 4 - 4
starpu-top/mainwindow.h

@@ -49,7 +49,7 @@ class TaskManager;
 #include <QAbstractSocket>
 #include <QTime>
 #include <QSpinBox>
-#include "starputoptypes.h"
+#include "starpu_top_types.h"
 
 namespace Ui
 {
@@ -79,7 +79,7 @@ public:
     const QList<ParamDescription*> *paramDescriptions() const;
     DataDescription *dataDescriptionFromId(int dataId);
     ParamDescription *paramDescriptionFromId(int interactiveId);
-    const QList<StarputopDevice> *serverDevices() const;
+    const QList<starpu_top_device> *serverDevices() const;
     // Get different widgets metadata
     const QHash<DataWidgetType, QString> *dataWidgetNames() const;
     const QHash<DataType, QSet<DataWidgetType> >
@@ -166,7 +166,7 @@ private:
     // Different descriptions
     QList<DataDescription*> *_dataDescriptions;
     QList<ParamDescription*> *_paramDescriptions;
-    QList<StarputopDevice> *_serverDevices;
+    QList<starpu_top_device> *_serverDevices;
     int _nbDataWidgets;
     int _nbDataAggregatorWidgets;
     int _nbInteractiveWidgets;
@@ -233,7 +233,7 @@ public slots:
     void initClient(QString serverID,
                     QList<DataDescription*> *dataDescriptions,
                     QList<ParamDescription*> *paramDescriptions,
-                    QList<StarputopDevice> *serverDevices);
+                    QList<starpu_top_device> *serverDevices);
     // Connection events handlers
     void connectionSucceeded();
     void connectionAborted(QString message);

+ 3 - 3
starpu-top/mainwindow.ui

@@ -21,7 +21,7 @@
   </property>
   <property name="windowIcon">
    <iconset resource="resources.qrc">
-    <normaloff>:/images/starputop.png</normaloff>:/images/starputop.png</iconset>
+    <normaloff>:/images/starpu_top.png</normaloff>:/images/starpu_top.png</iconset>
   </property>
   <widget class="QWidget" name="centralWidget">
    <layout class="QGridLayout" name="gridLayout_2">
@@ -42,7 +42,7 @@
      <height>21</height>
     </rect>
    </property>
-   <widget class="QMenu" name="menuStarputop">
+   <widget class="QMenu" name="menu_starpu_top">
     <property name="title">
      <string>StarPU-Top</string>
     </property>
@@ -59,7 +59,7 @@
     </property>
     <addaction name="actionPreferences"/>
    </widget>
-   <addaction name="menuStarputop"/>
+   <addaction name="menu_starpu_top"/>
    <addaction name="menuDisplay"/>
    <addaction name="menuHelp"/>
   </widget>

+ 1 - 1
starpu-top/preferencesdialog.h

@@ -33,7 +33,7 @@ class SessionSetupManager;
 #include <QMetaType>
 #include <QDialog>
 #include <QComboBox>
-#include "starputoptypes.h"
+#include "starpu_top_types.h"
 
 namespace Ui
 {

+ 1 - 1
starpu-top/resources.qrc

@@ -13,7 +13,7 @@
         <file>images/add.png</file>
         <file>images/remove.png</file>
         <file>images/widget.png</file>
-        <file>images/starputop.png</file>
+        <file>images/starpu_top.png</file>
         <file>images/windows.png</file>
         <file>images/lock.png</file>
     </qresource>

+ 1 - 1
starpu-top/sessionsetupmanager.h

@@ -28,7 +28,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 
 class MainWindow;
 
-#include "starputoptypes.h"
+#include "starpu_top_types.h"
 #include <QSettings>
 
 static const QString SESSION_SETUPS_DIR = "./sessionsetups";

+ 7 - 7
starpu-top/starputoptypes.h

@@ -23,8 +23,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */
 
 
-#ifndef STARPUTOPTYPES_H
-#define STARPUTOPTYPES_H
+#ifndef STARPU_TOP_TYPES_H
+#define STARPU_TOP_TYPES_H
 
 #include <QString>
 #include <QStringList>
@@ -112,7 +112,7 @@ enum ParamType
     PARAM_TYPE_ENUM = 4,
 };
 
-enum StarputopDeviceType
+enum starpu_top_device_type
 {
     SERVERDEVICE_CPU = 0,
     SERVERDEVICE_CUDA = 1,
@@ -124,9 +124,9 @@ enum StarputopDeviceType
 typedef struct
 {
     int id;
-    StarputopDeviceType type;
+    starpu_top_device_type type;
     QString name;
-} StarputopDevice;
+} starpu_top_device;
 
 // Server tasks
 typedef struct
@@ -135,7 +135,7 @@ typedef struct
     int deviceId;
     qlonglong timestampStart;
     qlonglong timestampEnd;
-} StarputopTask;
+} starpu_top_task;
 
 // Descriptions
 typedef struct
@@ -318,4 +318,4 @@ typedef struct
     QList<int> dataIds;
 } DataAggregatorWidgetSetup;
 
-#endif // STARPUTOPTYPES_H
+#endif // STARPU_TOP_TYPES_H

+ 7 - 7
starpu-top/taskmanager.cpp

@@ -171,10 +171,10 @@ void TaskManager::addTaskEnd(int taskId, qlonglong timestampEnd)
     }
 }
 
-QList<StarputopTask> TaskManager::tasks(qlonglong timestampStart,
-                                        qlonglong timestampEnd)
+QList<starpu_top_task> TaskManager::tasks(qlonglong timestampStart,
+					  qlonglong timestampEnd)
 {
-    QList < StarputopTask > tasks;
+    QList < starpu_top_task > tasks;
 
     _selectTasksQuery.addBindValue(timestampStart);
     _selectTasksQuery.addBindValue(timestampEnd);
@@ -206,7 +206,7 @@ QList<StarputopTask> TaskManager::tasks(qlonglong timestampStart,
             qlonglong timestampEnd =
                     _selectTasksQuery.value(endField).toLongLong();
 
-            StarputopTask task;
+            starpu_top_task task;
             task.taskId = taskId;
             task.deviceId = deviceId;
             task.timestampStart = timestampStart;
@@ -220,10 +220,10 @@ QList<StarputopTask> TaskManager::tasks(qlonglong timestampStart,
     return tasks;
 }
 
-QList<StarputopTask> TaskManager::prevTasks(qlonglong timestampStart,
+QList<starpu_top_task> TaskManager::prevTasks(qlonglong timestampStart,
                                             qlonglong timestampEnd)
 {
-    QList < StarputopTask > prevTasks;
+    QList < starpu_top_task > prevTasks;
 
     _selectPrevTasksQuery.addBindValue(timestampStart);
     _selectPrevTasksQuery.addBindValue(timestampEnd);
@@ -255,7 +255,7 @@ QList<StarputopTask> TaskManager::prevTasks(qlonglong timestampStart,
             qlonglong timestampEnd =
                     _selectPrevTasksQuery.value(endField).toLongLong();
 
-            StarputopTask prevTask;
+            starpu_top_task prevTask;
             prevTask.taskId = taskId;
             prevTask.deviceId = deviceId;
             prevTask.timestampStart = timestampStart;

+ 3 - 3
starpu-top/taskmanager.h

@@ -26,7 +26,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #ifndef TASKMANAGER_H
 #define TASKMANAGER_H
 
-#include "starputoptypes.h"
+#include "starpu_top_types.h"
 #include <QDebug>
 #include <QtSql/QSqlDatabase>
 #include <QtSql/QSqlQuery>
@@ -46,9 +46,9 @@ public:
     void addTaskStart(int taskId, int deviceId, qlonglong timestampStart);
     void addTaskEnd(int taskId, qlonglong timestampEnd);
     // Getters
-    QList<StarputopTask> tasks(qlonglong timestampStart,
+    QList<starpu_top_task> tasks(qlonglong timestampStart,
                                qlonglong timestampEnd);
-    QList<StarputopTask> prevTasks(qlonglong timestampStart,
+    QList<starpu_top_task> prevTasks(qlonglong timestampStart,
                                    qlonglong timestampEnd);
 
 private:

+ 1 - 0
starpufft/.gitignore

@@ -0,0 +1 @@
+/.deps

+ 97 - 0
starpufft/Makefile.am

@@ -0,0 +1,97 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
+
+lib_LTLIBRARIES = libstarpufft-@STARPU_EFFECTIVE_VERSION@.la
+
+EXTRA_DIST =			\
+	float.h			\
+	double.h		\
+	cudax_kernels.h		\
+	starpufftx.c		\
+	starpufftx1d.c		\
+	starpufftx2d.c		\
+	cuda_kernels.cu		\
+	cudaf_kernels.cu	\
+	cudax_kernels.cu	\
+	examples/testx.c	\
+	examples/testx_threads.c\
+	examples/testf_threads.c\
+	examples/test_threads.c
+
+versincludedir = $(includedir)/starpu/$(STARPU_EFFECTIVE_VERSION)
+versinclude_HEADERS = 				\
+	starpufft.h
+
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = libstarpufft.pc starpufft-1.0.pc
+
+libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = starpufft.c starpufftf.c starpufft_common.c
+libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la $(FFTW_LIBS) $(FFTWF_LIBS) $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUFFT_LDFLAGS)
+libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_CFLAGS = $(FFTWF_CFLAGS)
+libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) -no-undefined					\
+  -version-info $(LIBSTARPUFFT_INTERFACE_CURRENT):$(LIBSTARPUFFT_INTERFACE_REVISION):$(LIBSTARPUFFT_INTERFACE_AGE)
+
+if STARPU_USE_CUDA
+NVCCFLAGS += -Xcompiler -fPIC -Xlinker -fPIC
+
+cudaf_kernels.o: cudaf_kernels.cu
+	$(NVCC) $(AM_CPPFLAGS) $< -c -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS) -I${includedir}
+
+libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += cudaf_kernels.cu
+am_libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_OBJECTS = cudaf_kernels.o starpufft.lo starpufftf.lo starpufft_common.lo
+
+if STARPU_HAVE_CUFFTDOUBLECOMPLEX
+cuda_kernels.o: cuda_kernels.cu
+	$(NVCC) $(AM_CPPFLAGS) $< -c -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS) -I${includedir} -arch sm_13
+
+libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += cuda_kernels.cu
+am_libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_OBJECTS += cuda_kernels.o
+endif
+
+libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_LIBADD +=  $(STARPU_CUDA_LDFLAGS)
+endif
+
+examplebindir = $(libdir)/starpu/examples/starpufft
+examplebin_PROGRAMS =				\
+	examples/testf \
+	examples/test
+
+check_PROGRAMS = examples/testf
+examples_testf_LDADD = libstarpufft-@STARPU_EFFECTIVE_VERSION@.la $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la $(FFTWF_LIBS)
+
+# If we don't have CUDA, we assume that fftw is available in double
+# precision anyway; we just want to make sure that if CUFFT is used, it
+# also supports double precision.
+if !STARPU_USE_CUDA
+check_PROGRAMS += examples/test
+else
+if STARPU_HAVE_CUFFTDOUBLECOMPLEX
+check_PROGRAMS += examples/test
+endif
+endif
+examples_test_LDADD = libstarpufft-@STARPU_EFFECTIVE_VERSION@.la $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la $(FFTW_LIBS)
+
+TESTS = $(check_PROGRAMS)
+
+
+#check_PROGRAMS += examples/test_threads examples/testf_threads
+#examples_test_threads_LDADD = libstarpufft-@STARPU_EFFECTIVE_VERSION@.la $(top_builddir)/src/libstarpu.la -lfftw3_threads
+#examples_testf_threads_LDADD = libstarpufft-@STARPU_EFFECTIVE_VERSION@.la $(top_builddir)/src/libstarpu.la -lfftw3f_threads
+
+showcheck:
+	-cat $(TEST_LOGS) /dev/null

+ 19 - 0
starpufft/cuda_kernels.cu

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "double.h"
+#include "cudax_kernels.cu"

+ 19 - 0
starpufft/cudaf_kernels.cu

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "cudax_kernels.cu"

+ 156 - 0
starpufft/cudax_kernels.cu

@@ -0,0 +1,156 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#define _externC extern "C"
+#include "cudax_kernels.h"
+
+/* Note: these assume that the sizes are powers of two */
+
+#define VARS_1d \
+	unsigned start = threadIdx.x + blockIdx.x * blockDim.x; \
+	unsigned numthreads = blockDim.x * gridDim.x;
+
+#define DISTRIB_1d(n, func,args) \
+	unsigned threads_per_block = 128; \
+\
+	if (n < threads_per_block) \
+	{			   \
+		dim3 dimGrid(n); \
+		func <<<dimGrid, 1, 0, starpu_cuda_get_local_stream()>>> args; \
+	} 					\
+	else 					\
+	{				     \
+		dim3 dimGrid(n / threads_per_block); \
+		dim3 dimBlock(threads_per_block); \
+		func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
+	} \
+	cudaStreamSynchronize(starpu_cuda_get_local_stream()); \
+
+extern "C" __global__ void
+STARPUFFT(cuda_twist1_1d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2)
+{
+	unsigned j;
+	VARS_1d
+	unsigned end = n2;
+
+	for (j = start; j < end; j += numthreads)
+		twisted1[j] = in[i+j*n1];
+}
+
+extern "C" void
+STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2)
+{
+	DISTRIB_1d(n2, STARPUFFT(cuda_twist1_1d), (in, twisted1, i, n1, n2));
+}
+
+extern "C" __global__ void
+STARPUFFT(cuda_twiddle_1d)(_cuComplex * out, const _cuComplex * roots, unsigned n, unsigned i)
+{
+	unsigned j;
+	VARS_1d
+	unsigned end = n;
+
+	for (j = start; j < end; j += numthreads)
+		out[j] = _cuCmul(out[j], roots[i*j]);
+	return;
+}
+
+extern "C" void
+STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i)
+{
+	DISTRIB_1d(n, STARPUFFT(cuda_twiddle_1d), (out, roots, n, i));
+}
+
+#define VARS_2d \
+	unsigned startx = threadIdx.x + blockIdx.x * blockDim.x; \
+	unsigned starty = threadIdx.y + blockIdx.y * blockDim.y; \
+	unsigned numthreadsx = blockDim.x * gridDim.x; \
+	unsigned numthreadsy = blockDim.y * gridDim.y;
+
+/* FIXME: introduce threads_per_dim_n / m instead */
+#define DISTRIB_2d(n, m, func, args) \
+	unsigned threads_per_dim = 16; \
+	if (n < threads_per_dim) \
+	{				   \
+		if (m < threads_per_dim) \
+		{			    \
+			dim3 dimGrid(n, m); \
+			func <<<dimGrid, 1, 0, starpu_cuda_get_local_stream()>>> args; \
+		} \
+		else \
+		{					      \
+			dim3 dimGrid(1, m / threads_per_dim); \
+			dim3 dimBlock(n, threads_per_dim); \
+			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
+		} \
+	} \
+	else \
+	{				   \
+		if (m < threads_per_dim) \
+		{					      \
+			dim3 dimGrid(n / threads_per_dim, 1); \
+			dim3 dimBlock(threads_per_dim, m); \
+			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
+		} \
+		else \
+		{							\
+			dim3 dimGrid(n / threads_per_dim, m / threads_per_dim); \
+			dim3 dimBlock(threads_per_dim, threads_per_dim); \
+			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
+		} \
+	} \
+	cudaStreamSynchronize(starpu_cuda_get_local_stream()); \
+
+extern "C" __global__ void
+STARPUFFT(cuda_twist1_2d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2)
+{
+	unsigned k, l;
+	VARS_2d
+	unsigned endx = n2;
+	unsigned endy = m2;
+	unsigned m = m1*m2;
+
+	for (k = startx; k < endx; k += numthreadsx)
+		for (l = starty; l < endy; l += numthreadsy)
+			twisted1[k*m2+l] = in[i*m+j+k*m*n1+l*m1];
+}
+
+extern "C" void
+STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2)
+{
+	DISTRIB_2d(n2, m2, STARPUFFT(cuda_twist1_2d), (in, twisted1, i, j, n1, n2, m1, m2));
+}
+
+extern "C" __global__ void
+STARPUFFT(cuda_twiddle_2d)(_cuComplex * out, const _cuComplex * roots0, const _cuComplex * roots1, unsigned n2, unsigned m2, unsigned i, unsigned j)
+{
+	unsigned k, l;
+	VARS_2d
+	unsigned endx = n2;
+	unsigned endy = m2;
+
+	for (k = startx; k < endx ; k += numthreadsx)
+		for (l = starty; l < endy ; l += numthreadsy)
+			out[k*m2 + l] = _cuCmul(_cuCmul(out[k*m2 + l], roots0[i*k]), roots1[j*l]);
+	return;
+}
+
+extern "C" void
+STARPUFFT(cuda_twiddle_2d_host)(_cuComplex *out, const _cuComplex *roots0, const _cuComplex *roots1, unsigned n2, unsigned m2, unsigned i, unsigned j)
+{
+	DISTRIB_2d(n2, m2, STARPUFFT(cuda_twiddle_2d), (out, roots0, roots1, n2, m2, i, j));
+}

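The VARS_1d / DISTRIB_1d pair above implements a grid-stride loop: every thread starts at its global index and advances by the total number of launched threads, so the whole range is covered whatever grid shape DISTRIB_1d picked. Below is a sequential C model of the same decomposition, for illustration only; twist1_1d_model is not part of the commit, but its indexing mirrors cuda_twist1_1d exactly.

#include <complex.h>

/* Sequential model of the grid-stride loop used by cuda_twist1_1d:
 * thread `t` out of `numthreads` handles elements t, t + numthreads,
 * t + 2*numthreads, ...  Illustrative sketch only. */
static void twist1_1d_model(const double _Complex *in, double _Complex *twisted1,
			    unsigned i, unsigned n1, unsigned n2,
			    unsigned t, unsigned numthreads)
{
	unsigned j;

	for (j = t; j < n2; j += numthreads)
		twisted1[j] = in[i + j * n1];
}
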
+ 23 - 0
starpufft/cudax_kernels.h

@@ -0,0 +1,23 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <cuComplex.h>
+#include <starpu_cuda.h>
+_externC void STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2);
+_externC void STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i);
+_externC void STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2);
+_externC void STARPUFFT(cuda_twiddle_2d_host)(_cuComplex *out, const _cuComplex *roots0, const _cuComplex *roots1, unsigned n2, unsigned m2, unsigned i, unsigned j);

+ 51 - 0
starpufft/double.h

@@ -0,0 +1,51 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <complex.h>
+#include <starpu_config.h>
+
+#ifdef STARPU_HAVE_FFTW
+#include <fftw3.h>
+#endif
+
+#ifdef STARPU_USE_CUDA
+#include <cufft.h>
+#endif
+
+#undef  FLOAT
+#define DOUBLE
+
+typedef double real;
+#ifdef STARPU_HAVE_FFTW
+typedef fftw_complex _fftw_complex;
+typedef fftw_plan _fftw_plan;
+#endif
+#ifdef STARPU_USE_CUDA
+typedef cuDoubleComplex _cuComplex;
+typedef cufftDoubleComplex _cufftComplex;
+#define _cufftExecC2C cufftExecZ2Z
+#define _cufftExecR2C cufftExecD2Z
+#define _cufftExecC2R cufftExecZ2D
+#define _CUFFT_C2C CUFFT_Z2Z
+#define _CUFFT_R2C CUFFT_D2Z
+#define _CUFFT_C2R CUFFT_Z2D
+#define _cuCmul(x,y) cuCmul(x,y)
+#endif
+#define STARPUFFT(name) starpufft_##name
+#define _FFTW(name) fftw_##name
+
+#define TYPE ""

+ 19 - 0
starpufft/examples/test.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "double.h"
+#include "testx.c"

+ 19 - 0
starpufft/examples/test_threads.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "double.h"
+#include "testx_threads.c"

+ 19 - 0
starpufft/examples/testf.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "testx.c"

+ 19 - 0
starpufft/examples/testf_threads.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "testx_threads.c"

+ 283 - 0
starpufft/examples/testx.c

@@ -0,0 +1,283 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <complex.h>
+#include <math.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <sys/time.h>
+
+#include <starpu.h>
+
+#include <starpu_config.h>
+#include "starpufft.h"
+
+#undef STARPU_USE_CUDA
+
+#ifdef STARPU_HAVE_FFTW
+#include <fftw3.h>
+#endif
+#ifdef STARPU_USE_CUDA
+#include <cufft.h>
+#endif
+
+#define SIGN (-1)
+/* #define SIGN (1) */
+
+#ifdef STARPU_HAVE_FFTW
+static void check_fftw(STARPUFFT(complex) *out, STARPUFFT(complex) *out_fftw, int size)
+{
+	int i;
+	double max = 0., tot = 0., norm = 0., normdiff = 0.;
+	for (i = 0; i < size; i++)
+	{
+		double diff = cabs(out[i]-out_fftw[i]);
+		double diff2 = diff * diff;
+		double size = cabs(out_fftw[i]);
+		double size2 = size * size;
+		if (diff > max)
+			max = diff;
+		tot += diff;
+		normdiff += diff2;
+		norm += size2;
+	}
+	fprintf(stderr, "\nmaximum difference %g\n", max);
+	fprintf(stderr, "average difference %g\n", tot / size);
+	fprintf(stderr, "difference norm %g\n", sqrt(normdiff));
+	double relmaxdiff = max / sqrt(norm);
+	fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
+	double relavgdiff = (tot / size) / sqrt(norm);
+	fprintf(stderr, "relative average difference %g\n", relavgdiff);
+	if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-7 || relavgdiff > 1e-7)) {
+		fprintf(stderr, "Failure: Difference too big (TYPE f)\n");
+		exit(EXIT_FAILURE);
+	}
+	if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16))
+	{
+		fprintf(stderr, "Failure: Difference too big\n");
+		exit(EXIT_FAILURE);
+	}
+}
+#endif
+
+#ifdef STARPU_USE_CUDA
+static void check_cuda(STARPUFFT(complex) *out_cuda, STARPUFFT(complex) *out_fftw, int size)
+{
+	int i;
+	double max = 0., tot = 0., norm = 0., normdiff = 0.;
+	for (i = 0; i < size; i++)
+	{
+		double diff = cabs(out_cuda[i]-out_fftw[i]);
+		double diff2 = diff * diff;
+		double size = cabs(out_fftw[i]);
+		double size2 = size * size;
+		if (diff > max)
+			max = diff;
+		tot += diff;
+		normdiff += diff2;
+		norm += size2;
+	}
+	fprintf(stderr, "\nmaximum difference %g\n", max);
+	fprintf(stderr, "average difference %g\n", tot / size);
+	fprintf(stderr, "difference norm %g\n", sqrt(normdiff));
+	double relmaxdiff = max / sqrt(norm);
+	fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
+	double relavgdiff = (tot / size) / sqrt(norm);
+	fprintf(stderr, "relative average difference %g\n", relavgdiff);
+	if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8))
+		exit(EXIT_FAILURE);
+	if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16))
+		exit(EXIT_FAILURE);
+}
+#endif
+
+int main(int argc, char *argv[])
+{
+	int i, ret;
+	int size;
+	int n = 0, m = 0;
+	STARPUFFT(plan) plan;
+	starpu_data_handle_t in_handle, out_handle;
+#ifdef STARPU_HAVE_FFTW
+	_FFTW(plan) fftw_plan;
+#endif
+#ifdef STARPU_USE_CUDA
+	cufftHandle cuda_plan;
+	cudaError_t cures;
+#endif
+#if defined(STARPU_HAVE_FFTW) || defined(STARPU_USE_CUDA)
+	struct timeval begin, end;
+	double timing;
+	size_t bytes;
+#endif
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	if (argc == 1)
+	{
+		n = 42;
+		/* 1D */
+		size = n;
+	}
+	else if (argc == 2)
+	{
+		n = atoi(argv[1]);
+
+		/* 1D */
+		size = n;
+	}
+	else if (argc == 3)
+	{
+		n = atoi(argv[1]);
+		m = atoi(argv[2]);
+
+		/* 2D */
+		size = n * m;
+	}
+	else
+	{
+		assert(0);
+	}
+
+#if defined(STARPU_HAVE_FFTW) || defined(STARPU_USE_CUDA)
+	bytes = size * sizeof(STARPUFFT(complex));
+#endif
+
+	STARPUFFT(complex) *in = STARPUFFT(malloc)(size * sizeof(*in));
+	starpu_srand48(0);
+	for (i = 0; i < size; i++)
+		in[i] = starpu_drand48() + I * starpu_drand48();
+
+	STARPUFFT(complex) *out = STARPUFFT(malloc)(size * sizeof(*out));
+
+#ifdef STARPU_HAVE_FFTW
+	STARPUFFT(complex) *out_fftw = STARPUFFT(malloc)(size * sizeof(*out_fftw));
+#endif
+
+#ifdef STARPU_USE_CUDA
+	STARPUFFT(complex) *out_cuda = STARPUFFT(malloc)(size * sizeof(*out_cuda));
+#endif
+
+	if (argc <= 2)
+	{
+		plan = STARPUFFT(plan_dft_1d)(n, SIGN, 0);
+#ifdef STARPU_HAVE_FFTW
+		fftw_plan = _FFTW(plan_dft_1d)(n, NULL, (void*) 1, SIGN, FFTW_ESTIMATE);
+#endif
+#ifdef STARPU_USE_CUDA
+		if (cufftPlan1d(&cuda_plan, n, _CUFFT_C2C, 1) != CUFFT_SUCCESS)
+			printf("erf\n");
+#endif
+
+	}
+	else if (argc == 3)
+	{
+		plan = STARPUFFT(plan_dft_2d)(n, m, SIGN, 0);
+#ifdef STARPU_HAVE_FFTW
+		fftw_plan = _FFTW(plan_dft_2d)(n, m, NULL, (void*) 1, SIGN, FFTW_ESTIMATE);
+#endif
+#ifdef STARPU_USE_CUDA
+		STARPU_ASSERT(cufftPlan2d(&cuda_plan, n, m, _CUFFT_C2C) == CUFFT_SUCCESS);
+#endif
+	}
+	else
+	{
+		assert(0);
+	}
+
+#ifdef STARPU_HAVE_FFTW
+	gettimeofday(&begin, NULL);
+	_FFTW(execute_dft)(fftw_plan, in, out_fftw);
+	gettimeofday(&end, NULL);
+	_FFTW(destroy_plan)(fftw_plan);
+	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
+	printf("FFTW took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
+#endif
+#ifdef STARPU_USE_CUDA
+	gettimeofday(&begin, NULL);
+	if (cufftExecC2C(cuda_plan, (cufftComplex*) in, (cufftComplex*) out_cuda, CUFFT_FORWARD) != CUFFT_SUCCESS)
+		printf("erf2\n");
+	if ((cures = cudaThreadSynchronize()) != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(cures);
+	gettimeofday(&end, NULL);
+	cufftDestroy(cuda_plan);
+	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
+	printf("CUDA took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
+#endif
+
+	STARPUFFT(execute)(plan, in, out);
+	STARPUFFT(showstats)(stdout);
+
+#ifdef STARPU_HAVE_FFTW
+	check_fftw(out, out_fftw, size);
+#endif
+#ifdef STARPU_USE_CUDA
+	check_cuda(out, out_cuda, size);
+#endif
+
+#if 1
+	starpu_vector_data_register(&in_handle, 0, (uintptr_t) in, size, sizeof(*in));
+	starpu_vector_data_register(&out_handle, 0, (uintptr_t) out, size, sizeof(*out));
+
+	STARPUFFT(execute_handle)(plan, in_handle, out_handle);
+
+	starpu_data_unregister(in_handle);
+	starpu_data_unregister(out_handle);
+
+#ifdef STARPU_HAVE_FFTW
+	check_fftw(out, out_fftw, size);
+#endif
+#ifdef STARPU_USE_CUDA
+	check_cuda(out, out_cuda, size);
+#endif
+#endif
+
+	STARPUFFT(showstats)(stdout);
+	STARPUFFT(destroy_plan)(plan);
+
+	printf("\n");
+#if 0
+	for (i = 0; i < 16; i++)
+		printf("(%f,%f) ", cimag(in[i]), creal(in[i]));
+	printf("\n\n");
+	for (i = 0; i < 16; i++)
+		printf("(%f,%f) ", cimag(out[i]), creal(out[i]));
+	printf("\n\n");
+#ifdef STARPU_HAVE_FFTW
+	for (i = 0; i < 16; i++)
+		printf("(%f,%f) ", cimag(out_fftw[i]), creal(out_fftw[i]));
+	printf("\n\n");
+#endif
+#endif
+
+	STARPUFFT(free)(in);
+	STARPUFFT(free)(out);
+
+#ifdef STARPU_HAVE_FFTW
+	STARPUFFT(free)(out_fftw);
+#endif
+
+#ifdef STARPU_USE_CUDA
+	free(out_cuda);
+#endif
+
+	starpu_shutdown();
+
+	return EXIT_SUCCESS;
+}

+ 113 - 0
starpufft/examples/testx_threads.c

@@ -0,0 +1,113 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <complex.h>
+#include <math.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <sys/time.h>
+
+#include <starpu.h>
+
+#include <starpu_config.h>
+#include "starpufft.h"
+
+#include <fftw3.h>
+
+#define SIGN (-1)
+/* #define SIGN (1) */
+
+int main(int argc, char *argv[])
+{
+	int i;
+	struct timeval begin, end;
+	int size;
+	size_t bytes;
+	int n = 0, m = 0;
+	_FFTW(plan) fftw_plan;
+	double timing;
+	char *num;
+	int num_threads = 1;
+
+	_FFTW(init_threads)();
+
+	num = getenv("NUM_THREADS");
+	if (num)
+		num_threads = atoi(num);
+	_FFTW(plan_with_nthreads)(num_threads);
+
+	if (argc < 2 || argc > 3)
+	{
+		fprintf(stderr, "need one or two vector sizes\n");
+		exit(EXIT_FAILURE);
+	}
+
+	if (argc == 2)
+	{
+		n = atoi(argv[1]);
+
+		/* 1D */
+		size = n;
+	}
+	else if (argc == 3)
+	{
+		n = atoi(argv[1]);
+		m = atoi(argv[2]);
+
+		/* 2D */
+		size = n * m;
+	}
+	else
+	{
+		assert(0);
+	}
+
+	bytes = size * sizeof(_FFTW(complex));
+
+	_FFTW(complex) *in = _FFTW(malloc)(size * sizeof(*in));
+	starpu_srand48(0);
+	for (i = 0; i < size; i++)
+		in[i] = starpu_drand48() + I * starpu_drand48();
+
+	_FFTW(complex) *out_fftw = _FFTW(malloc)(size * sizeof(*out_fftw));
+
+	if (argc == 2)
+	{
+		fftw_plan = _FFTW(plan_dft_1d)(n, in, out_fftw, SIGN, FFTW_ESTIMATE);
+
+	}
+	else if (argc == 3)
+	{
+		fftw_plan = _FFTW(plan_dft_2d)(n, m, in, out_fftw, SIGN, FFTW_ESTIMATE);
+	}
+	else
+	{
+		assert(0);
+	}
+
+	gettimeofday(&begin, NULL);
+	_FFTW(execute)(fftw_plan);
+	gettimeofday(&end, NULL);
+	_FFTW(destroy_plan)(fftw_plan);
+	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
+	printf("FFTW with %d threads took %2.2f ms (%2.2f MB/s)\n\n", num_threads, timing/1000, bytes/(timing*num_threads));
+
+	printf("\n");
+
+	return EXIT_SUCCESS;
+}

+ 51 - 0
starpufft/float.h

@@ -0,0 +1,51 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <complex.h>
+#include <starpu_config.h>
+
+#ifdef STARPU_HAVE_FFTW
+#include <fftw3.h>
+#endif
+
+#ifdef STARPU_USE_CUDA
+#include <cufft.h>
+#endif
+
+#undef  DOUBLE
+#define FLOAT
+
+typedef float real;
+#ifdef STARPU_HAVE_FFTW
+typedef fftwf_complex _fftw_complex;
+typedef fftwf_plan _fftw_plan;
+#endif
+#ifdef STARPU_USE_CUDA
+typedef cuComplex _cuComplex;
+typedef cufftComplex _cufftComplex;
+#define _cufftExecC2C cufftExecC2C
+#define _cufftExecR2C cufftExecR2C
+#define _cufftExecC2R cufftExecC2R
+#define _CUFFT_C2C CUFFT_C2C
+#define _CUFFT_R2C CUFFT_R2C
+#define _CUFFT_C2R CUFFT_C2R
+#define _cuCmul(x,y) cuCmulf(x,y)
+#endif
+#define STARPUFFT(name) starpufftf_##name
+#define _FFTW(name) fftwf_##name
+
+#define TYPE "f"

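double.h and float.h are mutually exclusive configuration headers: each defines the same macro names (real, _cuComplex, _FFTW(), STARPUFFT(), TYPE) so that starpufftx.c and cudax_kernels.cu can be compiled once per precision. The declarations below (shown for illustration only, not added by the commit) spell out what the two `#define STARPUFFT(name)` lines expand to.

/* Written once, e.g. in starpufft.h or starpufftx.c: */
STARPUFFT(plan) STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags);

/* With double.h (#define STARPUFFT(name) starpufft_##name) this becomes: */
starpufft_plan starpufft_plan_dft_1d(int n, int sign, unsigned flags);

/* With float.h (#define STARPUFFT(name) starpufftf_##name) it becomes: */
starpufftf_plan starpufftf_plan_dft_1d(int n, int sign, unsigned flags);
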
+ 27 - 0
starpufft/libstarpufft.pc.in

@@ -0,0 +1,27 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: starpufft
+Description: offers support for heterogeneous multicore architecture
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@ -DSTARPU_USE_DEPRECATED_API
+Libs: -L${libdir} -lstarpufft-@STARPU_EFFECTIVE_VERSION@ 
+Libs.private: @LDFLAGS@ @LIBS@ @STARPU_CUFFT_LDFLAGS@ @FFTW_LIBS@ @FFTWF_LIBS@

+ 27 - 0
starpufft/starpufft-1.0.pc.in

@@ -0,0 +1,27 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: starpufft
+Description: offers support for heterogeneous multicore architecture
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@
+Libs: -L${libdir} -lstarpufft-@STARPU_EFFECTIVE_VERSION@ 
+Libs.private: @LDFLAGS@ @LIBS@ @STARPU_CUFFT_LDFLAGS@ @FFTW_LIBS@ @FFTWF_LIBS@

+ 19 - 0
starpufft/starpufft.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "double.h"
+#include "starpufftx.c"

+ 60 - 0
starpufft/starpufft.h

@@ -0,0 +1,60 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdio.h>
+#include <complex.h>
+#include <starpu.h>
+
+#define STARPUFFT_FORWARD -1
+#define STARPUFFT_INVERSE 1
+
+#define __STARPUFFT(name) starpufft_##name
+#define __STARPUFFTF(name) starpufftf_##name
+#define __STARPUFFTL(name) starpufftl_##name
+
+#define __STARPUFFT_INTERFACE(starpufft,real) \
+typedef real _Complex starpufft(complex); \
+\
+typedef struct starpufft(plan) *starpufft(plan); \
+\
+starpufft(plan) starpufft(plan_dft_1d)(int n, int sign, unsigned flags); \
+starpufft(plan) starpufft(plan_dft_2d)(int n, int m, int sign, unsigned flags); \
+starpufft(plan) starpufft(plan_dft_r2c_1d)(int n, unsigned flags); \
+starpufft(plan) starpufft(plan_dft_c2r_1d)(int n, unsigned flags); \
+\
+void *starpufft(malloc)(size_t n); \
+void starpufft(free)(void *p); \
+\
+void starpufft(execute)(starpufft(plan) p, void *in, void *out); \
+struct starpu_task *starpufft(start)(starpufft(plan) p, void *in, void *out); \
+\
+void starpufft(execute_handle)(starpufft(plan) p, starpu_data_handle_t in, starpu_data_handle_t out); \
+struct starpu_task *starpufft(start_handle)(starpufft(plan) p, starpu_data_handle_t in, starpu_data_handle_t out); \
+\
+void starpufft(cleanup)(starpufft(plan) p); \
+void starpufft(destroy_plan)(starpufft(plan) p); \
+\
+void starpufft(startstats)(void); \
+void starpufft(stopstats)(void); \
+void starpufft(showstats)(FILE *out);
+
+__STARPUFFT_INTERFACE(__STARPUFFT, double)
+__STARPUFFT_INTERFACE(__STARPUFFTF, float)
+__STARPUFFT_INTERFACE(__STARPUFFTL, long double)
+
+/* Internal use */
+extern int starpufft_last_plan_number;
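+
+/* A minimal usage sketch of the double-precision interface declared above
+ * (illustration only; error checking is omitted, and starpu_init() /
+ * starpu_shutdown() are assumed to wrap these calls):
+ *
+ *	int n = 1024;
+ *	starpufft_complex *in = starpufft_malloc(n * sizeof(*in));
+ *	starpufft_complex *out = starpufft_malloc(n * sizeof(*out));
+ *	starpufft_plan p = starpufft_plan_dft_1d(n, STARPUFFT_FORWARD, 0);
+ *	// ... fill in[0..n-1] ...
+ *	starpufft_execute(p, in, out);
+ *	starpufft_destroy_plan(p);
+ *	starpufft_free(in);
+ *	starpufft_free(out);
+ */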

+ 21 - 0
starpufft/starpufft_common.c

@@ -0,0 +1,21 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "starpufft.h"
+
+/* Used as an identifier in starpu tags to let plans run concurrently */
+int starpufft_last_plan_number;

+ 19 - 0
starpufft/starpufftf.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "starpufftx.c"

+ 454 - 0
starpufft/starpufftx.c

@@ -0,0 +1,454 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#define PARALLEL 0
+
+#include <math.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <sys/time.h>
+
+#include <starpu.h>
+#include <config.h>
+
+#include "starpufft.h"
+#ifdef STARPU_USE_CUDA
+#define _externC extern
+#include "cudax_kernels.h"
+
+#if defined(FLOAT) || defined(STARPU_HAVE_CUFFTDOUBLECOMPLEX)
+#  define __STARPU_USE_CUDA
+#else
+#  undef __STARPU_USE_CUDA
+#endif
+
+#endif
+
+#define _FFTW_FLAGS FFTW_ESTIMATE
+
+/* Steps for the parallel variant */
+enum steps
+{
+	SPECIAL, TWIST1, FFT1, JOIN, TWIST2, FFT2, TWIST3, END
+};
+
+#define NUMBER_BITS 5
+#define NUMBER_SHIFT (64 - NUMBER_BITS)
+#define STEP_BITS 3
+#define STEP_SHIFT (NUMBER_SHIFT - STEP_BITS)
+
+/* Tags for the steps of the parallel variant */
+#define _STEP_TAG(plan, step, i) (((starpu_tag_t) plan->number << NUMBER_SHIFT) | ((starpu_tag_t)(step) << STEP_SHIFT) | (starpu_tag_t) (i))
+
+
+#define I_BITS STEP_SHIFT
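+
+/* Resulting 64-bit tag layout (derived from the macros above, for illustration):
+ *   bits 63..59  plan->number  (NUMBER_BITS = 5)
+ *   bits 58..56  step          (STEP_BITS   = 3)
+ *   bits 55..0   index i       (I_BITS      = 56)
+ * e.g. _STEP_TAG(plan, FFT1, 3) with plan->number == 2 yields
+ * (2ULL << NUMBER_SHIFT) | ((starpu_tag_t) FFT1 << STEP_SHIFT) | 3.
+ */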
+
+enum type
+{
+	R2C,
+	C2R,
+	C2C
+};
+
+static unsigned task_per_worker[STARPU_NMAXWORKERS];
+static unsigned samples_per_worker[STARPU_NMAXWORKERS];
+static struct timeval start, submit_tasks, end;
+
+/*
+ *
+ *	The actual kernels
+ *
+ */
+
+struct STARPUFFT(plan)
+{
+	int number;	/* uniquely identifies the plan, for starpu tags */
+
+	int *n;
+	int *n1;
+	int *n2;
+	int totsize;
+	int totsize1;	/* Number of first-round tasks */
+	int totsize2;	/* Size of first-round tasks */
+	int totsize3;	/* Number of second-round tasks */
+	int totsize4;	/* Size of second-round tasks */
+	int dim;
+	enum type type;
+	int sign;
+
+	STARPUFFT(complex) *roots[2];
+	starpu_data_handle_t roots_handle[2];
+
+	/* For each worker, we need some data */
+	struct
+	{
+#ifdef STARPU_USE_CUDA
+		/* CUFFT plans */
+		cufftHandle plan1_cuda, plan2_cuda;
+		/* Sequential version */
+		cufftHandle plan_cuda;
+#endif
+#ifdef STARPU_HAVE_FFTW
+		/* FFTW plans */
+		_fftw_plan plan1_cpu, plan2_cpu;
+		/* Sequential version */
+		_fftw_plan plan_cpu;
+#endif
+	} plans[STARPU_NMAXWORKERS];
+
+	/* Buffers for codelets */
+	STARPUFFT(complex) *in, *twisted1, *fft1, *twisted2, *fft2, *out;
+
+	/* corresponding starpu DSM handles */
+	starpu_data_handle_t in_handle, *twisted1_handle, *fft1_handle, *twisted2_handle, *fft2_handle, out_handle;
+
+	/* Tasks */
+	struct starpu_task **twist1_tasks, **fft1_tasks, **twist2_tasks, **fft2_tasks, **twist3_tasks;
+	struct starpu_task *join_task, *end_task;
+
+	/* Arguments for tasks */
+	struct STARPUFFT(args) *fft1_args, *fft2_args;
+};
+
+struct STARPUFFT(args)
+{
+	struct STARPUFFT(plan) *plan;
+	int i, j, jj, kk, ll, *iv, *kkv;
+};
+
+static void
+check_dims(STARPUFFT(plan) plan)
+{
+	int dim;
+	for (dim = 0; dim < plan->dim; dim++)
+		if (plan->n[dim] & (plan->n[dim]-1))
+		{
+			fprintf(stderr,"can't cope with non-power-of-2\n");
+			STARPU_ABORT();
+		}
+}
+
+static void
+compute_roots(STARPUFFT(plan) plan)
+{
+	int dim, k;
+
+	/* Compute the n-roots and m-roots of unity for twiddling */
+	for (dim = 0; dim < plan->dim; dim++)
+	{
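+		/* 4*atan(1) == pi, so roots[dim][k] == cexp(sign * 2*pi*I * k / n[dim]),
+		 * i.e. the usual twiddle factors. */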
+		STARPUFFT(complex) exp = (plan->sign * 2. * 4.*atan(1.)) * _Complex_I / (STARPUFFT(complex)) plan->n[dim];
+		plan->roots[dim] = malloc(plan->n[dim] * sizeof(**plan->roots));
+		for (k = 0; k < plan->n[dim]; k++)
+			plan->roots[dim][k] = cexp(exp*k);
+		starpu_vector_data_register(&plan->roots_handle[dim], 0, (uintptr_t) plan->roots[dim], plan->n[dim], sizeof(**plan->roots));
+
+#ifdef STARPU_USE_CUDA
+		if (plan->n[dim] > 100000)
+		{
+			/* prefetch the big root array on GPUs */
+			unsigned worker;
+			unsigned nworkers = starpu_worker_get_count();
+			for (worker = 0; worker < nworkers; worker++)
+			{
+				unsigned node = starpu_worker_get_memory_node(worker);
+				if (starpu_worker_get_type(worker) == STARPU_CUDA_WORKER)
+					starpu_data_prefetch_on_node(plan->roots_handle[dim], node, 0);
+			}
+		}
+#endif
+	}
+}
+
+/* Only CUDA compute capability >= 1.3 supports doubles; rule old cards out.  */
+#ifdef DOUBLE
+static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl) {
+	if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
+		return 1;
+#ifdef STARPU_USE_CUDA
+	{
+		/* Cuda device */
+		const struct cudaDeviceProp *props;
+		props = starpu_cuda_get_device_properties(workerid);
+		if (props->major >= 2 || props->minor >= 3)
+			/* At least compute capability 1.3, supports doubles */
+			return 1;
+		/* Old cards do not support doubles */
+		return 0;
+	}
+#endif
+	return 0;
+}
+#define CAN_EXECUTE .can_execute = can_execute,
+#else
+#define CAN_EXECUTE
+#endif
+
+#include "starpufftx1d.c"
+#include "starpufftx2d.c"
+
+struct starpu_task *
+STARPUFFT(start)(STARPUFFT(plan) plan, void *_in, void *_out)
+{
+	struct starpu_task *task;
+	int z;
+
+	plan->in = _in;
+	plan->out = _out;
+
+	switch (plan->dim)
+	{
+		case 1:
+		{
+			switch (plan->type)
+			{
+			case C2C:
+				starpu_vector_data_register(&plan->in_handle, 0, (uintptr_t) plan->in, plan->totsize, sizeof(STARPUFFT(complex)));
+				if (!PARALLEL)
+					starpu_vector_data_register(&plan->out_handle, 0, (uintptr_t) plan->out, plan->totsize, sizeof(STARPUFFT(complex)));
+				if (PARALLEL)
+				{
+					for (z = 0; z < plan->totsize1; z++)
+						plan->twist1_tasks[z]->handles[0] = plan->in_handle;
+				}
+				task = STARPUFFT(start1dC2C)(plan, plan->in_handle, plan->out_handle);
+				break;
+			default:
+				STARPU_ABORT();
+				break;
+			}
+			break;
+		}
+		case 2:
+			starpu_vector_data_register(&plan->in_handle, 0, (uintptr_t) plan->in, plan->totsize, sizeof(STARPUFFT(complex)));
+			if (!PARALLEL)
+				starpu_vector_data_register(&plan->out_handle, 0, (uintptr_t) plan->out, plan->totsize, sizeof(STARPUFFT(complex)));
+			if (PARALLEL)
+			{
+				for (z = 0; z < plan->totsize1; z++)
+					plan->twist1_tasks[z]->handles[0] = plan->in_handle;
+			}
+			task = STARPUFFT(start2dC2C)(plan, plan->in_handle, plan->out_handle);
+			break;
+		default:
+			STARPU_ABORT();
+			break;
+	}
+	return task;
+}
+
+void
+STARPUFFT(cleanup)(STARPUFFT(plan) plan)
+{
+	if (plan->in_handle)
+		starpu_data_unregister(plan->in_handle);
+	if (!PARALLEL)
+	{
+		if (plan->out_handle)
+			starpu_data_unregister(plan->out_handle);
+	}
+}
+
+struct starpu_task *
+STARPUFFT(start_handle)(STARPUFFT(plan) plan, starpu_data_handle_t in, starpu_data_handle_t out)
+{
+	return STARPUFFT(start1dC2C)(plan, in, out);
+}
+
+void
+STARPUFFT(execute)(STARPUFFT(plan) plan, void *in, void *out)
+{
+	memset(task_per_worker, 0, sizeof(task_per_worker));
+	memset(samples_per_worker, 0, sizeof(samples_per_worker));
+
+	gettimeofday(&start, NULL);
+
+	struct starpu_task *task = STARPUFFT(start)(plan, in, out);
+	gettimeofday(&submit_tasks, NULL);
+	starpu_task_wait(task);
+
+	STARPUFFT(cleanup)(plan);
+
+	gettimeofday(&end, NULL);
+}
+
+void
+STARPUFFT(execute_handle)(STARPUFFT(plan) plan, starpu_data_handle_t in, starpu_data_handle_t out)
+{
+	struct starpu_task *task = STARPUFFT(start_handle)(plan, in, out);
+	starpu_task_wait(task);
+}
+
+/* Destroy FFTW plans, unregister and free buffers, and free tags */
+void
+STARPUFFT(destroy_plan)(STARPUFFT(plan) plan)
+{
+	int workerid, dim, i;
+
+	for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
+	{
+		switch (starpu_worker_get_type(workerid))
+		{
+		case STARPU_CPU_WORKER:
+#ifdef STARPU_HAVE_FFTW
+			if (PARALLEL)
+			{
+				_FFTW(destroy_plan)(plan->plans[workerid].plan1_cpu);
+				_FFTW(destroy_plan)(plan->plans[workerid].plan2_cpu);
+			}
+			else
+			{
+				_FFTW(destroy_plan)(plan->plans[workerid].plan_cpu);
+			}
+#endif
+			break;
+		case STARPU_CUDA_WORKER:
+#ifdef STARPU_USE_CUDA
+			/* FIXME: Can't deallocate */
+#endif
+			break;
+		default:
+			/* Do not care, we won't be executing anything there. */
+			break;
+		}
+	}
+
+	if (PARALLEL)
+	{
+		for (i = 0; i < plan->totsize1; i++)
+		{
+			starpu_data_unregister(plan->twisted1_handle[i]);
+			free(plan->twist1_tasks[i]);
+			starpu_data_unregister(plan->fft1_handle[i]);
+			free(plan->fft1_tasks[i]);
+		}
+
+		free(plan->twisted1_handle);
+		free(plan->twist1_tasks);
+		free(plan->fft1_handle);
+		free(plan->fft1_tasks);
+		free(plan->fft1_args);
+
+		free(plan->join_task);
+
+		for (i = 0; i < plan->totsize3; i++)
+		{
+			starpu_data_unregister(plan->twisted2_handle[i]);
+			free(plan->twist2_tasks[i]);
+			starpu_data_unregister(plan->fft2_handle[i]);
+			free(plan->fft2_tasks[i]);
+			free(plan->twist3_tasks[i]);
+		}
+
+		free(plan->end_task);
+
+		free(plan->twisted2_handle);
+		free(plan->twist2_tasks);
+		free(plan->fft2_handle);
+		free(plan->fft2_tasks);
+		free(plan->twist3_tasks);
+		free(plan->fft2_args);
+
+		for (dim = 0; dim < plan->dim; dim++)
+		{
+			starpu_data_unregister(plan->roots_handle[dim]);
+			free(plan->roots[dim]);
+		}
+
+		switch (plan->dim)
+		{
+		case 1:
+			STARPUFFT(free_1d_tags)(plan);
+			break;
+		case 2:
+			STARPUFFT(free_2d_tags)(plan);
+			break;
+		default:
+			STARPU_ABORT();
+			break;
+		}
+
+		free(plan->n1);
+		free(plan->n2);
+		STARPUFFT(free)(plan->twisted1);
+		STARPUFFT(free)(plan->fft1);
+		STARPUFFT(free)(plan->twisted2);
+		STARPUFFT(free)(plan->fft2);
+	}
+	free(plan->n);
+	free(plan);
+}
+
+void *
+STARPUFFT(malloc)(size_t n)
+{
+#ifdef STARPU_USE_CUDA
+	void *res;
+	starpu_malloc(&res, n);
+	return res;
+#else
+#  ifdef STARPU_HAVE_FFTW
+	return _FFTW(malloc)(n);
+#  else
+	return malloc(n);
+#  endif
+#endif
+}
+
+void
+STARPUFFT(free)(void *p)
+{
+#ifdef STARPU_USE_CUDA
+	starpu_free(p);
+#else
+#  ifdef STARPU_HAVE_FFTW
+	_FFTW(free)(p);
+#  else
+	free(p);
+#  endif
+#endif
+}
+
+void
+STARPUFFT(showstats)(FILE *out)
+{
+	int worker;
+	unsigned total;
+
+#define TIMING(begin,end) (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec))
+#define MSTIMING(begin,end) (TIMING(begin,end)/1000.)
+	double paratiming = TIMING(start,end);
+	fprintf(out, "Tasks submission took %2.2f ms\n", MSTIMING(start,submit_tasks));
+	fprintf(out, "Tasks termination took %2.2f ms\n", MSTIMING(submit_tasks,end));
+
+	fprintf(out, "Total %2.2f ms\n", MSTIMING(start,end));
+
+	for (worker = 0, total = 0; worker < starpu_worker_get_count(); worker++)
+		total += task_per_worker[worker];
+
+	for (worker = 0; worker < starpu_worker_get_count(); worker++)
+	{
+		if (task_per_worker[worker])
+		{
+			char name[32];
+			starpu_worker_get_name(worker, name, sizeof(name));
+
+			unsigned long bytes = sizeof(STARPUFFT(complex))*samples_per_worker[worker];
+
+			fprintf(out, "\t%s -> %2.2f MB\t%2.2f\tMB/s\t%u %2.2f %%\n", name, (1.0*bytes)/(1024*1024), bytes/paratiming, task_per_worker[worker], (100.0*task_per_worker[worker])/total);
+		}
+	}
+}

+ 847 - 0
starpufft/starpufftx1d.c

@@ -0,0 +1,847 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ *
+ * Dumb parallel version
+ *
+ */
+
+#define DIV_1D 64
+
+  /*
+   * Overall strategy for an fft of size n:
+   * - perform n1 ffts of size n2
+   * - twiddle
+   * - perform n2 ffts of size n1
+   *
+   * - n1 defaults to DIV_1D, thus n2 defaults to n / DIV_1D.
+   *
+   * Precise tasks:
+   *
+   * - twist1: twist the whole n-element input (called "in") into n1 chunks of
+   *           size n2, by using n1 tasks taking the whole n-element input as a
+   *           R parameter and one n2 output as a W parameter. The result is
+   *           called twisted1.
+   * - fft1:   perform n1 ffts of size n2, by using n1 tasks doing one fft
+   *           each. Also twiddle the result to prepare for the fft2. The
+   *           result is called fft1.
+   * - join:   depends on all the fft1s, to gather the n1 results of size n2 in
+   *           the fft1 vector.
+   * - twist2: twist the fft1 vector into n2 chunks of size n1, called twisted2.
+   *           Since n2 is typically very large, this step is divided into
+   *           DIV_1D tasks, each of them performing n2/DIV_1D of these twists.
+   * - fft2:   perform n2 ffts of size n1. This is divided into DIV_1D tasks of
+   *           n2/DIV_1D ffts each, to be performed in batches. The result is
+   *           called fft2.
+   * - twist3: twist back the result of the fft2s above into the output buffer.
+   *           Only implemented on CPUs for simplicity of the gathering.
+   *
+   * The tag space thus uses 3 dimensions:
+   * - the number of the plan.
+   * - the step (TWIST1, FFT1, JOIN, TWIST2, FFT2, TWIST3, END)
+   * - an index i between 0 and DIV_1D-1.
+   */
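+
+  /*
+   * Concrete illustration (numbers chosen for this comment only): for
+   * n = 2^20 with the default DIV_1D = 64, n1 = 64 and n2 = 16384. The plan
+   * thus creates 64 twist1 and 64 fft1 tasks (each fft of size 16384), one
+   * join task, and then 64 batches of twist2/fft2/twist3 tasks, each batch
+   * handling n3 = n2/DIV_1D = 256 ffts of size 64.
+   */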
+
+#define STEP_TAG_1D(plan, step, i) _STEP_TAG(plan, step, i)
+
+#ifdef __STARPU_USE_CUDA
+/* twist1:
+ *
+ * Twist the full input vector (first parameter) into one chunk of size n2
+ * (second parameter) */
+static void
+STARPUFFT(twist1_1d_kernel_gpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+
+	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
+	_cufftComplex * restrict twisted1 = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
+	
+	STARPUFFT(cuda_twist1_1d_host)(in, twisted1, i, n1, n2);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+
+/* fft1:
+ *
+ * Perform one fft of size n2 */
+static void
+STARPUFFT(fft1_1d_plan_gpu)(void *args)
+{
+	STARPUFFT(plan) plan = args;
+	int n2 = plan->n2[0];
+	int workerid = starpu_worker_get_id();
+	cufftResult cures;
+
+	cures = cufftPlan1d(&plan->plans[workerid].plan1_cuda, n2, _CUFFT_C2C, 1);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	cufftSetStream(plan->plans[workerid].plan1_cuda, starpu_cuda_get_local_stream());
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+}
+
+static void
+STARPUFFT(fft1_1d_kernel_gpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int n2 = plan->n2[0];
+	cufftResult cures;
+
+	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
+	_cufftComplex * restrict out = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
+	const _cufftComplex * restrict roots = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[2]);
+
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	cures = _cufftExecC2C(plan->plans[workerid].plan1_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+
+	STARPUFFT(cuda_twiddle_1d_host)(out, roots, n2, i);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+
+/* fft2:
+ *
+ * Perform n3 = n2/DIV_1D ffts of size n1 */
+static void
+STARPUFFT(fft2_1d_plan_gpu)(void *args)
+{
+	STARPUFFT(plan) plan = args;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int n3 = n2/DIV_1D;
+	cufftResult cures;
+	int workerid = starpu_worker_get_id();
+
+	cures = cufftPlan1d(&plan->plans[workerid].plan2_cuda, n1, _CUFFT_C2C, n3);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	cufftSetStream(plan->plans[workerid].plan2_cuda, starpu_cuda_get_local_stream());
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+}
+
+static void
+STARPUFFT(fft2_1d_kernel_gpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	cufftResult cures;
+
+	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
+	_cufftComplex * restrict out = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	/* NOTE using batch support */
+	cures = _cufftExecC2C(plan->plans[workerid].plan2_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+#endif
+
+/* twist1:
+ *
+ * Twist the full input vector (first parameter) into one chunk of size n2
+ * (second parameter) */
+static void
+STARPUFFT(twist1_1d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int j;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+
+	STARPUFFT(complex) * restrict in = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+	STARPUFFT(complex) * restrict twisted1 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	/* printf("twist1 %d %g\n", i, (double) cabs(plan->in[i])); */
+
+	for (j = 0; j < n2; j++)
+		twisted1[j] = in[i+j*n1];
+}
+
+#ifdef STARPU_HAVE_FFTW
+/* fft1:
+ *
+ * Perform one fft of size n2 */
+static void
+STARPUFFT(fft1_1d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int j;
+	int n2 = plan->n2[0];
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	STARPUFFT(complex) * restrict twisted1 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+	STARPUFFT(complex) * restrict fft1 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	/* printf("fft1 %d %g\n", i, (double) cabs(twisted1[0])); */
+
+	_FFTW(execute_dft)(plan->plans[workerid].plan1_cpu, twisted1, fft1);
+
+	/* twiddle fft1 buffer */
+	for (j = 0; j < n2; j++)
+		fft1[j] = fft1[j] * plan->roots[0][i*j];
+}
+#endif
+
+/* twist2:
+ *
+ * Twist the full vector (results of the fft1s) into one package of n2/DIV_1D
+ * chunks of size n1 */
+static void
+STARPUFFT(twist2_1d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int jj = args->jj;	/* between 0 and DIV_1D */
+	int jjj;		/* between 0 and n3 */
+	int i;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int n3 = n2/DIV_1D;
+
+	STARPUFFT(complex) * restrict twisted2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+
+	/* printf("twist2 %d %g\n", jj, (double) cabs(plan->fft1[jj])); */
+
+	for (jjj = 0; jjj < n3; jjj++) {
+		int j = jj * n3 + jjj;
+		for (i = 0; i < n1; i++)
+			twisted2[jjj*n1+i] = plan->fft1[i*n2+j];
+	}
+}
+
+#ifdef STARPU_HAVE_FFTW
+/* fft2:
+ *
+ * Perform n3 = n2/DIV_1D ffts of size n1 */
+static void
+STARPUFFT(fft2_1d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	/* int jj = args->jj; */
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	STARPUFFT(complex) * restrict twisted2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+	STARPUFFT(complex) * restrict fft2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	/* printf("fft2 %d %g\n", jj, (double) cabs(twisted2[plan->totsize4-1])); */
+
+	_FFTW(execute_dft)(plan->plans[workerid].plan2_cpu, twisted2, fft2);
+}
+#endif
+
+/* twist3:
+ *
+ * Spread the package of n2/DIV_1D chunks of size n1 into the output vector */
+static void
+STARPUFFT(twist3_1d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int jj = args->jj;	/* between 0 and DIV_1D */
+	int jjj;		/* between 0 and n3 */
+	int i;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int n3 = n2/DIV_1D;
+
+	const STARPUFFT(complex) * restrict fft2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+
+	/* printf("twist3 %d %g\n", jj, (double) cabs(fft2[0])); */
+
+	for (jjj = 0; jjj < n3; jjj++) {
+		int j = jj * n3 + jjj;
+		for (i = 0; i < n1; i++)
+			plan->out[i*n2+j] = fft2[jjj*n1+i];
+	}
+}
+
+/* Performance models for the 5 kinds of tasks */
+static struct starpu_perfmodel STARPUFFT(twist1_1d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"twist1_1d"
+};
+
+static struct starpu_perfmodel STARPUFFT(fft1_1d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"fft1_1d"
+};
+
+static struct starpu_perfmodel STARPUFFT(twist2_1d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"twist2_1d"
+};
+
+static struct starpu_perfmodel STARPUFFT(fft2_1d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"fft2_1d"
+};
+
+static struct starpu_perfmodel STARPUFFT(twist3_1d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"twist3_1d"
+};
+
+/* codelet pointers for the 5 kinds of tasks */
+static struct starpu_codelet STARPUFFT(twist1_1d_codelet) = {
+	.where =
+#ifdef __STARPU_USE_CUDA
+		STARPU_CUDA|
+#endif
+		STARPU_CPU,
+#ifdef __STARPU_USE_CUDA
+	.cuda_funcs = {STARPUFFT(twist1_1d_kernel_gpu), NULL},
+#endif
+	.cpu_funcs = {STARPUFFT(twist1_1d_kernel_cpu), NULL},
+	CAN_EXECUTE
+	.model = &STARPUFFT(twist1_1d_model),
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+static struct starpu_codelet STARPUFFT(fft1_1d_codelet) = {
+	.where =
+#ifdef __STARPU_USE_CUDA
+		STARPU_CUDA|
+#endif
+#ifdef STARPU_HAVE_FFTW
+		STARPU_CPU|
+#endif
+		0,
+#ifdef __STARPU_USE_CUDA
+	.cuda_funcs = {STARPUFFT(fft1_1d_kernel_gpu), NULL},
+#endif
+#ifdef STARPU_HAVE_FFTW
+	.cpu_funcs = {STARPUFFT(fft1_1d_kernel_cpu), NULL},
+#endif
+	CAN_EXECUTE
+	.model = &STARPUFFT(fft1_1d_model),
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_W, STARPU_R}
+};
+
+static struct starpu_codelet STARPUFFT(twist2_1d_codelet) = {
+	.where = STARPU_CPU,
+	.cpu_funcs = {STARPUFFT(twist2_1d_kernel_cpu), NULL},
+	CAN_EXECUTE
+	.model = &STARPUFFT(twist2_1d_model),
+	.nbuffers = 1,
+	.modes = {STARPU_W}
+};
+
+static struct starpu_codelet STARPUFFT(fft2_1d_codelet) = {
+	.where =
+#ifdef __STARPU_USE_CUDA
+		STARPU_CUDA|
+#endif
+#ifdef STARPU_HAVE_FFTW
+		STARPU_CPU|
+#endif
+		0,
+#ifdef __STARPU_USE_CUDA
+	.cuda_funcs = {STARPUFFT(fft2_1d_kernel_gpu), NULL},
+#endif
+#ifdef STARPU_HAVE_FFTW
+	.cpu_funcs = {STARPUFFT(fft2_1d_kernel_cpu), NULL},
+#endif
+	CAN_EXECUTE
+	.model = &STARPUFFT(fft2_1d_model),
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+static struct starpu_codelet STARPUFFT(twist3_1d_codelet) = {
+	.where = STARPU_CPU,
+	.cpu_funcs = {STARPUFFT(twist3_1d_kernel_cpu), NULL},
+	CAN_EXECUTE
+	.model = &STARPUFFT(twist3_1d_model),
+	.nbuffers = 1,
+	.modes = {STARPU_R}
+};
+
+/*
+ *
+ * Sequential version
+ *
+ */
+
+#ifdef __STARPU_USE_CUDA
+/* Perform one fft of size n */
+static void
+STARPUFFT(fft_1d_plan_gpu)(void *args)
+{
+	STARPUFFT(plan) plan = args;
+	cufftResult cures;
+	int n = plan->n[0];
+	int workerid = starpu_worker_get_id();
+
+	cures = cufftPlan1d(&plan->plans[workerid].plan_cuda, n, _CUFFT_C2C, 1);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	cufftSetStream(plan->plans[workerid].plan_cuda, starpu_cuda_get_local_stream());
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+}
+
+static void
+STARPUFFT(fft_1d_kernel_gpu)(void *descr[], void *args)
+{
+	STARPUFFT(plan) plan = args;
+	cufftResult cures;
+
+	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
+	_cufftComplex * restrict out = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	cures = _cufftExecC2C(plan->plans[workerid].plan_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+#endif
+
+#ifdef STARPU_HAVE_FFTW
+/* Perform one fft of size n */
+static void
+STARPUFFT(fft_1d_kernel_cpu)(void *descr[], void *_args)
+{
+	STARPUFFT(plan) plan = _args;
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	STARPUFFT(complex) * restrict in = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+	STARPUFFT(complex) * restrict out = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	_FFTW(execute_dft)(plan->plans[workerid].plan_cpu, in, out);
+}
+#endif
+
+static struct starpu_perfmodel STARPUFFT(fft_1d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"fft_1d"
+};
+
+static struct starpu_codelet STARPUFFT(fft_1d_codelet) = {
+	.where =
+#ifdef __STARPU_USE_CUDA
+		STARPU_CUDA|
+#endif
+#ifdef STARPU_HAVE_FFTW
+		STARPU_CPU|
+#endif
+		0,
+#ifdef __STARPU_USE_CUDA
+	.cuda_funcs = {STARPUFFT(fft_1d_kernel_gpu), NULL},
+#endif
+#ifdef STARPU_HAVE_FFTW
+	.cpu_funcs = {STARPUFFT(fft_1d_kernel_cpu), NULL},
+#endif
+	CAN_EXECUTE
+	.model = &STARPUFFT(fft_1d_model),
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+/* Planning:
+ *
+ * - For each CPU worker, we need to plan the two fftw stages.
+ * - For GPU workers, the planning has to be done within the CUDA context, so
+ *   it is done through starpu_execute_on_each_worker on the CUDA workers.
+ * - We allocate all the temporary buffers and register them to starpu.
+ * - We create all the tasks, but do not submit them yet. It will be possible
+ *   to reuse them at will to perform several ffts with the same planning.
+ */
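+/* For instance (illustration only, double-precision names and hypothetical
+ * buffers in1/out1/in2/out2), the same plan may serve several transforms:
+ *
+ *	starpufft_plan p = starpufft_plan_dft_1d(n, STARPUFFT_FORWARD, 0);
+ *	starpufft_execute(p, in1, out1);
+ *	starpufft_execute(p, in2, out2);
+ *	starpufft_destroy_plan(p);
+ */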
+STARPUFFT(plan)
+STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
+{
+	int workerid;
+	int n1 = DIV_1D;
+	int n2 = n / n1;
+	int n3;
+	int z;
+	struct starpu_task *task;
+
+if (PARALLEL) {
+#ifdef __STARPU_USE_CUDA
+	/* cufft 1D limited to 8M elements */
+	while (n2 > 8 << 20) {
+		n1 *= 2;
+		n2 /= 2;
+	}
+#endif
+	STARPU_ASSERT(n == n1*n2);
+	STARPU_ASSERT(n1 < (1ULL << I_BITS));
+
+	/* distribute the n2 second ffts into DIV_1D packages */
+	n3 = n2 / DIV_1D;
+	STARPU_ASSERT(n2 == n3*DIV_1D);
+}
+
+	/* TODO: flags? Automatically set FFTW_MEASURE on calibration? */
+	STARPU_ASSERT(flags == 0);
+
+	STARPUFFT(plan) plan = malloc(sizeof(*plan));
+	memset(plan, 0, sizeof(*plan));
+
+if (PARALLEL) {
+	plan->number = STARPU_ATOMIC_ADD(&starpufft_last_plan_number, 1) - 1;
+
+	/* The plan number has a limited size */
+	STARPU_ASSERT(plan->number < (1ULL << NUMBER_BITS));
+}
+
+	/* Just one dimension */
+	plan->dim = 1;
+	plan->n = malloc(plan->dim * sizeof(*plan->n));
+	plan->n[0] = n;
+
+if (PARALLEL) {
+	check_dims(plan);
+
+	plan->n1 = malloc(plan->dim * sizeof(*plan->n1));
+	plan->n1[0] = n1;
+	plan->n2 = malloc(plan->dim * sizeof(*plan->n2));
+	plan->n2[0] = n2;
+}
+
+	/* Note: this is for coherency with the 2D case */
+	plan->totsize = n;
+
+if (PARALLEL) {
+	plan->totsize1 = n1;
+	plan->totsize2 = n2;
+	plan->totsize3 = DIV_1D;
+	plan->totsize4 = plan->totsize / plan->totsize3;
+}
+	plan->type = C2C;
+	plan->sign = sign;
+
+if (PARALLEL) {
+	/* Compute the w^k just once. */
+	compute_roots(plan);
+}
+
+	/* Initialize per-worker working set */
+	for (workerid = 0; workerid < starpu_worker_get_count(); workerid++) {
+		switch (starpu_worker_get_type(workerid)) {
+		case STARPU_CPU_WORKER:
+#ifdef STARPU_HAVE_FFTW
+if (PARALLEL) {
+			/* first fft plan: one fft of size n2.
+			 * FFTW imposes that buffer pointers are known at
+			 * planning time. */
+			plan->plans[workerid].plan1_cpu = _FFTW(plan_dft_1d)(n2, NULL, (void*) 1, sign, _FFTW_FLAGS);
+			STARPU_ASSERT(plan->plans[workerid].plan1_cpu);
+
+			/* second fft plan: n3 ffts of size n1 */
+			plan->plans[workerid].plan2_cpu = _FFTW(plan_many_dft)(plan->dim,
+					plan->n1, n3,
+					NULL, NULL, 1, plan->totsize1,
+					(void*) 1, NULL, 1, plan->totsize1,
+					sign, _FFTW_FLAGS);
+			STARPU_ASSERT(plan->plans[workerid].plan2_cpu);
+} else {
+			/* fft plan: one fft of size n. */
+			plan->plans[workerid].plan_cpu = _FFTW(plan_dft_1d)(n, NULL, (void*) 1, sign, _FFTW_FLAGS);
+			STARPU_ASSERT(plan->plans[workerid].plan_cpu);
+}
+#else
+/* #warning libstarpufft cannot work correctly if libfftw3 is not installed */
+#endif
+			break;
+		case STARPU_CUDA_WORKER:
+			break;
+		default:
+			/* Do not care, we won't be executing anything there. */
+			break;
+		}
+	}
+#ifdef __STARPU_USE_CUDA
+if (PARALLEL) {
+	starpu_execute_on_each_worker(STARPUFFT(fft1_1d_plan_gpu), plan, STARPU_CUDA);
+	starpu_execute_on_each_worker(STARPUFFT(fft2_1d_plan_gpu), plan, STARPU_CUDA);
+} else {
+	starpu_execute_on_each_worker(STARPUFFT(fft_1d_plan_gpu), plan, STARPU_CUDA);
+}
+#endif
+
+if (PARALLEL) {
+	/* Allocate buffers. */
+	plan->twisted1 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->twisted1));
+	memset(plan->twisted1, 0, plan->totsize * sizeof(*plan->twisted1));
+	plan->fft1 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->fft1));
+	memset(plan->fft1, 0, plan->totsize * sizeof(*plan->fft1));
+	plan->twisted2 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->twisted2));
+	memset(plan->twisted2, 0, plan->totsize * sizeof(*plan->twisted2));
+	plan->fft2 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->fft2));
+	memset(plan->fft2, 0, plan->totsize * sizeof(*plan->fft2));
+
+	/* Allocate handle arrays */
+	plan->twisted1_handle = malloc(plan->totsize1 * sizeof(*plan->twisted1_handle));
+	plan->fft1_handle = malloc(plan->totsize1 * sizeof(*plan->fft1_handle));
+	plan->twisted2_handle = malloc(plan->totsize3 * sizeof(*plan->twisted2_handle));
+	plan->fft2_handle = malloc(plan->totsize3 * sizeof(*plan->fft2_handle));
+
+	/* Allocate task arrays */
+	plan->twist1_tasks = malloc(plan->totsize1 * sizeof(*plan->twist1_tasks));
+	plan->fft1_tasks = malloc(plan->totsize1 * sizeof(*plan->fft1_tasks));
+	plan->twist2_tasks = malloc(plan->totsize3 * sizeof(*plan->twist2_tasks));
+	plan->fft2_tasks = malloc(plan->totsize3 * sizeof(*plan->fft2_tasks));
+	plan->twist3_tasks = malloc(plan->totsize3 * sizeof(*plan->twist3_tasks));
+
+	/* Allocate codelet argument arrays */
+	plan->fft1_args = malloc(plan->totsize1 * sizeof(*plan->fft1_args));
+	plan->fft2_args = malloc(plan->totsize3 * sizeof(*plan->fft2_args));
+
+	/* Create first-round tasks: DIV_1D tasks of type twist1 and fft1 */
+	for (z = 0; z < plan->totsize1; z++) {
+		int i = z;
+#define STEP_TAG(step)	STEP_TAG_1D(plan, step, i)
+
+		/* TODO: get rid of tags */
+
+		plan->fft1_args[z].plan = plan;
+		plan->fft1_args[z].i = i;
+
+		/* Register the twisted1 buffer of size n2. */
+		starpu_vector_data_register(&plan->twisted1_handle[z], 0, (uintptr_t) &plan->twisted1[z*plan->totsize2], plan->totsize2, sizeof(*plan->twisted1));
+		/* Register the fft1 buffer of size n2. */
+		starpu_vector_data_register(&plan->fft1_handle[z], 0, (uintptr_t) &plan->fft1[z*plan->totsize2], plan->totsize2, sizeof(*plan->fft1));
+
+		/* We'll need the result of fft1 on the CPU for the second
+		 * twist anyway, so tell starpu to not keep the fft1 buffer in
+		 * the GPU. */
+		starpu_data_set_wt_mask(plan->fft1_handle[z], 1<<0);
+
+		/* Create twist1 task */
+		plan->twist1_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(twist1_1d_codelet);
+		/* task->handles[0] = to be filled at execution to point
+		   to the application input. */
+		task->handles[1] = plan->twisted1_handle[z];
+		task->cl_arg = &plan->fft1_args[z];
+		task->tag_id = STEP_TAG(TWIST1);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that fft1 depends on twisted1 */
+		starpu_tag_declare_deps(STEP_TAG(FFT1),
+				1, STEP_TAG(TWIST1));
+
+		/* Create FFT1 task */
+		plan->fft1_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(fft1_1d_codelet);
+		task->handles[0] = plan->twisted1_handle[z];
+		task->handles[1] = plan->fft1_handle[z];
+		task->handles[2] = plan->roots_handle[0];
+		task->cl_arg = &plan->fft1_args[z];
+		task->tag_id = STEP_TAG(FFT1);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that the join task will depend on the fft1 task. */
+		starpu_tag_declare_deps(STEP_TAG_1D(plan, JOIN, 0),
+				1, STEP_TAG(FFT1));
+#undef STEP_TAG
+	}
+
+	/* Create the join task, only serving as a dependency point between
+	 * fft1 and twist2 tasks */
+	plan->join_task = task = starpu_task_create();
+	task->cl = NULL;
+	task->tag_id = STEP_TAG_1D(plan, JOIN, 0);
+	task->use_tag = 1;
+	task->destroy = 0;
+
+	/* Create second-round tasks: DIV_1D batches of n2/DIV_1D twist2, fft2,
+	 * and twist3 */
+	for (z = 0; z < plan->totsize3; z++) {
+		int jj = z;
+#define STEP_TAG(step)	STEP_TAG_1D(plan, step, jj)
+
+		plan->fft2_args[z].plan = plan;
+		plan->fft2_args[z].jj = jj;
+
+		/* Register n3 twisted2 buffers of size n1 */
+		starpu_vector_data_register(&plan->twisted2_handle[z], 0, (uintptr_t) &plan->twisted2[z*plan->totsize4], plan->totsize4, sizeof(*plan->twisted2));
+		starpu_vector_data_register(&plan->fft2_handle[z], 0, (uintptr_t) &plan->fft2[z*plan->totsize4], plan->totsize4, sizeof(*plan->fft2));
+
+		/* We'll need the result of fft2 on the CPU for the third
+		 * twist anyway, so tell starpu to not keep the fft2 buffer in
+		 * the GPU. */
+		starpu_data_set_wt_mask(plan->fft2_handle[z], 1<<0);
+
+		/* Tell that twisted2 depends on the join task */
+		starpu_tag_declare_deps(STEP_TAG(TWIST2),
+				1, STEP_TAG_1D(plan, JOIN, 0));
+
+		/* Create twist2 task */
+		plan->twist2_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(twist2_1d_codelet);
+		task->handles[0] = plan->twisted2_handle[z];
+		task->cl_arg = &plan->fft2_args[z];
+		task->tag_id = STEP_TAG(TWIST2);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that fft2 depends on twisted2 */
+		starpu_tag_declare_deps(STEP_TAG(FFT2),
+				1, STEP_TAG(TWIST2));
+
+		/* Create FFT2 task */
+		plan->fft2_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(fft2_1d_codelet);
+		task->handles[0] = plan->twisted2_handle[z];
+		task->handles[1] = plan->fft2_handle[z];
+		task->cl_arg = &plan->fft2_args[z];
+		task->tag_id = STEP_TAG(FFT2);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that twist3 depends on fft2 */
+		starpu_tag_declare_deps(STEP_TAG(TWIST3),
+				1, STEP_TAG(FFT2));
+
+		/* Create twist3 tasks */
+		/* These run only on CPUs and thus write directly into the
+		 * application output buffer. */
+		plan->twist3_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(twist3_1d_codelet);
+		task->handles[0] = plan->fft2_handle[z];
+		task->cl_arg = &plan->fft2_args[z];
+		task->tag_id = STEP_TAG(TWIST3);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that the end task depends on this twist3 being finished */
+		starpu_tag_declare_deps(STEP_TAG_1D(plan, END, 0),
+				1, STEP_TAG(TWIST3));
+#undef STEP_TAG
+	}
+
+	/* Create end task, only serving as a join point. */
+	plan->end_task = task = starpu_task_create();
+	task->cl = NULL;
+	task->tag_id = STEP_TAG_1D(plan, END, 0);
+	task->use_tag = 1;
+	task->destroy = 0;
+
+}
+
+	return plan;
+}
+
+/* Actually submit all the tasks. */
+static struct starpu_task *
+STARPUFFT(start1dC2C)(STARPUFFT(plan) plan, starpu_data_handle_t in, starpu_data_handle_t out)
+{
+	STARPU_ASSERT(plan->type == C2C);
+	int z;
+	int ret;
+
+if (PARALLEL) {
+	for (z=0; z < plan->totsize1; z++) {
+		ret = starpu_task_submit(plan->twist1_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		ret = starpu_task_submit(plan->fft1_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	ret = starpu_task_submit(plan->join_task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	for (z=0; z < plan->totsize3; z++) {
+		ret = starpu_task_submit(plan->twist2_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		ret = starpu_task_submit(plan->fft2_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		ret = starpu_task_submit(plan->twist3_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	ret = starpu_task_submit(plan->end_task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	return plan->end_task;
+} else /* !PARALLEL */ {
+	struct starpu_task *task;
+
+	/* Create FFT task */
+	task = starpu_task_create();
+	task->detach = 0;
+	task->cl = &STARPUFFT(fft_1d_codelet);
+	task->handles[0] = in;
+	task->handles[1] = out;
+	task->cl_arg = plan;
+
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	return task;
+}
+}
+
+/* Free all the tags. The generic code handles freeing the buffers. */
+static void
+STARPUFFT(free_1d_tags)(STARPUFFT(plan) plan)
+{
+	unsigned i;
+	int n1 = plan->n1[0];
+
+	if (!PARALLEL)
+		return;
+
+	for (i = 0; i < n1; i++) {
+		starpu_tag_remove(STEP_TAG_1D(plan, TWIST1, i));
+		starpu_tag_remove(STEP_TAG_1D(plan, FFT1, i));
+	}
+
+	starpu_tag_remove(STEP_TAG_1D(plan, JOIN, 0));
+
+	for (i = 0; i < DIV_1D; i++) {
+		starpu_tag_remove(STEP_TAG_1D(plan, TWIST2, i));
+		starpu_tag_remove(STEP_TAG_1D(plan, FFT2, i));
+		starpu_tag_remove(STEP_TAG_1D(plan, TWIST3, i));
+	}
+
+	starpu_tag_remove(STEP_TAG_1D(plan, END, 0));
+}

+ 850 - 0
starpufft/starpufftx2d.c

@@ -0,0 +1,850 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#define DIV_2D_N 8
+#define DIV_2D_M 8
+
+#define I_SHIFT (I_BITS/2)
+#define J_BITS I_SHIFT
+
+#define STEP_TAG_2D(plan, step, i, j) _STEP_TAG(plan, step, ((starpu_tag_t) i << I_SHIFT) | (starpu_tag_t) j)
+
+#ifdef __STARPU_USE_CUDA
+/* Twist the full vector into a n2,m2 chunk */
+static void
+STARPUFFT(twist1_2d_kernel_gpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int j = args->j;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int m1 = plan->n1[1];
+	int m2 = plan->n2[1];
+
+	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
+	_cufftComplex * restrict twisted1 = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	STARPUFFT(cuda_twist1_2d_host)(in, twisted1, i, j, n1, n2, m1, m2);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+
+/* fft1:
+ *
+ * Perform one fft of size n2,m2 */
+static void
+STARPUFFT(fft1_2d_plan_gpu)(void *args)
+{
+	STARPUFFT(plan) plan = args;
+	int n2 = plan->n2[0];
+	int m2 = plan->n2[1];
+	int workerid = starpu_worker_get_id();
+	cufftResult cures;
+
+	cures = cufftPlan2d(&plan->plans[workerid].plan1_cuda, n2, m2, _CUFFT_C2C);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	cufftSetStream(plan->plans[workerid].plan1_cuda, starpu_cuda_get_local_stream());
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+}
+
+static void
+STARPUFFT(fft1_2d_kernel_gpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int j = args->j;
+	int n2 = plan->n2[0];
+	int m2 = plan->n2[1];
+	cufftResult cures;
+
+	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
+	_cufftComplex * restrict out = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
+	const _cufftComplex * restrict roots0 = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[2]);
+	const _cufftComplex * restrict roots1 = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[3]);
+
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	cures = _cufftExecC2C(plan->plans[workerid].plan1_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+
+	/* synchronization is done after the twiddling */
+	STARPUFFT(cuda_twiddle_2d_host)(out, roots0, roots1, n2, m2, i, j);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+
+/* fft2:
+ *
+ * Perform n3*m3 ffts of size n1,m1 */
+static void
+STARPUFFT(fft2_2d_plan_gpu)(void *args)
+{
+	STARPUFFT(plan) plan = args;
+	int n1 = plan->n1[0];
+	int m1 = plan->n1[1];
+	cufftResult cures;
+	int workerid = starpu_worker_get_id();
+
+	cures = cufftPlan2d(&plan->plans[workerid].plan2_cuda, n1, m1, _CUFFT_C2C);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	cufftSetStream(plan->plans[workerid].plan2_cuda, starpu_cuda_get_local_stream());
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+}
+
+static void
+STARPUFFT(fft2_2d_kernel_gpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int m1 = plan->n1[1];
+	int m2 = plan->n2[1];
+	int n3 = n2/DIV_2D_N;
+	int m3 = m2/DIV_2D_M;
+	int n;
+	cufftResult cures;
+
+	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
+	_cufftComplex * restrict out = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	for (n = 0; n < n3*m3; n++) {
+		cures = _cufftExecC2C(plan->plans[workerid].plan2_cuda, in + n * n1*m1, out + n * n1*m1, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
+		STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	}
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+#endif
+
+/* Twist the full vector into a n2,m2 chunk */
+static void
+STARPUFFT(twist1_2d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int j = args->j;
+	int k, l;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int m1 = plan->n1[1];
+	int m2 = plan->n2[1];
+	int m = plan->n[1];
+
+	STARPUFFT(complex) * restrict in = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+	STARPUFFT(complex) * restrict twisted1 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	/* printf("twist1 %d %d %g\n", i, j, (double) cabs(plan->in[i+j])); */
+
+	for (k = 0; k < n2; k++)
+		for (l = 0; l < m2; l++)
+			twisted1[k*m2+l] = in[i*m+j+k*m*n1+l*m1];
+}
+
+#ifdef STARPU_HAVE_FFTW
+/* Perform an n2,m2 fft */
+static void
+STARPUFFT(fft1_2d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int j = args->j;
+	int k, l;
+	int n2 = plan->n2[0];
+	int m2 = plan->n2[1];
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	STARPUFFT(complex) *twisted1 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+	STARPUFFT(complex) *fft1 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	/* printf("fft1 %d %d %g\n", i, j, (double) cabs(twisted1[0])); */
+
+	_FFTW(execute_dft)(plan->plans[workerid].plan1_cpu, twisted1, fft1);
+	for (k = 0; k < n2; k++)
+		for (l = 0; l < m2; l++)
+			fft1[k*m2 + l] = fft1[k*m2 + l] * plan->roots[0][i*k] * plan->roots[1][j*l];
+}
+#endif
+
+/* Twist the full vector into a package of n2/DIV_2D_N,m2/DIV_2D_M (n1,m1) chunks */
+static void
+STARPUFFT(twist2_2d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int kk = args->kk;	/* between 0 and DIV_2D_N */
+	int ll = args->ll;	/* between 0 and DIV_2D_M */
+	int kkk, lll;		/* between 0,0 and n3,m3 */
+	int i, j;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int m1 = plan->n1[1];
+	int m2 = plan->n2[1];
+	int n3 = n2/DIV_2D_N;
+	int m3 = m2/DIV_2D_M;
+
+	STARPUFFT(complex) * restrict twisted2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+
+	/* printf("twist2 %d %d %g\n", kk, ll, (double) cabs(plan->fft1[kk+ll])); */
+
+	for (kkk = 0; kkk < n3; kkk++) {
+		int k = kk * n3 + kkk;
+		for (lll = 0; lll < m3; lll++) {
+			int l = ll * m3 + lll;
+			for (i = 0; i < n1; i++)
+				for (j = 0; j < m1; j++)
+					twisted2[kkk*m3*n1*m1+lll*n1*m1+i*m1+j] = plan->fft1[i*n1*n2*m2+j*n2*m2+k*m2+l];
+		}
+	}
+}
+
+#ifdef STARPU_HAVE_FFTW
+/* Perform (n2/DIV_2D_N)*(m2/DIV_2D_M) (n1,m1) ffts */
+static void
+STARPUFFT(fft2_2d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	/* int kk = args->kk; */
+	/* int ll = args->ll; */
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	STARPUFFT(complex) *twisted2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+	STARPUFFT(complex) *fft2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	/* printf("fft2 %d %d %g\n", kk, ll, (double) cabs(twisted2[plan->totsize4-1])); */
+
+	_FFTW(execute_dft)(plan->plans[workerid].plan2_cpu, twisted2, fft2);
+}
+#endif
+
+/* Spread the package of (n2/DIV_2D_N)*(m2/DIV_2D_M) (n1,m1) chunks into the full vector */
+static void
+STARPUFFT(twist3_2d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int kk = args->kk;	/* between 0 and DIV_2D_N */
+	int ll = args->ll;	/* between 0 and DIV_2D_M */
+	int kkk, lll;		/* between 0,0 and n3,m3 */
+	int i, j;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int m1 = plan->n1[1];
+	int m2 = plan->n2[1];
+	int n3 = n2/DIV_2D_N;
+	int m3 = m2/DIV_2D_M;
+	int m = plan->n[1];
+
+	const STARPUFFT(complex) * restrict fft2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+
+	/* printf("twist3 %d %d %g\n", kk, ll, (double) cabs(fft2[0])); */
+
+	for (kkk = 0; kkk < n3; kkk++) {
+		int k = kk * n3 + kkk;
+		for (lll = 0; lll < m3; lll++) {
+			int l = ll * m3 + lll;
+			for (i = 0; i < n1; i++)
+				for (j = 0; j < m1; j++)
+					plan->out[i*n2*m+j*m2+k*m+l] = fft2[kkk*m3*n1*m1+lll*n1*m1+i*m1+j];
+		}
+	}
+}
+
+struct starpu_perfmodel STARPUFFT(twist1_2d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"twist1_2d"
+};
+
+struct starpu_perfmodel STARPUFFT(fft1_2d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"fft1_2d"
+};
+
+struct starpu_perfmodel STARPUFFT(twist2_2d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"twist2_2d"
+};
+
+struct starpu_perfmodel STARPUFFT(fft2_2d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"fft2_2d"
+};
+
+struct starpu_perfmodel STARPUFFT(twist3_2d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"twist3_2d"
+};
+
+static struct starpu_codelet STARPUFFT(twist1_2d_codelet) = {
+	.where =
+#ifdef __STARPU_USE_CUDA
+		STARPU_CUDA|
+#endif
+		STARPU_CPU,
+#ifdef __STARPU_USE_CUDA
+	.cuda_funcs = {STARPUFFT(twist1_2d_kernel_gpu), NULL},
+#endif
+	.cpu_funcs = {STARPUFFT(twist1_2d_kernel_cpu), NULL},
+	CAN_EXECUTE
+	.model = &STARPUFFT(twist1_2d_model),
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+static struct starpu_codelet STARPUFFT(fft1_2d_codelet) = {
+	.where =
+#ifdef __STARPU_USE_CUDA
+		STARPU_CUDA|
+#endif
+#ifdef STARPU_HAVE_FFTW
+		STARPU_CPU|
+#endif
+		0,
+#ifdef __STARPU_USE_CUDA
+	.cuda_funcs = {STARPUFFT(fft1_2d_kernel_gpu), NULL},
+#endif
+#ifdef STARPU_HAVE_FFTW
+	.cpu_funcs = {STARPUFFT(fft1_2d_kernel_cpu), NULL},
+#endif
+	CAN_EXECUTE
+	.model = &STARPUFFT(fft1_2d_model),
+	.nbuffers = 4,
+	.modes = {STARPU_R, STARPU_W, STARPU_R, STARPU_R}
+};
+
+static struct starpu_codelet STARPUFFT(twist2_2d_codelet) = {
+	.where = STARPU_CPU,
+	.cpu_funcs = {STARPUFFT(twist2_2d_kernel_cpu), NULL},
+	CAN_EXECUTE
+	.model = &STARPUFFT(twist2_2d_model),
+	.nbuffers = 1,
+	.modes = {STARPU_W}
+};
+
+static struct starpu_codelet STARPUFFT(fft2_2d_codelet) = {
+	.where =
+#ifdef __STARPU_USE_CUDA
+		STARPU_CUDA|
+#endif
+#ifdef STARPU_HAVE_FFTW
+		STARPU_CPU|
+#endif
+		0,
+#ifdef __STARPU_USE_CUDA
+	.cuda_funcs = {STARPUFFT(fft2_2d_kernel_gpu), NULL},
+#endif
+#ifdef STARPU_HAVE_FFTW
+	.cpu_funcs = {STARPUFFT(fft2_2d_kernel_cpu), NULL},
+#endif
+	CAN_EXECUTE
+	.model = &STARPUFFT(fft2_2d_model),
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+static struct starpu_codelet STARPUFFT(twist3_2d_codelet) = {
+	.where = STARPU_CPU,
+	.cpu_funcs = {STARPUFFT(twist3_2d_kernel_cpu), NULL},
+	CAN_EXECUTE
+	.model = &STARPUFFT(twist3_2d_model),
+	.nbuffers = 1,
+	.modes = {STARPU_R}
+};
+
+/*
+ *
+ * Sequential version
+ *
+ */
+
+#ifdef __STARPU_USE_CUDA
+/* Perform one fft of size n,m */
+static void
+STARPUFFT(fft_2d_plan_gpu)(void *args)
+{
+	STARPUFFT(plan) plan = args;
+	cufftResult cures;
+	int n = plan->n[0];
+	int m = plan->n[1];
+	int workerid = starpu_worker_get_id();
+
+	cures = cufftPlan2d(&plan->plans[workerid].plan_cuda, n, m, _CUFFT_C2C);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	cufftSetStream(plan->plans[workerid].plan_cuda, starpu_cuda_get_local_stream());
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+}
+
+static void
+STARPUFFT(fft_2d_kernel_gpu)(void *descr[], void *args)
+{
+	STARPUFFT(plan) plan = args;
+	cufftResult cures;
+
+	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
+	_cufftComplex * restrict out = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	cures = _cufftExecC2C(plan->plans[workerid].plan_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+#endif
+
+#ifdef STARPU_HAVE_FFTW
+/* Perform one fft of size n,m */
+static void
+STARPUFFT(fft_2d_kernel_cpu)(void *descr[], void *_args)
+{
+	STARPUFFT(plan) plan = _args;
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	STARPUFFT(complex) * restrict in = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+	STARPUFFT(complex) * restrict out = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	_FFTW(execute_dft)(plan->plans[workerid].plan_cpu, in, out);
+}
+#endif
+
+static struct starpu_perfmodel STARPUFFT(fft_2d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"fft_2d"
+};
+
+static struct starpu_codelet STARPUFFT(fft_2d_codelet) = {
+	.where =
+#ifdef __STARPU_USE_CUDA
+		STARPU_CUDA|
+#endif
+#ifdef STARPU_HAVE_FFTW
+		STARPU_CPU|
+#endif
+		0,
+#ifdef __STARPU_USE_CUDA
+	.cuda_funcs = {STARPUFFT(fft_2d_kernel_gpu), NULL},
+#endif
+#ifdef STARPU_HAVE_FFTW
+	.cpu_funcs = {STARPUFFT(fft_2d_kernel_cpu), NULL},
+#endif
+	CAN_EXECUTE
+	.model = &STARPUFFT(fft_2d_model),
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+STARPUFFT(plan)
+STARPUFFT(plan_dft_2d)(int n, int m, int sign, unsigned flags)
+{
+	int workerid;
+	int n1 = DIV_2D_N;
+	int n2 = n / n1;
+	int n3;
+	int m1 = DIV_2D_M;
+	int m2 = m / m1;
+	int m3;
+	int z;
+	struct starpu_task *task;
+
+if (PARALLEL) {
+	/*
+	 * Simple strategy:
+	 *
+	 * - twist1: twist input in n1*m1 (n2,m2) chunks
+	 * - fft1:   perform n1*m1 (n2,m2) ffts
+	 * - twist2: twist into n2*m2 (n1,m1) chunks distributed in
+	 *           DIV_2D_N*DIV_2D_M groups
+	 * - fft2:   perform DIV_2D_N*DIV_2D_M times n3*m3 (n1,m1) ffts
+	 * - twist3: twist back into output
+	 */
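+
+	/*
+	 * Concrete illustration (numbers chosen for this comment only): for a
+	 * 1024 x 1024 transform with DIV_2D_N = DIV_2D_M = 8, n1 = m1 = 8 and
+	 * n2 = m2 = 128.  The first round thus runs 64 twist1/fft1 tasks on
+	 * (128,128) chunks, and the second round runs 64 batches, each
+	 * performing n3*m3 = 16*16 = 256 ffts of size (8,8).
+	 */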
+
+#ifdef __STARPU_USE_CUDA
+	/* cufft 2D-3D limited to [2,16384] */
+	while (n2 > 16384) {
+		n1 *= 2;
+		n2 /= 2;
+	}
+#endif
+	STARPU_ASSERT(n == n1*n2);
+	STARPU_ASSERT(n1 < (1ULL << J_BITS));
+
+
+#ifdef __STARPU_USE_CUDA
+	/* cufft 2D-3D limited to [2,16384] */
+	while (m2 > 16384) {
+		m1 *= 2;
+		m2 /= 2;
+	}
+#endif
+	STARPU_ASSERT(m == m1*m2);
+	STARPU_ASSERT(m1 < (1ULL << J_BITS));
+
+	/* distribute the n2*m2 second ffts into DIV_2D_N*DIV_2D_M packages */
+	n3 = n2 / DIV_2D_N;
+	STARPU_ASSERT(n2 == n3*DIV_2D_N);
+	m3 = m2 / DIV_2D_M;
+	STARPU_ASSERT(m2 == m3*DIV_2D_M);
+}
+
+	/* TODO: flags? Automatically set FFTW_MEASURE on calibration? */
+	STARPU_ASSERT(flags == 0);
+
+	STARPUFFT(plan) plan = malloc(sizeof(*plan));
+	memset(plan, 0, sizeof(*plan));
+
+if (PARALLEL) {
+	plan->number = STARPU_ATOMIC_ADD(&starpufft_last_plan_number, 1) - 1;
+
+	/* The plan number has a limited size in the tag space */
+	STARPU_ASSERT(plan->number < (1ULL << NUMBER_BITS));
+}
+
+	plan->dim = 2;
+	plan->n = malloc(plan->dim * sizeof(*plan->n));
+	plan->n[0] = n;
+	plan->n[1] = m;
+
+if (PARALLEL) {
+	check_dims(plan);
+
+	plan->n1 = malloc(plan->dim * sizeof(*plan->n1));
+	plan->n1[0] = n1;
+	plan->n1[1] = m1;
+	plan->n2 = malloc(plan->dim * sizeof(*plan->n2));
+	plan->n2[0] = n2;
+	plan->n2[1] = m2;
+}
+
+	plan->totsize = n * m;
+
+if (PARALLEL) {
+	plan->totsize1 = n1 * m1;
+	plan->totsize2 = n2 * m2;
+	plan->totsize3 = DIV_2D_N * DIV_2D_M;
+	plan->totsize4 = plan->totsize / plan->totsize3;
+}
+	plan->type = C2C;
+	plan->sign = sign;
+
+if (PARALLEL) {
+	/* Compute the w^k just once. */
+	compute_roots(plan);
+}
+
+	/* Initialize per-worker working set */
+	for (workerid = 0; workerid < starpu_worker_get_count(); workerid++) {
+		switch (starpu_worker_get_type(workerid)) {
+		case STARPU_CPU_WORKER:
+#ifdef STARPU_HAVE_FFTW
+if (PARALLEL) {
+			/* first fft plan: one n2*m2 fft */
+			plan->plans[workerid].plan1_cpu = _FFTW(plan_dft_2d)(n2, m2, NULL, (void*) 1, sign, _FFTW_FLAGS);
+			STARPU_ASSERT(plan->plans[workerid].plan1_cpu);
+
+			/* second fft plan: n3*m3 ffts of size n1*m1 */
+			plan->plans[workerid].plan2_cpu = _FFTW(plan_many_dft)(plan->dim,
+					plan->n1, n3*m3,
+					NULL, NULL, 1, plan->totsize1,
+					(void*) 1, NULL, 1, plan->totsize1,
+					sign, _FFTW_FLAGS);
+			STARPU_ASSERT(plan->plans[workerid].plan2_cpu);
+} else {
+			/* fft plan: one fft of size n, m. */
+			plan->plans[workerid].plan_cpu = _FFTW(plan_dft_2d)(n, m, NULL, (void*) 1, sign, _FFTW_FLAGS);
+			STARPU_ASSERT(plan->plans[workerid].plan_cpu);
+}
+#else
+/* #warning libstarpufft cannot work correctly if libfftw3 is not installed */
+#endif
+			break;
+		case STARPU_CUDA_WORKER:
+			break;
+		default:
+			/* Nothing to do: no FFT kernels will run on this kind of worker. */
+			break;
+		}
+	}
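+	/* The GPU-side plans below are built via starpu_execute_on_each_worker
+	 * so that each cufft plan is created by the CUDA worker that will use
+	 * it (cufft plans are tied to the CUDA context current at creation time). */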
+#ifdef __STARPU_USE_CUDA
+if (PARALLEL) {
+	starpu_execute_on_each_worker(STARPUFFT(fft1_2d_plan_gpu), plan, STARPU_CUDA);
+	starpu_execute_on_each_worker(STARPUFFT(fft2_2d_plan_gpu), plan, STARPU_CUDA);
+} else {
+	starpu_execute_on_each_worker(STARPUFFT(fft_2d_plan_gpu), plan, STARPU_CUDA);
+}
+#endif
+
+if (PARALLEL) {
+	/* Allocate buffers. */
+	plan->twisted1 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->twisted1));
+	memset(plan->twisted1, 0, plan->totsize * sizeof(*plan->twisted1));
+	plan->fft1 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->fft1));
+	memset(plan->fft1, 0, plan->totsize * sizeof(*plan->fft1));
+	plan->twisted2 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->twisted2));
+	memset(plan->twisted2, 0, plan->totsize * sizeof(*plan->twisted2));
+	plan->fft2 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->fft2));
+	memset(plan->fft2, 0, plan->totsize * sizeof(*plan->fft2));
+
+	/* Allocate handle arrays */
+	plan->twisted1_handle = malloc(plan->totsize1 * sizeof(*plan->twisted1_handle));
+	plan->fft1_handle = malloc(plan->totsize1 * sizeof(*plan->fft1_handle));
+	plan->twisted2_handle = malloc(plan->totsize3 * sizeof(*plan->twisted2_handle));
+	plan->fft2_handle = malloc(plan->totsize3 * sizeof(*plan->fft2_handle));
+
+	/* Allocate task arrays */
+	plan->twist1_tasks = malloc(plan->totsize1 * sizeof(*plan->twist1_tasks));
+	plan->fft1_tasks = malloc(plan->totsize1 * sizeof(*plan->fft1_tasks));
+	plan->twist2_tasks = malloc(plan->totsize3 * sizeof(*plan->twist2_tasks));
+	plan->fft2_tasks = malloc(plan->totsize3 * sizeof(*plan->fft2_tasks));
+	plan->twist3_tasks = malloc(plan->totsize3 * sizeof(*plan->twist3_tasks));
+
+	/* Allocate codelet argument arrays */
+	plan->fft1_args = malloc(plan->totsize1 * sizeof(*plan->fft1_args));
+	plan->fft2_args = malloc(plan->totsize3 * sizeof(*plan->fft2_args));
+
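+	/*
+	 * Summary of the tag dependency chain built below, using the step
+	 * names of the STEP_TAG_2D tags:
+	 *   TWIST1(i,j) -> FFT1(i,j) -> JOIN -> TWIST2(kk,ll) -> FFT2(kk,ll)
+	 *   -> TWIST3(kk,ll) -> END
+	 */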
+	/* Create first-round tasks */
+	for (z = 0; z < plan->totsize1; z++) {
+		int i = z / m1, j = z % m1;
+#define STEP_TAG(step)	STEP_TAG_2D(plan, step, i, j)
+
+		/* TODO: get rid of tags */
+
+		plan->fft1_args[z].plan = plan;
+		plan->fft1_args[z].i = i;
+		plan->fft1_args[z].j = j;
+
+		/* Register (n2,m2) chunks */
+		starpu_vector_data_register(&plan->twisted1_handle[z], 0, (uintptr_t) &plan->twisted1[z*plan->totsize2], plan->totsize2, sizeof(*plan->twisted1));
+		starpu_vector_data_register(&plan->fft1_handle[z], 0, (uintptr_t) &plan->fft1[z*plan->totsize2], plan->totsize2, sizeof(*plan->fft1));
+
+		/* We'll need it on the CPU for the second twist anyway */
+		starpu_data_set_wt_mask(plan->fft1_handle[z], 1<<0);
+
+		/* Create twist1 task */
+		plan->twist1_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(twist1_2d_codelet);
+		/* task->handles[0] = to be filled at execution */
+		task->handles[1] = plan->twisted1_handle[z];
+		task->cl_arg = &plan->fft1_args[z];
+		task->tag_id = STEP_TAG(TWIST1);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that fft1 depends on twisted1 */
+		starpu_tag_declare_deps(STEP_TAG(FFT1),
+				1, STEP_TAG(TWIST1));
+
+		/* Create FFT1 task */
+		plan->fft1_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(fft1_2d_codelet);
+		task->handles[0] = plan->twisted1_handle[z];
+		task->handles[1] = plan->fft1_handle[z];
+		task->handles[2] = plan->roots_handle[0];
+		task->handles[3] = plan->roots_handle[1];
+		task->cl_arg = &plan->fft1_args[z];
+		task->tag_id = STEP_TAG(FFT1);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that the first step (JOIN) is not done until this
+		 * fft1 has finished */
+		starpu_tag_declare_deps(STEP_TAG_2D(plan, JOIN, 0, 0),
+				1, STEP_TAG(FFT1));
+#undef STEP_TAG
+	}
+
+	/* Create join task */
+	plan->join_task = task = starpu_task_create();
+	task->cl = NULL;
+	task->tag_id = STEP_TAG_2D(plan, JOIN, 0, 0);
+	task->use_tag = 1;
+	task->destroy = 0;
+
+	/* Create second-round tasks */
+	for (z = 0; z < plan->totsize3; z++) {
+		int kk = z / DIV_2D_M, ll = z % DIV_2D_M;
+#define STEP_TAG(step)	STEP_TAG_2D(plan, step, kk, ll)
+
+		plan->fft2_args[z].plan = plan;
+		plan->fft2_args[z].kk = kk;
+		plan->fft2_args[z].ll = ll;
+
+		/* Register n3*m3 (n1,m1) chunks */
+		starpu_vector_data_register(&plan->twisted2_handle[z], 0, (uintptr_t) &plan->twisted2[z*plan->totsize4], plan->totsize4, sizeof(*plan->twisted2));
+		starpu_vector_data_register(&plan->fft2_handle[z], 0, (uintptr_t) &plan->fft2[z*plan->totsize4], plan->totsize4, sizeof(*plan->fft2));
+
+		/* We'll need it on the CPU for the last twist anyway */
+		starpu_data_set_wt_mask(plan->fft2_handle[z], 1<<0);
+
+		/* Tell that twisted2 depends on the whole first step to be
+		 * done */
+		starpu_tag_declare_deps(STEP_TAG(TWIST2),
+				1, STEP_TAG_2D(plan, JOIN, 0, 0));
+
+		/* Create twist2 task */
+		plan->twist2_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(twist2_2d_codelet);
+		task->handles[0] = plan->twisted2_handle[z];
+		task->cl_arg = &plan->fft2_args[z];
+		task->tag_id = STEP_TAG(TWIST2);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that fft2 depends on twisted2 */
+		starpu_tag_declare_deps(STEP_TAG(FFT2),
+				1, STEP_TAG(TWIST2));
+
+		/* Create FFT2 task */
+		plan->fft2_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(fft2_2d_codelet);
+		task->handles[0] = plan->twisted2_handle[z];
+		task->handles[1] = plan->fft2_handle[z];
+		task->cl_arg = &plan->fft2_args[z];
+		task->tag_id = STEP_TAG(FFT2);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that twist3 depends on fft2 */
+		starpu_tag_declare_deps(STEP_TAG(TWIST3),
+				1, STEP_TAG(FFT2));
+
+		/* Create twist3 task. It runs only on CPUs and thus writes
+		 * directly into the application output buffer. */
+		plan->twist3_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(twist3_2d_codelet);
+		task->handles[0] = plan->fft2_handle[z];
+		task->cl_arg = &plan->fft2_args[z];
+		task->tag_id = STEP_TAG(TWIST3);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that the whole plan (END) is not finished until this twist3 has finished */
+		starpu_tag_declare_deps(STEP_TAG_2D(plan, END, 0, 0),
+				1, STEP_TAG(TWIST3));
+#undef STEP_TAG
+	}
+
+	/* Create end task */
+	plan->end_task = task = starpu_task_create();
+	task->cl = NULL;
+	task->tag_id = STEP_TAG_2D(plan, END, 0, 0);
+	task->use_tag = 1;
+	task->destroy = 0;
+
+}
+
+	return plan;
+}
+
+/* Actually submit all the tasks. */
+static struct starpu_task *
+STARPUFFT(start2dC2C)(STARPUFFT(plan) plan, starpu_data_handle_t in, starpu_data_handle_t out)
+{
+	STARPU_ASSERT(plan->type == C2C);
+	int z;
+	int ret;
+
+if (PARALLEL) {
+	for (z=0; z < plan->totsize1; z++) {
+		ret = starpu_task_submit(plan->twist1_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		ret = starpu_task_submit(plan->fft1_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	ret = starpu_task_submit(plan->join_task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	for (z=0; z < plan->totsize3; z++) {
+		ret = starpu_task_submit(plan->twist2_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		ret = starpu_task_submit(plan->fft2_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		ret = starpu_task_submit(plan->twist3_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	ret = starpu_task_submit(plan->end_task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	return plan->end_task;
+} else /* !PARALLEL */ {
+	struct starpu_task *task;
+
+	/* Create FFT task */
+	task = starpu_task_create();
+	task->detach = 0;
+	task->cl = &STARPUFFT(fft_2d_codelet);
+	task->handles[0] = in;
+	task->handles[1] = out;
+	task->cl_arg = plan;
+
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	return task;
+}
+}
+
+/* Free all the tags. The generic code handles freeing the buffers. */
+static void
+STARPUFFT(free_2d_tags)(STARPUFFT(plan) plan)
+{
+	unsigned i, j;
+	int n1 = plan->n1[0];
+	int m1 = plan->n1[1];
+
+	if (!PARALLEL)
+		return;
+
+	for (i = 0; i < n1; i++) {
+		for (j = 0; j < m1; j++) {
+			starpu_tag_remove(STEP_TAG_2D(plan, TWIST1, i, j));
+			starpu_tag_remove(STEP_TAG_2D(plan, FFT1, i, j));
+		}
+	}
+
+	starpu_tag_remove(STEP_TAG_2D(plan, JOIN, 0, 0));
+
+	for (i = 0; i < DIV_2D_N; i++) {
+		for (j = 0; j < DIV_2D_M; j++) {
+			starpu_tag_remove(STEP_TAG_2D(plan, TWIST2, i, j));
+			starpu_tag_remove(STEP_TAG_2D(plan, FFT2, i, j));
+			starpu_tag_remove(STEP_TAG_2D(plan, TWIST3, i, j));
+		}
+	}
+
+	starpu_tag_remove(STEP_TAG_2D(plan, END, 0, 0));
+}
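
For reference, a minimal sketch of how an application is expected to drive this 2D code path through the public libstarpufft API — assuming the double-precision entry points declared in starpufft.h from this commit; the forward sign follows the FFTW convention and error checking is omitted:

	#include <starpufft.h>

	void fft2d_sketch(int n, int m)
	{
		/* Buffers allocated through StarPU so they can be registered cheaply. */
		starpufft_complex *in  = starpufft_malloc(n * m * sizeof(*in));
		starpufft_complex *out = starpufft_malloc(n * m * sizeof(*out));

		/* ... fill 'in' with n*m samples ... */

		/* sign -1 = forward transform; flags must currently be 0. */
		starpufft_plan plan = starpufft_plan_dft_2d(n, m, -1, 0);
		starpufft_execute(plan, in, out);	/* submits the task graph and waits */

		starpufft_destroy_plan(plan);
		starpufft_free(in);
		starpufft_free(out);
	}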

+ 272 - 32
tests/Makefile.am

@@ -1,8 +1,8 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
-# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
-# Copyright (C) 2010, 2011  Institut National de Recherche en Informatique et Automatique
+# Copyright (C) 2009, 2010, 2011-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011, 2012  Institut National de Recherche en Informatique et Automatique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -16,15 +16,28 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 AM_CFLAGS = $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
-LIBS = $(top_builddir)/src/libstarpu.la $(HWLOC_LIBS) @LIBS@
+LIBS = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la $(HWLOC_LIBS) @LIBS@
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/src -I$(top_srcdir)/src/
 AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
 
 EXTRA_DIST =					\
+	helper.h				\
+	datawizard/scal.h			\
 	microbenchs/null_kernel_gordon.c	\
 	datawizard/sync_and_notify_data_gordon_kernels.c \
 	datawizard/sync_and_notify_data_opencl_codelet.cl\
-	coverage/coverage.sh
+	coverage/coverage.sh			\
+	datawizard/interfaces/test_interfaces.h	\
+	datawizard/interfaces/bcsr/bcsr_opencl_kernel.cl \
+	datawizard/interfaces/matrix/matrix_opencl_kernel.cl \
+	datawizard/interfaces/variable/variable_opencl_kernel.cl \
+	datawizard/interfaces/vector/test_vector_opencl_kernel.cl \
+	datawizard/interfaces/multiformat/multiformat_types.h \
+	datawizard/interfaces/multiformat/multiformat_opencl_kernel.cl \
+	datawizard/interfaces/multiformat/multiformat_conversion_codelets_kernel.cl \
+	datawizard/interfaces/multiformat/advanced/generic.h \
+	datawizard/interfaces/csr/csr_opencl_kernel.cl \
+	datawizard/interfaces/block/block_opencl_kernel.cl
 
 CLEANFILES = 					\
 	*.gcno *.gcda *.linkinfo		\
@@ -43,7 +56,7 @@ if STARPU_USE_CUDA
 # TODO define NVCCFLAGS
 NVCC ?= nvcc
 
-NVCCFLAGS += -I$(top_srcdir)/include/ -I$(top_builddir)/include $(HWLOC_CFLAGS)
+NVCCFLAGS += -I$(top_srcdir)/include/ -I$(top_srcdir)/src -I$(top_builddir)/src -I$(top_builddir)/include $(HWLOC_CFLAGS)
 
 .cu.cubin:
 	$(MKDIR_P) `dirname $@`
@@ -83,7 +96,7 @@ if !STARPU_HAVE_WINDOWS
 ## test loader program
 LOADER			=	loader
 LOADER_BIN		=	$(abs_top_builddir)/tests/$(LOADER)
-TESTS_ENVIRONMENT	=	$(LOADER_BIN)
+TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" $(LOADER_BIN)
 endif
 
 TESTS = $(noinst_PROGRAMS)
@@ -92,31 +105,39 @@ if STARPU_COVERAGE_ENABLED
 TESTS	+=	coverage/coverage.sh
 endif
 
+starpu_machine_display_SOURCES	=	../tools/starpu_machine_display.c
+
 noinst_PROGRAMS =				\
-	core/restart				\
-	core/execute_on_a_specific_worker	\
-	core/insert_task			\
-	core/multithreaded			\
-	core/multithreaded_init			\
-	core/starpu_task_wait_for_all		\
-	core/starpu_task_wait			\
-	core/static_restartable			\
-	core/static_restartable_using_initializer\
-	core/static_restartable_tag		\
-	core/regenerate				\
-	core/wait_all_regenerable_tasks		\
-	core/subgraph_repeat			\
-	core/subgraph_repeat_regenerate		\
-	core/empty_task				\
-	core/empty_task_sync_point		\
-	core/empty_task_sync_point_tasks	\
-	core/empty_task_chain			\
-	core/tag_wait_api			\
-	core/task_wait_api			\
-	core/declare_deps_in_callback		\
-	core/declare_deps_after_submission	\
-	core/declare_deps_after_submission_synchronous	\
-	core/get_current_task			\
+	starpu_machine_display			\
+	main/deprecated_func			\
+	main/deprecated_buffer			\
+	main/restart				\
+	main/execute_on_a_specific_worker	\
+	main/insert_task			\
+	main/multithreaded			\
+	main/multithreaded_init			\
+	main/starpu_task_bundle			\
+	main/starpu_task_wait_for_all		\
+	main/starpu_task_wait			\
+	main/static_restartable			\
+	main/static_restartable_using_initializer\
+	main/static_restartable_tag		\
+	main/regenerate				\
+	main/wait_all_regenerable_tasks		\
+	main/subgraph_repeat			\
+	main/subgraph_repeat_regenerate		\
+	main/empty_task				\
+	main/empty_task_sync_point		\
+	main/empty_task_sync_point_tasks	\
+	main/empty_task_chain			\
+	main/tag_wait_api			\
+	main/task_wait_api			\
+	main/declare_deps_in_callback		\
+	main/declare_deps_after_submission	\
+	main/declare_deps_after_submission_synchronous	\
+	main/get_current_task			\
+	main/starpu_init			\
+	main/starpu_worker_exists               \
 	datawizard/acquire_cb			\
 	datawizard/acquire_cb_insert		\
 	datawizard/acquire_release		\
@@ -128,6 +149,7 @@ noinst_PROGRAMS =				\
 	datawizard/sync_and_notify_data		\
 	datawizard/sync_and_notify_data_implicit\
 	datawizard/dsm_stress			\
+	datawizard/double_parameter		\
 	datawizard/write_only_tmp_buffer	\
 	datawizard/data_invalidation		\
 	datawizard/dining_philosophers		\
@@ -144,8 +166,26 @@ noinst_PROGRAMS =				\
 	datawizard/critical_section_with_void_interface\
 	datawizard/increment_redux		\
 	datawizard/increment_redux_v2		\
+	datawizard/increment_redux_lazy		\
 	datawizard/handle_to_pointer		\
 	datawizard/lazy_allocation		\
+	datawizard/interfaces/copy_interfaces	\
+	datawizard/interfaces/block/block_interface \
+	datawizard/interfaces/bcsr/bcsr_interface \
+	datawizard/interfaces/csr/csr_interface \
+	datawizard/interfaces/matrix/matrix_interface \
+	datawizard/interfaces/multiformat/multiformat_interface \
+	datawizard/interfaces/multiformat/advanced/multiformat_cuda_opencl \
+	datawizard/interfaces/multiformat/advanced/multiformat_data_release \
+	datawizard/interfaces/multiformat/advanced/multiformat_worker \
+	datawizard/interfaces/multiformat/advanced/multiformat_handle_conversion \
+	datawizard/interfaces/multiformat/advanced/same_handle \
+	datawizard/interfaces/variable/variable_interface    \
+	datawizard/interfaces/vector/test_vector_interface   \
+	datawizard/interfaces/void/void_interface \
+	datawizard/in_place_partition   	\
+	datawizard/partition_lazy		\
+	datawizard/gpu_register   		\
 	errorcheck/starpu_init_noworker		\
 	errorcheck/invalid_blocking_calls	\
 	errorcheck/invalid_tasks		\
@@ -165,7 +205,7 @@ noinst_PROGRAMS =				\
 	parallel_tasks/parallel_kernels		\
 	parallel_tasks/parallel_kernels_spmd	\
 	perfmodels/regression_based		\
-	perfmodels/non_linear_regression_based
+	perfmodels/non_linear_regression_based 
 
 if STARPU_HAVE_WINDOWS
 check_PROGRAMS = $(noinst_PROGRAMS)
@@ -236,6 +276,42 @@ datawizard_sync_and_notify_data_implicit_SOURCES +=	\
 	datawizard/sync_and_notify_data_opencl.c
 endif
 
+datawizard_in_place_partition_SOURCES =	\
+	datawizard/in_place_partition.c	\
+	datawizard/scal.c
+if STARPU_USE_CUDA
+datawizard_in_place_partition_SOURCES +=	\
+	datawizard/scal_cuda.cu
+endif
+if STARPU_USE_OPENCL
+datawizard_in_place_partition_SOURCES +=	\
+	datawizard/scal_opencl.cl
+endif
+
+datawizard_partition_lazy_SOURCES =	\
+	datawizard/partition_lazy.c	\
+	datawizard/scal.c
+if STARPU_USE_CUDA
+datawizard_partition_lazy_SOURCES +=	\
+	datawizard/scal_cuda.cu
+endif
+if STARPU_USE_OPENCL
+datawizard_partition_lazy_SOURCES +=	\
+	datawizard/scal_opencl.cl
+endif
+
+datawizard_gpu_register_SOURCES =	\
+	datawizard/gpu_register.c	\
+	datawizard/scal.c
+if STARPU_USE_CUDA
+datawizard_gpu_register_SOURCES +=	\
+	datawizard/scal_cuda.cu
+endif
+if STARPU_USE_OPENCL
+datawizard_gpu_register_SOURCES +=	\
+	datawizard/scal_opencl.cl
+endif
+
 if STARPU_USE_GORDON
 datawizard_sync_and_notify_data_SOURCES +=	\
 	datawizard/sync_and_notify_data_gordon_kernels.c
@@ -245,3 +321,167 @@ BUILT_SOURCES += 						\
 	datawizard/sync_and_notify_data_gordon_kernels.spuelf	\
 	microbenchs/null_kernel_gordon.spuelf
 endif
+
+###################
+# Block interface #
+###################
+datawizard_interfaces_block_block_interface_SOURCES= \
+	datawizard/interfaces/test_interfaces.c  \
+	datawizard/interfaces/block/block_interface.c
+
+if STARPU_USE_CUDA
+datawizard_interfaces_block_block_interface_SOURCES+= \
+	datawizard/interfaces/block/block_cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+datawizard_interfaces_block_block_interface_SOURCES+= \
+	datawizard/interfaces/block/block_opencl.c
+nobase_STARPU_OPENCL_DATA_DATA += \
+	datawizard/interfaces/block/block_opencl_kernel.cl
+endif
+
+##################
+# BCSR interface #
+##################
+datawizard_interfaces_bcsr_bcsr_interface_SOURCES= \
+	datawizard/interfaces/test_interfaces.c \
+	datawizard/interfaces/bcsr/bcsr_interface.c 
+
+if STARPU_USE_CUDA
+datawizard_interfaces_bcsr_bcsr_interface_SOURCES+= \
+	datawizard/interfaces/bcsr/bcsr_cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+datawizard_interfaces_bcsr_bcsr_interface_SOURCES+= \
+	datawizard/interfaces/bcsr/bcsr_opencl.c
+nobase_STARPU_OPENCL_DATA_DATA += \
+	datawizard/interfaces/bcsr/bcsr_opencl_kernel.cl
+endif
+
+#################
+# CSR interface #
+#################
+datawizard_interfaces_csr_csr_interface_SOURCES= \
+	datawizard/interfaces/test_interfaces.c  \
+	datawizard/interfaces/csr/csr_interface.c
+
+if STARPU_USE_CUDA
+datawizard_interfaces_csr_csr_interface_SOURCES+= \
+	datawizard/interfaces/csr/csr_cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+datawizard_interfaces_csr_csr_interface_SOURCES+= \
+	datawizard/interfaces/csr/csr_opencl.c
+nobase_STARPU_OPENCL_DATA_DATA += \
+	datawizard/interfaces/csr/csr_opencl_kernel.cl
+endif
+
+
+datawizard_interfaces_vector_test_vector_interface_SOURCES =               \
+	datawizard/interfaces/vector/test_vector_interface.c               \
+	datawizard/interfaces/test_interfaces.c
+
+if STARPU_USE_CUDA
+datawizard_interfaces_vector_test_vector_interface_SOURCES +=               \
+	datawizard/interfaces/vector/test_vector_cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+datawizard_interfaces_vector_test_vector_interface_SOURCES +=               \
+	datawizard/interfaces/vector/test_vector_opencl.c 
+nobase_STARPU_OPENCL_DATA_DATA += \
+	datawizard/interfaces/vector/test_vector_opencl_kernel.cl
+endif
+
+####################
+# Matrix interface #
+####################
+datawizard_interfaces_matrix_matrix_interface_SOURCES= \
+	datawizard/interfaces/test_interfaces.c        \
+	datawizard/interfaces/matrix/matrix_interface.c
+
+if STARPU_USE_CUDA
+datawizard_interfaces_matrix_matrix_interface_SOURCES+= \
+	datawizard/interfaces/matrix/matrix_cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+datawizard_interfaces_matrix_matrix_interface_SOURCES+= \
+	datawizard/interfaces/matrix/matrix_opencl.c
+nobase_STARPU_OPENCL_DATA_DATA+= \
+	datawizard/interfaces/matrix/matrix_opencl_kernel.cl
+endif
+
+
+#########################
+# Multiformat interface #
+#########################
+datawizard_interfaces_multiformat_multiformat_interface_SOURCES =           \
+	datawizard/interfaces/test_interfaces.c                             \
+	datawizard/interfaces/multiformat/multiformat_interface.c           \
+	datawizard/interfaces/multiformat/multiformat_conversion_codelets.c
+
+if STARPU_USE_CUDA
+datawizard_interfaces_multiformat_multiformat_interface_SOURCES+=                  \
+	datawizard/interfaces/multiformat/multiformat_cuda.cu                      \
+	datawizard/interfaces/multiformat/multiformat_conversion_codelets_cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+datawizard_interfaces_multiformat_multiformat_interface_SOURCES+=                  \
+	datawizard/interfaces/multiformat/multiformat_opencl.c                     \
+	datawizard/interfaces/multiformat/multiformat_conversion_codelets_opencl.c
+nobase_STARPU_OPENCL_DATA_DATA +=                                                          \
+	datawizard/interfaces/multiformat/multiformat_opencl_kernel.cl                     \
+	datawizard/interfaces/multiformat/multiformat_conversion_codelets_kernel.cl
+endif
+
+datawizard_interfaces_multiformat_advanced_multiformat_cuda_opencl_SOURCES=\
+	datawizard/interfaces/multiformat/advanced/generic.c               \
+	datawizard/interfaces/multiformat/advanced/multiformat_cuda_opencl.c
+
+datawizard_interfaces_multiformat_advanced_multiformat_data_release_SOURCES = \
+	datawizard/interfaces/multiformat/advanced/generic.c                  \
+	datawizard/interfaces/multiformat/advanced/multiformat_data_release.c
+
+datawizard_interfaces_multiformat_advanced_multiformat_worker_SOURCES=\
+	datawizard/interfaces/multiformat/advanced/generic.c               \
+	datawizard/interfaces/multiformat/advanced/multiformat_worker.c
+
+datawizard_interfaces_multiformat_advanced_multiformat_handle_conversion_SOURCES = \
+	datawizard/interfaces/multiformat/advanced/generic.c \
+	datawizard/interfaces/multiformat/advanced/multiformat_handle_conversion.c
+
+datawizard_interfaces_multiformat_advanced_same_handle_SOURCES= \
+	datawizard/interfaces/multiformat/advanced/generic.c               \
+	datawizard/interfaces/multiformat/advanced/same_handle.c
+
+
+datawizard_interfaces_variable_variable_interface_SOURCES=   \
+	datawizard/interfaces/test_interfaces.c              \
+	datawizard/interfaces/variable/variable_interface.c
+
+if STARPU_USE_CUDA
+datawizard_interfaces_variable_variable_interface_SOURCES+= \
+	datawizard/interfaces/variable/variable_cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+datawizard_interfaces_variable_variable_interface_SOURCES+= \
+	datawizard/interfaces/variable/variable_opencl.c
+nobase_STARPU_OPENCL_DATA_DATA += \
+	datawizard/interfaces/variable/variable_opencl_kernel.cl
+endif
+
+##################
+# Void interface #
+##################
+datawizard_interfaces_void_void_interface_SOURCES=\
+	datawizard/interfaces/test_interfaces.c        \
+	datawizard/interfaces/void/void_interface.c
+
+showcheck:
+	-cat $(TEST_LOGS) /dev/null

+ 23 - 3
tests/cholesky/prio.r

@@ -1,3 +1,20 @@
+
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
 sizelist <- seq(2048, 24576, 2048);
 schedlist <- c("greedy", "prio", "dm", "random");
 
@@ -15,7 +32,8 @@ parse <- function (size, sched)
 	filename = paste("timings_sched/sched", sched, size, sep=".");
 
 	if (file.exists(filename))
-	{	ret <- scan(paste("timings_sched/sched", sched, size, sep="."));
+	{
+		ret <- scan(paste("timings_sched/sched", sched, size, sep="."));
 		return(ret);
 	};
 
@@ -35,7 +53,8 @@ handle_sched <- function(sched)
 	gflopstab <- NULL;
 	sizetab <- NULL;
 
-	for (size in sizelist) {
+	for (size in sizelist)
+	{
 		list <- handle_size(size, sched);
 		gflopstab <- c(gflopstab, list);
 		sizetab <- c(sizetab, array(size, c(length(list))));
@@ -51,7 +70,8 @@ handle_sched_mean <- function(sched)
 	meantab <- NULL;
 	sizetab <- NULL;
 
-	for (size in sizelist) {
+	for (size in sizelist)
+	{
 		list <- mean(handle_size(size, sched));
 		meantab <- c(meantab, list);
 		sizetab <- c(sizetab, array(size, c(length(list))));

+ 23 - 3
tests/cholesky/sched.r

@@ -1,3 +1,20 @@
+
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
 sizelist <- seq(2048, 24576, 2048);
 schedlist <- c("greedy", "prio", "dm", "random");
 
@@ -15,7 +32,8 @@ parse <- function (size, sched)
 	filename = paste("timings_sched/sched", sched, size, sep=".");
 
 	if (file.exists(filename))
-	{	ret <- scan(paste("timings_sched/sched", sched, size, sep="."));
+	{
+		ret <- scan(paste("timings_sched/sched", sched, size, sep="."));
 		return(ret);
 	};
 
@@ -35,7 +53,8 @@ handle_sched <- function(sched)
 	gflopstab <- NULL;
 	sizetab <- NULL;
 
-	for (size in sizelist) {
+	for (size in sizelist)
+	{
 		list <- handle_size(size, sched);
 		gflopstab <- c(gflopstab, list);
 		sizetab <- c(sizetab, array(size, c(length(list))));
@@ -51,7 +70,8 @@ handle_sched_mean <- function(sched)
 	meantab <- NULL;
 	sizetab <- NULL;
 
-	for (size in sizelist) {
+	for (size in sizelist)
+	{
 		list <- mean(handle_size(size, sched));
 		meantab <- c(meantab, list);
 		sizetab <- c(sizetab, array(size, c(length(list))));

+ 0 - 65
tests/core/multithreaded_init.c

@@ -1,65 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010  Institut National de Recherche en Informatique et Automatique
- * Copyright (C) 2010-2011  Université de Bordeaux 1
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-#include <sys/time.h>
-#include <stdio.h>
-#include <pthread.h>
-#include <starpu.h>
-
-#define NUM_THREADS 5
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
-
-void *launch_starpu(void *id)
-{ 
-   starpu_init(NULL);
-   return NULL;
-}
-
-int main(int argc, char **argv)
-{ 
-  unsigned i;
-  double timing;
-  struct timeval start;
-  struct timeval end;
-
-  pthread_t threads[NUM_THREADS];
-  
-  gettimeofday(&start, NULL);
-
-  for (i = 0; i < NUM_THREADS; ++i)
-    {
-      int ret = pthread_create(&threads[i], NULL, launch_starpu, NULL);
-      STARPU_ASSERT(ret == 0);
-    }
-
-  for (i = 0; i < NUM_THREADS; ++i)
-    {
-      int ret = pthread_join(threads[i], NULL);
-      STARPU_ASSERT(ret == 0);
-    }
-
-  gettimeofday(&end, NULL);
-
-  timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-
-  FPRINTF(stderr, "Success : %d threads launching simultaneously starpu_init\n", NUM_THREADS);
-  FPRINTF(stderr, "Total: %f secs\n", timing/1000000);
-  FPRINTF(stderr, "Per task: %f usecs\n", timing/NUM_THREADS);
-
-  starpu_shutdown();
-
-  return 0;
-}

+ 0 - 121
tests/core/task_wait_api.c

@@ -1,121 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include <pthread.h>
-#include <stdio.h>
-#include <unistd.h>
-
-#include <starpu.h>
-
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
-
-static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attribute__ ((unused)))
-{
-}
-
-static starpu_codelet dummy_codelet =
-{
-	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
-	.cpu_func = dummy_func,
-	.cuda_func = dummy_func,
-	.opencl_func = dummy_func,
-        .model = NULL,
-	.nbuffers = 0
-};
-
-static struct starpu_task *create_dummy_task(void)
-{
-	struct starpu_task *task = starpu_task_create();
-
-	task->cl = &dummy_codelet;
-	task->cl_arg = NULL;
-	task->detach = 0;
-
-	return task;
-}
-
-int main(int argc, char **argv)
-{
-	starpu_init(NULL);
-
-	FPRINTF(stderr, "{ A } -> { B }\n");
-	fflush(stderr);
-
-	struct starpu_task *taskA, *taskB;
-
-	taskA = create_dummy_task();
-	taskB = create_dummy_task();
-
-	/* B depends on A */
-	starpu_task_declare_deps_array(taskB, 1, &taskA);
-
-	starpu_task_submit(taskB);
-	starpu_task_submit(taskA);
-
-	starpu_task_wait(taskB);
-
-	FPRINTF(stderr, "{ C, D, E, F } -> { G }\n");
-
-	struct starpu_task *taskC, *taskD, *taskE, *taskF, *taskG;
-
-	taskC = create_dummy_task();
-	taskD = create_dummy_task();
-	taskE = create_dummy_task();
-	taskF = create_dummy_task();
-	taskG = create_dummy_task();
-
-	struct starpu_task *tasksCDEF[4] = {taskC, taskD, taskE, taskF};
-	starpu_task_declare_deps_array(taskG, 4, tasksCDEF);
-
-	starpu_task_submit(taskC);
-	starpu_task_submit(taskD);
-	starpu_task_submit(taskG);
-	starpu_task_submit(taskE);
-	starpu_task_submit(taskF);
-
-	starpu_task_wait(taskG);
-
-	FPRINTF(stderr, "{ H, I } -> { J, K, L }\n");
-
-	struct starpu_task *taskH, *taskI, *taskJ, *taskK, *taskL;
-
-	taskH = create_dummy_task();
-	taskI = create_dummy_task();
-	taskJ = create_dummy_task();
-	taskK = create_dummy_task();
-	taskL = create_dummy_task();
-
-	struct starpu_task *tasksHI[2] = {taskH, taskI};
-
-	starpu_task_declare_deps_array(taskJ, 2, tasksHI);
-	starpu_task_declare_deps_array(taskK, 2, tasksHI);
-	starpu_task_declare_deps_array(taskL, 2, tasksHI);
-
-	starpu_task_submit(taskH);
-	starpu_task_submit(taskI);
-	starpu_task_submit(taskJ);
-	starpu_task_submit(taskK);
-	starpu_task_submit(taskL);
-
-	starpu_task_wait(taskJ);
-	starpu_task_wait(taskK);
-	starpu_task_wait(taskL);
-
-	starpu_shutdown();
-
-	return 0;
-}

+ 8 - 5
tests/datawizard/acquire_cb.c

@@ -15,11 +15,10 @@
  */
 
 #include <starpu.h>
-
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+#include "../helper.h"
 
 unsigned token = 0;
-starpu_data_handle token_handle;
+starpu_data_handle_t token_handle;
 
 void callback(void *arg __attribute__ ((unused)))
 {
@@ -29,7 +28,11 @@ void callback(void *arg __attribute__ ((unused)))
 
 int main(int argc, char **argv)
 {
-        starpu_init(NULL);
+	int ret;
+
+        ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	starpu_variable_data_register(&token_handle, 0, (uintptr_t)&token, sizeof(unsigned));
         starpu_data_acquire_cb(token_handle, STARPU_RW, callback, NULL);
@@ -41,5 +44,5 @@ int main(int argc, char **argv)
 
 	starpu_shutdown();
 
-	return 0;
+	return EXIT_SUCCESS;
 }

+ 43 - 25
tests/datawizard/acquire_cb_insert.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -14,30 +14,38 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <config.h>
 #include <starpu.h>
+#include "../helper.h"
+
+#warning memory leak
 
 #define N 16
 #define M 4
 #define X 2
 
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
-
 void which_index_cpu(void *descr[], void *_args)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	/* A real case would actually compute something */
 	*x0 = X;
 }
 
-starpu_codelet which_index = {
+struct starpu_codelet which_index =
+{
 	.where = STARPU_CPU,
-	.cpu_func = which_index_cpu,
-        .nbuffers = 1
+	.cpu_funcs = {which_index_cpu, NULL},
+        .nbuffers = 1,
+	.modes = {STARPU_W}
 };
 
 void work_cpu(void *descr[], void *_args)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	int i, n = STARPU_VECTOR_GET_NX(descr[0]);
 	float *x0 = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 
@@ -45,16 +53,19 @@ void work_cpu(void *descr[], void *_args)
 		x0[i] = i + 1;
 }
 
-starpu_codelet work = {
+struct starpu_codelet work =
+{
 	.where = STARPU_CPU,
-	.cpu_func = work_cpu,
-        .nbuffers = 1
+	.cpu_funcs = {work_cpu, NULL},
+        .nbuffers = 1,
+	.modes = {STARPU_W}
 };
 
 static int x;
-static starpu_data_handle x_handle, f_handle;
+static starpu_data_handle_t x_handle, f_handle;
 
-void callback(void *arg) {
+void callback(void *arg)
+{
 	starpu_insert_task(&work, STARPU_W, starpu_data_get_sub_data(f_handle, 1, x), 0);
 	starpu_data_release(x_handle);
 }
@@ -64,18 +75,22 @@ int main(int argc, char **argv)
         int i, ret;
 	float *f;
 
-	starpu_init(NULL);
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	/* Declare x */
 	starpu_variable_data_register(&x_handle, 0, (uintptr_t)&x, sizeof(x));
 
 	/* Allocate and Declare f */
-	starpu_malloc((void**)&f, N * sizeof(*f));
+	ret = starpu_malloc((void**)&f, N * sizeof(*f));
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc");
 	memset(f, 0, N * sizeof(*f));
 	starpu_vector_data_register(&f_handle, 0, (uintptr_t)f, N, sizeof(*f));
 
 	/* Partition f */
-	struct starpu_data_filter filter = {
+	struct starpu_data_filter filter =
+	{
 		.filter_func = starpu_block_filter_func_vector,
 		.nchildren = M,
 	};
@@ -84,6 +99,7 @@ int main(int argc, char **argv)
 	/* Compute which portion we will work on */
         ret = starpu_insert_task(&which_index, STARPU_W, x_handle, 0);
 	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 
 	/* And submit the corresponding task */
 #ifdef __GCC__
@@ -96,30 +112,32 @@ int main(int argc, char **argv)
 	starpu_data_acquire_cb(x_handle, STARPU_W, callback, NULL);
 #endif
 
-	starpu_task_wait_for_all();
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
 	starpu_data_unpartition(f_handle, 0);
 	starpu_data_unregister(f_handle);
 	starpu_data_unregister(x_handle);
 
         FPRINTF(stderr, "VALUES: %d", x);
-
-        for(i=0 ; i<N ; i++) {
+        for(i=0 ; i<N ; i++)
+	{
 		FPRINTF(stderr, " %f", f[i]);
         }
-
-	STARPU_ASSERT(f[X*(N/M)] == 1);
-	STARPU_ASSERT(f[X*(N/M)+1] == 2);
-	STARPU_ASSERT(f[X*(N/M)+2] == 3);
-	STARPU_ASSERT(f[X*(N/M)+3] == 4);
-
 	FPRINTF(stderr, "\n");
 
+	ret = EXIT_SUCCESS;
+	if (f[X*(N/M)] != 1 || f[X*(N/M)+1] != 2 ||
+	    f[X*(N/M)+2] != 3 || f[X*(N/M)+3] != 4)
+		ret = EXIT_FAILURE;
+
+	starpu_free(f);
 	starpu_shutdown();
-	return 0;
+	STARPU_RETURN(ret);
 
 enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
-	return 77;
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
 }

+ 46 - 18
tests/datawizard/acquire_release.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,11 +15,15 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <config.h>
 #include <starpu.h>
+#include "../helper.h"
 
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
-
+#ifdef STARPU_SLOW_MACHINE
+static unsigned ntasks = 10;
+#else
 static unsigned ntasks = 10000;
+#endif
 
 #ifdef STARPU_USE_CUDA
 extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
@@ -27,30 +31,35 @@ extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
 
 void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	(*tokenptr)++;
 }
 
-static starpu_codelet increment_cl = {
+static struct starpu_codelet increment_cl =
+{
+	.modes = { STARPU_RW },
         .where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = increment_cpu,
+	.cpu_funcs = {increment_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = increment_cuda,
+	.cuda_funcs = {increment_cuda, NULL},
 #endif
 	.nbuffers = 1
 };
 
 unsigned token = 0;
-starpu_data_handle token_handle;
+starpu_data_handle_t token_handle;
 
-void increment_token()
+int increment_token()
 {
+	int ret;
 	struct starpu_task *task = starpu_task_create();
         task->synchronous = 1;
 	task->cl = &increment_cl;
-	task->buffers[0].handle = token_handle;
-	task->buffers[0].mode = STARPU_RW;
-	starpu_task_submit(task);
+	task->handles[0] = token_handle;
+	ret = starpu_task_submit(task);
+	return ret;
 }
 
 void callback(void *arg __attribute__ ((unused)))
@@ -61,8 +70,12 @@ void callback(void *arg __attribute__ ((unused)))
 int main(int argc, char **argv)
 {
 	int i;
+	int ret;
+
+        ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-        starpu_init(NULL);
 	starpu_variable_data_register(&token_handle, 0, (uintptr_t)&token, sizeof(unsigned));
 
         FPRINTF(stderr, "Token: %u\n", token);
@@ -70,21 +83,36 @@ int main(int argc, char **argv)
 	for(i=0; i<ntasks; i++)
 	{
 		/* synchronize data in RAM */
-                starpu_data_acquire(token_handle, STARPU_R);
+                ret = starpu_data_acquire(token_handle, STARPU_R);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+
                 token ++;
                 starpu_data_release(token_handle);
 
-                increment_token();
+                ret = increment_token();
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
-                starpu_data_acquire_cb(token_handle, STARPU_RW, callback, NULL);
+                ret = starpu_data_acquire_cb(token_handle, STARPU_RW, callback, NULL);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire_cb");
 	}
 
 	starpu_data_unregister(token_handle);
 
+	starpu_shutdown();
+
         FPRINTF(stderr, "Token: %u\n", token);
-        STARPU_ASSERT(token==ntasks*2);
+	if (token == ntasks * 2)
+		ret = EXIT_SUCCESS;
+	else
+		ret = EXIT_FAILURE;
+	STARPU_RETURN(ret);
 
+enodev:
+	starpu_data_unregister(token_handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
 	starpu_shutdown();
-
-	return 0;
+	return STARPU_TEST_SKIPPED;
 }

+ 42 - 16
tests/datawizard/acquire_release2.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -14,9 +14,11 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <config.h>
 #include <starpu.h>
+#include "../helper.h"
 
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+#warning memory leak
 
 static unsigned ntasks = 40000;
 
@@ -26,30 +28,33 @@ extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
 
 void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	(*tokenptr)++;
 }
 
-static starpu_codelet increment_cl = {
+static struct starpu_codelet increment_cl =
+{
+	.modes = { STARPU_RW },
         .where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = increment_cpu,
+	.cpu_funcs = {increment_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = increment_cuda,
+	.cuda_funcs = {increment_cuda, NULL},
 #endif
 	.nbuffers = 1
 };
 
 unsigned token = 0;
-starpu_data_handle token_handle;
+starpu_data_handle_t token_handle;
 
-void increment_token(int synchronous)
+int increment_token(int synchronous)
 {
 	struct starpu_task *task = starpu_task_create();
         task->synchronous = synchronous;
 	task->cl = &increment_cl;
-	task->buffers[0].handle = token_handle;
-	task->buffers[0].mode = STARPU_RW;
-	starpu_task_submit(task);
+	task->handles[0] = token_handle;
+	return starpu_task_submit(task);
 }
 
 void callback(void *arg __attribute__ ((unused)))
@@ -62,8 +67,12 @@ void callback(void *arg __attribute__ ((unused)))
 int main(int argc, char **argv)
 {
 	int i;
+	int ret;
+
+        ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-        starpu_init(NULL);
 	starpu_variable_data_register(&token_handle, 0, (uintptr_t)&token, sizeof(unsigned));
 
         FPRINTF(stderr, "Token: %u\n", token);
@@ -74,16 +83,33 @@ int main(int argc, char **argv)
 
 	for(i=0; i<ntasks; i++)
 	{
-                starpu_data_acquire_cb(token_handle, STARPU_W, callback, NULL);  // recv
-                increment_token(0);
+                ret = starpu_data_acquire_cb(token_handle, STARPU_W, callback, NULL);  // recv
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire_cb");
+
+                ret = increment_token(0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
                 starpu_data_acquire_cb(token_handle, STARPU_R, callback, NULL);  // send
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire_cb");
 	}
 
 	starpu_data_unregister(token_handle);
-        FPRINTF(stderr, "Token: %u\n", token);
-        assert(token==ntasks);
 
 	starpu_shutdown();
 
-	return 0;
+        FPRINTF(stderr, "Token: %u\n", token);
+	if (token == ntasks)
+		ret = EXIT_SUCCESS;
+	else
+		ret = EXIT_FAILURE;
+	STARPU_RETURN(ret);
+
+enodev:
+	starpu_data_unregister(token_handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
 }

+ 34 - 30
tests/datawizard/copy.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,8 +16,7 @@
  */
 
 #include <starpu.h>
-
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+#include "../helper.h"
 
 static unsigned nloops = 1000;
 
@@ -25,37 +24,41 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 {
 }
 
-static starpu_codelet cpu_codelet =
+static struct starpu_codelet cpu_codelet =
 {
         .where = STARPU_CPU,
-        .cpu_func = dummy_func,
+        .cpu_funcs = {dummy_func, NULL},
         .model = NULL,
-        .nbuffers = 1
+        .nbuffers = 1,
+	.modes = {STARPU_RW}
 };
 
-static starpu_codelet gpu_codelet =
+static struct starpu_codelet gpu_codelet =
 {
         .where = STARPU_CUDA|STARPU_OPENCL,
-        .cuda_func = dummy_func,
-        .opencl_func = dummy_func,
+        .cuda_funcs = {dummy_func, NULL},
+        .opencl_funcs = {dummy_func, NULL},
         .model = NULL,
-        .nbuffers = 1
+        .nbuffers = 1,
+	.modes = {STARPU_RW}
 };
 
 
 int main(int argc, char **argv)
 {
         float foo;
-	starpu_data_handle float_array_handle;
-        int i;
+	starpu_data_handle_t float_array_handle;
+        int i, ret;
 
-        starpu_init(NULL);
+        ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	if (starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) == 0 && starpu_worker_get_count_by_type(STARPU_OPENCL_WORKER) == 0)
 	{
 		FPRINTF(stderr, "This application requires a CUDA or OpenCL Worker\n");
 		starpu_shutdown();
-		return 77;
+		return STARPU_TEST_SKIPPED;
 	}
 
         foo = 0.0f;
@@ -71,32 +74,33 @@ int main(int argc, char **argv)
 
 		task_cpu->cl = &cpu_codelet;
 		task_cpu->callback_func = NULL;
-		task_cpu->buffers[0].handle = float_array_handle;
-		task_cpu->buffers[0].mode = STARPU_RW;
+		task_cpu->handles[0] = float_array_handle;
 
 		task_gpu->cl = &gpu_codelet;
 		task_gpu->callback_func = NULL;
-		task_gpu->buffers[0].handle = float_array_handle;
-		task_gpu->buffers[0].mode = STARPU_RW;
+		task_gpu->handles[0] = float_array_handle;
 
 		ret = starpu_task_submit(task_cpu);
-		if (STARPU_UNLIKELY(ret == -ENODEV))
-		{
-			FPRINTF(stderr, "No worker may execute this task\n");
-			exit(0);
-		}
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 		ret = starpu_task_submit(task_gpu);
-		if (STARPU_UNLIKELY(ret == -ENODEV))
-		{
-			FPRINTF(stderr, "No worker may execute this task\n");
-			exit(0);
-		}
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
         }
 
-	starpu_task_wait_for_all();
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
 	starpu_data_unregister(float_array_handle);
         starpu_shutdown();
 
-        return 0;
+        return EXIT_SUCCESS;
+
+enodev:
+	starpu_data_unregister(float_array_handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
 }

+ 24 - 15
tests/datawizard/critical_section_with_void_interface.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -14,29 +15,35 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <config.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <errno.h>
 #include <starpu.h>
 #include <stdlib.h>
+#include "../helper.h"
 
-starpu_data_handle void_handle;
+starpu_data_handle_t void_handle;
 
 int critical_var;
 
 static void critical_section(void *descr[], __attribute__ ((unused)) void *_args)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	/* We do not protect this variable because it is only accessed when the
 	 * "void_handle" piece of data is accessed. */
 	critical_var++;
 }
 
-static starpu_codelet cl = {
+static struct starpu_codelet cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
-	.cpu_func = critical_section,
-	.cuda_func = critical_section,
-	.opencl_func = critical_section,
-	.nbuffers = 1
+	.cpu_funcs = {critical_section, NULL},
+	.cuda_funcs = {critical_section, NULL},
+	.opencl_funcs = {critical_section, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
 };
 
 int main(int argc, char **argv)
@@ -48,7 +55,9 @@ int main(int argc, char **argv)
 	ntasks /= 10;
 #endif
 
-	starpu_init(NULL);
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	critical_var = 0;
 
@@ -59,13 +68,12 @@ int main(int argc, char **argv)
 	for (i = 0; i < ntasks; i++)
 	{
 		struct starpu_task *task = starpu_task_create();
-			task->cl = &cl;
-			task->buffers[0].handle = void_handle;
-			task->buffers[0].mode = STARPU_RW;
-	
+		task->cl = &cl;
+		task->handles[0] = void_handle;
+
 		ret = starpu_task_submit(task);
-		if (ret == -ENODEV)
-			goto enodev;
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 
 	starpu_data_unregister(void_handle);
@@ -74,11 +82,12 @@ int main(int argc, char **argv)
 
 	starpu_shutdown();
 
-	return 0;
+	return EXIT_SUCCESS;
 
 enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
-	return 77;
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
 }

+ 66 - 32
tests/datawizard/data_implicit_deps.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,61 +15,78 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <config.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <errno.h>
 #include <starpu.h>
 #include <stdlib.h>
+#include "../helper.h"
 
 #define VECTORSIZE	1024
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 
 static unsigned *A, *B, *C, *D;
-starpu_data_handle A_handle, B_handle, C_handle, D_handle;
+starpu_data_handle_t A_handle, B_handle, C_handle, D_handle;
 
 static unsigned var = 0;
 
 static void f(void *descr[], __attribute__ ((unused)) void *_args)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	usleep(200000);
 }
 
-static starpu_codelet cl_f = {
+static struct starpu_codelet cl_f =
+{
+	.modes = { STARPU_R, STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = f,
-	.cuda_func = f,
+	.cpu_funcs = {f, NULL},
+	.cuda_funcs = {f, NULL},
 	.nbuffers = 2
 };
 
 static void g(void *descr[], __attribute__ ((unused)) void *_args)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	usleep(100000);
 	var = 42;
 }
 
-static starpu_codelet cl_g = {
+static struct starpu_codelet cl_g =
+{
+	.modes = { STARPU_R, STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = g,
-	.cuda_func = g,
+	.cpu_funcs = {g, NULL},
+	.cuda_funcs = {g, NULL},
 	.nbuffers = 2
 };
 
 static void h(void *descr[], __attribute__ ((unused)) void *_args)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	FPRINTF(stderr, "VAR %u (should be 42)\n", var);
 	STARPU_ASSERT(var == 42);
 }
 
-static starpu_codelet cl_h = {
+static struct starpu_codelet cl_h =
+{
+	.modes = { STARPU_R, STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = h,
-	.cuda_func = h,
+	.cpu_funcs = {h, NULL},
+	.cuda_funcs = {h, NULL},
 	.nbuffers = 2
 };
 
 int main(int argc, char **argv)
 {
-	starpu_init(NULL);
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	A = (unsigned *) malloc(VECTORSIZE*sizeof(unsigned));
 	B = (unsigned *) malloc(VECTORSIZE*sizeof(unsigned));
@@ -81,12 +98,12 @@ int main(int argc, char **argv)
 	starpu_vector_data_register(&C_handle, 0, (uintptr_t)C, VECTORSIZE, sizeof(unsigned));
 	starpu_vector_data_register(&D_handle, 0, (uintptr_t)D, VECTORSIZE, sizeof(unsigned));
 
-	#if 0
+#if 0
 	starpu_data_set_sequential_consistency_flag(A_handle, 0);
 	starpu_data_set_sequential_consistency_flag(B_handle, 0);
 	starpu_data_set_sequential_consistency_flag(C_handle, 0);
 	starpu_data_set_sequential_consistency_flag(D_handle, 0);
-	#endif
+#endif
 
 	/* 	f(Ar, Brw): sleep 
 	 *	g(Br; Crw); sleep, var = 42
@@ -94,29 +111,35 @@ int main(int argc, char **argv)
 	 */
 	struct starpu_task *task_f = starpu_task_create();
 	task_f->cl = &cl_f;
-	task_f->buffers[0].handle = A_handle;
-	task_f->buffers[0].mode = STARPU_R;
-	task_f->buffers[1].handle = B_handle;
-	task_f->buffers[1].mode = STARPU_RW;
-	starpu_task_submit(task_f);
+	task_f->handles[0] = A_handle;
+	task_f->handles[1] = B_handle;
+	ret = starpu_task_submit(task_f);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 	struct starpu_task *task_g = starpu_task_create();
 	task_g->cl = &cl_g;
-	task_g->buffers[0].handle = B_handle;
-	task_g->buffers[0].mode = STARPU_R;
-	task_g->buffers[1].handle = C_handle;
-	task_g->buffers[1].mode = STARPU_RW;
-	starpu_task_submit(task_g);
+	task_g->handles[0] = B_handle;
+	task_g->handles[1] = C_handle;
+	ret = starpu_task_submit(task_g);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 	struct starpu_task *task_h = starpu_task_create();
 	task_h->cl = &cl_h;
-	task_h->buffers[0].handle = C_handle;
-	task_h->buffers[0].mode = STARPU_R;
-	task_h->buffers[1].handle = D_handle;
-	task_h->buffers[1].mode = STARPU_RW;
-	starpu_task_submit(task_h);
+	task_h->handles[0] = C_handle;
+	task_h->handles[1] = D_handle;
+	ret = starpu_task_submit(task_h);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
-	starpu_task_wait_for_all();
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+
+	starpu_data_unregister(A_handle);
+	starpu_data_unregister(B_handle);
+	starpu_data_unregister(C_handle);
+	starpu_data_unregister(D_handle);
 
 	free(A);
 	free(B);
@@ -125,5 +148,16 @@ int main(int argc, char **argv)
 
 	starpu_shutdown();
 
-	return 0;
+	return EXIT_SUCCESS;
+
+enodev:
+	free(A);
+	free(B);
+	free(C);
+	free(D);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
 }

+ 46 - 30
tests/datawizard/data_invalidation.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -14,18 +15,23 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <config.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <errno.h>
 #include <starpu.h>
 #include <starpu_cuda.h>
 #include <stdlib.h>
+#include "../helper.h"
 
+#ifdef STARPU_SLOW_MACHINE
+#define NLOOPS		100
+#else
 #define NLOOPS		1000
+#endif
 #define VECTORSIZE	1024
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 
-static starpu_data_handle v_handle;
+static starpu_data_handle_t v_handle;
 
 /*
  *	Memset
@@ -34,6 +40,8 @@ static starpu_data_handle v_handle;
 #ifdef STARPU_USE_CUDA
 static void cuda_memset_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned length = STARPU_VECTOR_GET_NX(descr[0]);
 
@@ -44,19 +52,23 @@ static void cuda_memset_codelet(void *descr[], __attribute__ ((unused)) void *_a
 
 static void cpu_memset_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned length = STARPU_VECTOR_GET_NX(descr[0]);
 
 	memset(buf, 42, length);
 }
 
-static starpu_codelet memset_cl = {
+static struct starpu_codelet memset_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = cpu_memset_codelet,
+	.cpu_funcs = {cpu_memset_codelet, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = cuda_memset_codelet,
+	.cuda_funcs = {cuda_memset_codelet, NULL},
 #endif
-	.nbuffers = 1
+	.nbuffers = 1,
+	.modes = {STARPU_W}
 };
 
 /*
@@ -65,6 +77,8 @@ static starpu_codelet memset_cl = {
 
 static void cpu_check_content_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned length = STARPU_VECTOR_GET_NX(descr[0]);
 
@@ -79,10 +93,12 @@ static void cpu_check_content_codelet(void *descr[], __attribute__ ((unused)) vo
 	}
 }
 
-static starpu_codelet check_content_cl = {
+static struct starpu_codelet check_content_cl =
+{
 	.where = STARPU_CPU,
-	.cpu_func = cpu_check_content_codelet,
-	.nbuffers = 1
+	.cpu_funcs = {cpu_check_content_codelet, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_R}
 };
 
 
@@ -90,7 +106,9 @@ int main(int argc, char **argv)
 {
 	int ret;
 
-	starpu_init(NULL);
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	/* The buffer should never be explicitly allocated */
 	starpu_vector_data_register(&v_handle, (uint32_t)-1, (uintptr_t)NULL, VECTORSIZE, sizeof(char));
@@ -103,31 +121,27 @@ int main(int argc, char **argv)
 
 		memset_task = starpu_task_create();
 		memset_task->cl = &memset_cl;
-		memset_task->buffers[0].handle = v_handle;
-		memset_task->buffers[0].mode = STARPU_W;
+		memset_task->handles[0] = v_handle;
 		memset_task->detach = 0;
-	
+
 		ret = starpu_task_submit(memset_task);
-		if (ret == -ENODEV)
-				goto enodev;
-	
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
 		ret = starpu_task_wait(memset_task);
-		if (ret)
-			exit(-1);
-		
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait");
+
 		check_content_task = starpu_task_create();
 		check_content_task->cl = &check_content_cl;
-		check_content_task->buffers[0].handle = v_handle;
-		check_content_task->buffers[0].mode = STARPU_R;
+		check_content_task->handles[0] = v_handle;
 		check_content_task->detach = 0;
-	
+
 		ret = starpu_task_submit(check_content_task);
-		if (ret == -ENODEV)
-				goto enodev;
-	
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
 		ret = starpu_task_wait(check_content_task);
-		if (ret)
-			exit(-1);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait");
 
 		starpu_data_invalidate(v_handle);
 	}
@@ -137,11 +151,13 @@ int main(int argc, char **argv)
 
 	starpu_shutdown();
 
-	return 0;
+	return EXIT_SUCCESS;
 
 enodev:
+	starpu_data_unregister(v_handle);
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
-	return 77;
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
 }

+ 48 - 39
tests/datawizard/data_lookup.c

@@ -20,6 +20,7 @@
 #include <starpu.h>
 #include <stdlib.h>
 #include <sys/types.h>
+#include "../helper.h"
 
 static void task(void **buffers, void *args)
 {
@@ -27,17 +28,19 @@ static void task(void **buffers, void *args)
 	size_t size, i;
 
 	numbers = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
-	starpu_unpack_cl_args (args, &size);
+	starpu_codelet_unpack_args (args, &size);
 	for(i = 0; i < size; i++)
 	{
 		numbers[i] = i;
 	}
 }
 
-static starpu_codelet cl = {
+static struct starpu_codelet cl =
+{
 	.where = STARPU_CPU,
-	.cpu_func = task,
-	.nbuffers = 1
+	.cpu_funcs = {task, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_W}
 };
 
 static int test_lazy_allocation()
@@ -46,7 +49,7 @@ static int test_lazy_allocation()
 
 	size_t i;
 	void *pointer;
-	starpu_data_handle handle;
+	starpu_data_handle_t handle;
 	int ret;
 
 	/* Lazily-allocated vector.  */
@@ -58,28 +61,31 @@ static int test_lazy_allocation()
 				 STARPU_VALUE, &count, sizeof(size_t),
 				 0);
 	if (ret == -ENODEV) return ret;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+
 	/* yes, we do not perform the computation but we did detect that no one
 	 * could perform the kernel, so this is not an error from StarPU */
 
 	/* Acquire the handle, forcing a local allocation.  */
-	starpu_data_acquire(handle, STARPU_R);
+	ret = starpu_data_acquire(handle, STARPU_R);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
 
 	/* Make sure we have a local pointer to it.  */
 	pointer = starpu_handle_get_local_ptr(handle);
-	assert(pointer != NULL);
+	STARPU_ASSERT(pointer != NULL);
 	for(i = 0; i < count; i++)
 	{
 		float *numbers = (float *)pointer;
-		assert(numbers[i] == i);
+		STARPU_ASSERT(numbers[i] == i);
 	}
 
 	/* Make sure the pointer/handle mapping is up-to-date.  */
-	assert(starpu_data_lookup(pointer) == handle);
+	STARPU_ASSERT(starpu_data_lookup(pointer) == handle);
 
 	starpu_data_release(handle);
 	starpu_data_unregister(handle);
 
-	assert(starpu_data_lookup(pointer) == NULL);
+	STARPU_ASSERT(starpu_data_lookup(pointer) == NULL);
 	return 0;
 }
 
@@ -91,12 +97,12 @@ static int test_lazy_allocation()
 static void test_filters()
 {
 #define CHILDREN_COUNT 10
-	int err, i;
+	int ret, i;
 	int *ptr, *children_pointers[CHILDREN_COUNT];
-	starpu_data_handle handle;
+	starpu_data_handle_t handle;
 
-	err = starpu_malloc((void**)&ptr, VECTOR_SIZE * sizeof(*ptr));
-	assert(err == 0);
+	ret = starpu_malloc((void**)&ptr, VECTOR_SIZE * sizeof(*ptr));
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc");
 
 	starpu_vector_data_register(&handle, 0, (uintptr_t)ptr,
 				    VECTOR_SIZE, sizeof(*ptr));
@@ -107,18 +113,18 @@ static void test_filters()
 		.nchildren = CHILDREN_COUNT
 	};
 	starpu_data_partition(handle, &f);
-	assert(starpu_data_get_nb_children(handle) == CHILDREN_COUNT);
+	STARPU_ASSERT(starpu_data_get_nb_children(handle) == CHILDREN_COUNT);
 
 	for (i = 0; i < CHILDREN_COUNT; i++)
 	{
-                starpu_data_handle child;
+                starpu_data_handle_t child;
 
 		child = starpu_data_get_sub_data(handle, 1, i);
 		children_pointers[i] = (int *) starpu_handle_get_local_ptr(child);
-		assert(children_pointers[i] != NULL);
+		STARPU_ASSERT(children_pointers[i] != NULL);
 
 		/* Make sure we have a pointer -> handle mapping for CHILD.  */
-		assert(starpu_data_lookup(children_pointers[i]) == child);
+		STARPU_ASSERT(starpu_data_lookup(children_pointers[i]) == child);
 	}
 
 	starpu_data_unpartition(handle, 0);
@@ -127,11 +133,11 @@ static void test_filters()
 	{
 		if (children_pointers[i] != ptr)
 			/* Make sure the pointer -> handle mapping is gone.  */
-			assert(starpu_data_lookup(children_pointers[i]) == NULL);
+			STARPU_ASSERT(starpu_data_lookup(children_pointers[i]) == NULL);
 	}
 
 	/* Make sure the parent's mapping is back.  */
-	assert(starpu_data_lookup(ptr) == handle);
+	STARPU_ASSERT(starpu_data_lookup(ptr) == handle);
 
 	starpu_data_unregister(handle);
 	starpu_free(ptr);
@@ -141,20 +147,22 @@ static void test_filters()
 
 int main(int argc, char *argv[])
 {
-	int err;
+	int ret;
 	size_t i;
 	void *vectors[VECTOR_COUNT], *variables[VARIABLE_COUNT];
-	starpu_data_handle vector_handles[VECTOR_COUNT];
-	starpu_data_handle variable_handles[VARIABLE_COUNT];
+	starpu_data_handle_t vector_handles[VECTOR_COUNT];
+	starpu_data_handle_t variable_handles[VARIABLE_COUNT];
 
-	starpu_init(NULL);
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	/* Register data regions.  */
 
 	for(i = 0; i < VARIABLE_COUNT; i++)
 	{
-		err = starpu_malloc(&variables[i], sizeof(float));
-		assert(err == 0);
+		ret = starpu_malloc(&variables[i], sizeof(float));
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc");
 		starpu_variable_data_register(&variable_handles[i], 0,
 					      (uintptr_t)variables[i],
 					      sizeof(float));
@@ -162,8 +170,8 @@ int main(int argc, char *argv[])
 
 	for(i = 0; i < VECTOR_COUNT; i++)
 	{
-		err = starpu_malloc(&vectors[i], VECTOR_SIZE * sizeof(float));
-		assert(err == 0);
+		ret = starpu_malloc(&vectors[i], VECTOR_SIZE * sizeof(float));
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc");
 		starpu_vector_data_register(&vector_handles[i], 0,
 					    (uintptr_t)vectors[i],
 					    VECTOR_SIZE, sizeof(float));
@@ -173,18 +181,18 @@ int main(int argc, char *argv[])
 
 	for(i = 0; i < VARIABLE_COUNT; i++)
 	{
-		starpu_data_handle handle;
+		starpu_data_handle_t handle;
 
 		handle = starpu_data_lookup(variables[i]);
-		assert(handle == variable_handles[i]);
+		STARPU_ASSERT(handle == variable_handles[i]);
 	}
 
 	for(i = 0; i < VECTOR_COUNT; i++)
 	{
-		starpu_data_handle handle;
+		starpu_data_handle_t handle;
 
 		handle = starpu_data_lookup(vectors[i]);
-		assert(handle == vector_handles[i]);
+		STARPU_ASSERT(handle == vector_handles[i]);
 	}
 
 	/* Unregister them.  */
@@ -203,24 +211,24 @@ int main(int argc, char *argv[])
 
 	for(i = 0; i < VARIABLE_COUNT; i++)
 	{
-		starpu_data_handle handle;
+		starpu_data_handle_t handle;
 
 		handle = starpu_data_lookup(variables[i]);
-		assert(handle == NULL);
+		STARPU_ASSERT(handle == NULL);
 		starpu_free(variables[i]);
 	}
 
 	for(i = 0; i < VECTOR_COUNT; i++)
 	{
-		starpu_data_handle handle;
+		starpu_data_handle_t handle;
 
 		handle = starpu_data_lookup(vectors[i]);
-		assert(handle == NULL);
+		STARPU_ASSERT(handle == NULL);
 		starpu_free(vectors[i]);
 	}
 
-	err = test_lazy_allocation();
-	if (err == -ENODEV) goto enodev;
+	ret = test_lazy_allocation();
+	if (ret == -ENODEV) goto enodev;
 	test_filters();
 
 	starpu_shutdown();
@@ -231,5 +239,6 @@ enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
-	return 77;
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
 }

+ 35 - 18
tests/datawizard/dining_philosophers.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,28 +16,29 @@
  */
 
 #include <starpu.h>
+#include "../helper.h"
 
 /* number of philosophers */
 #define N	16
 
-starpu_data_handle fork_handles[N];
+starpu_data_handle_t fork_handles[N];
 unsigned forks[N];
 
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
-
 static void eat_kernel(void *descr[], void *arg)
 {
 }
 
-static starpu_codelet eating_cl = {
+static struct starpu_codelet eating_cl =
+{
+	.modes = { STARPU_RW, STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
-	.cuda_func = eat_kernel,
-	.cpu_func = eat_kernel,
-        .opencl_func = eat_kernel,
+	.cuda_funcs = {eat_kernel, NULL},
+	.cpu_funcs = {eat_kernel, NULL},
+        .opencl_funcs = {eat_kernel, NULL},
 	.nbuffers = 2
 };
 
-void submit_one_task(unsigned p)
+int submit_one_task(unsigned p)
 {
 	struct starpu_task *task = starpu_task_create();
 
@@ -46,18 +47,20 @@ void submit_one_task(unsigned p)
 	unsigned left = p;
 	unsigned right = (p+1)%N;
 
-	task->buffers[0].handle = fork_handles[left];
-	task->buffers[0].mode = STARPU_RW;
-	task->buffers[1].handle = fork_handles[right];
-	task->buffers[1].mode = STARPU_RW;
+	task->handles[0] = fork_handles[left];
+	task->handles[1] = fork_handles[right];
 
 	int ret = starpu_task_submit(task);
-	STARPU_ASSERT(!ret);
+	return ret;
 }
 
 int main(int argc, char **argv)
 {
-	starpu_init(NULL);
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	/* initialize the forks */
 	unsigned f;
@@ -75,10 +78,13 @@ int main(int argc, char **argv)
 	{
 		/* select one philosopher randomly */
 		unsigned philosopher = rand() % N;
-		submit_one_task(philosopher);
+		ret = submit_one_task(philosopher);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 
-	starpu_task_wait_for_all();
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
 
 	FPRINTF(stderr, "waiting done\n");
 	for (f = 0; f < N; f++)
@@ -88,5 +94,16 @@ int main(int argc, char **argv)
 
 	starpu_shutdown();
 
-	return 0;
+	return EXIT_SUCCESS;
+
+enodev:
+	for (f = 0; f < N; f++)
+	{
+		starpu_data_unregister(fork_handles[f]);
+	}
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
 }

+ 174 - 0
tests/datawizard/double_parameter.c

@@ -0,0 +1,174 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "../helper.h"
+
+static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attribute__ ((unused)))
+{
+}
+
+static struct starpu_codelet codelet_R_R =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_R, STARPU_R}
+};
+
+static struct starpu_codelet codelet_R_W =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+static struct starpu_codelet codelet_R_RW =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW}
+};
+
+static struct starpu_codelet codelet_W_R =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_W, STARPU_R}
+};
+
+static struct starpu_codelet codelet_W_W =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_W, STARPU_W}
+};
+
+static struct starpu_codelet codelet_W_RW =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_W, STARPU_RW}
+};
+
+static struct starpu_codelet codelet_RW_R =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
+};
+
+static struct starpu_codelet codelet_RW_W =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_W}
+};
+
+static struct starpu_codelet codelet_RW_RW =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW}
+};
+
+int main(int argc, char **argv)
+{
+	float foo = 0.0f;
+	starpu_data_handle_t handle;
+	int ret;
+	struct starpu_task *task;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_variable_data_register(&handle, 0, (uintptr_t)&foo, sizeof(foo));
+
+#define SUBMIT(mode0, mode1) \
+	{ \
+		task = starpu_task_create();	\
+		task->handles[0] = handle;	\
+		task->handles[1] = handle;		 \
+		enum starpu_access_mode smode0 = STARPU_##mode0;	\
+		enum starpu_access_mode smode1 = STARPU_##mode1;	\
+		if      (smode0 == STARPU_R && smode1 == STARPU_R)	\
+			task->cl = &codelet_R_R;			\
+		else if (smode0 == STARPU_R && smode1 == STARPU_W)	\
+			task->cl = &codelet_R_W;			\
+		else if (smode0 == STARPU_R && smode1 == STARPU_RW)	\
+			task->cl = &codelet_R_RW;			\
+		else if (smode0 == STARPU_W && smode1 == STARPU_R)	\
+			task->cl = &codelet_W_R;			\
+		else if (smode0 == STARPU_W && smode1 == STARPU_W)	\
+			task->cl = &codelet_W_W;			\
+		else if (smode0 == STARPU_W && smode1 == STARPU_RW)	\
+			task->cl = &codelet_W_RW;			\
+		else if (smode0 == STARPU_RW && smode1 == STARPU_R)	\
+			task->cl = &codelet_RW_R;			\
+		else if (smode0 == STARPU_RW && smode1 == STARPU_W)	\
+			task->cl = &codelet_RW_W;			\
+		else if (smode0 == STARPU_RW && smode1 == STARPU_RW)	\
+			task->cl = &codelet_RW_RW;			\
+									\
+		ret = starpu_task_submit(task);				\
+		if (ret == -ENODEV) goto enodev;			\
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");   \
+	}
+
+	SUBMIT(R,R);
+	SUBMIT(R,W);
+	SUBMIT(R,RW);
+	SUBMIT(W,R);
+	SUBMIT(W,W);
+	SUBMIT(W,RW);
+	SUBMIT(RW,R);
+	SUBMIT(RW,W);
+	SUBMIT(RW,RW);
+
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+	starpu_data_unregister(handle);
+	starpu_shutdown();
+
+        return EXIT_SUCCESS;
+
+enodev:
+	starpu_data_unregister(handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+}

+ 142 - 28
tests/datawizard/dsm_stress.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,6 +21,7 @@
 #include <starpu.h>
 #include <stdlib.h>
 #include <pthread.h>
+#include "../helper.h"
 
 #define N	10000
 
@@ -33,7 +34,7 @@ static unsigned finished = 0;
 
 static unsigned cnt = N;
 
-starpu_data_handle v_handle, v_handle2;
+starpu_data_handle_t v_handle, v_handle2;
 static unsigned *v;
 static unsigned *v2;
 
@@ -43,10 +44,10 @@ static void callback(void *arg)
 
 	if (res == 0)
 	{
-		pthread_mutex_lock(&mutex);
+		_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 		finished = 1;
-		pthread_cond_signal(&cond);
-		pthread_mutex_unlock(&mutex);
+		_STARPU_PTHREAD_COND_SIGNAL(&cond);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	}
 }
 
@@ -64,11 +65,12 @@ static void cpu_codelet_null(void *descr[], __attribute__ ((unused)) void *_args
 {
 }
 
-static starpu_access_mode select_random_mode(void)
+static enum starpu_access_mode select_random_mode(void)
 {
 	int r = rand();
 
-	switch (r % 3) {
+	switch (r % 3)
+	{
 		case 0:
 			return STARPU_R;
 		case 1:
@@ -79,22 +81,109 @@ static starpu_access_mode select_random_mode(void)
 	return STARPU_RW;
 }
 
+static struct starpu_codelet cl_r_r =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {cpu_codelet_null, NULL},
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_R}
+};
+
+static struct starpu_codelet cl_r_w =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {cpu_codelet_null, NULL},
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+static struct starpu_codelet cl_r_rw =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {cpu_codelet_null, NULL},
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW}
+};
+
+static struct starpu_codelet cl_w_r =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {cpu_codelet_null, NULL},
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_W, STARPU_R}
+};
+
+static struct starpu_codelet cl_w_w =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {cpu_codelet_null, NULL},
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_W, STARPU_W}
+};
+
+static struct starpu_codelet cl_w_rw =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {cpu_codelet_null, NULL},
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_W, STARPU_RW}
+};
 
-static starpu_codelet cl = {
+static struct starpu_codelet cl_rw_r =
+{
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
-	.cpu_func = cpu_codelet_null,
-	.cuda_func = cuda_codelet_null,
-        .opencl_func = opencl_codelet_null,
-	.nbuffers = 2
+	.cpu_funcs = {cpu_codelet_null, NULL},
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
+};
+
+static struct starpu_codelet cl_rw_w =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {cpu_codelet_null, NULL},
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_W}
+};
+
+static struct starpu_codelet cl_rw_rw =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {cpu_codelet_null, NULL},
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW}
 };
 
 
 int main(int argc, char **argv)
 {
-	starpu_init(NULL);
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	starpu_malloc((void **)&v, VECTORSIZE*sizeof(unsigned));
-	starpu_malloc((void **)&v2, VECTORSIZE*sizeof(unsigned));
+	ret = starpu_malloc((void **)&v, VECTORSIZE*sizeof(unsigned));
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc");
+	ret = starpu_malloc((void **)&v2, VECTORSIZE*sizeof(unsigned));
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc");
 
 	starpu_vector_data_register(&v_handle, 0, (uintptr_t)v, VECTORSIZE, sizeof(unsigned));
 	starpu_vector_data_register(&v_handle2, 0, (uintptr_t)v2, VECTORSIZE, sizeof(unsigned));
@@ -103,36 +192,61 @@ int main(int argc, char **argv)
 	for (iter = 0; iter < N; iter++)
 	{
 		struct starpu_task *task = starpu_task_create();
-		task->cl = &cl;
 
-		task->buffers[0].handle = v_handle;
-		task->buffers[0].mode = select_random_mode();
-
-		task->buffers[1].handle = v_handle2;
-		task->buffers[1].mode = select_random_mode();
+		task->handles[0] = v_handle;
+		task->handles[1] = v_handle2;
+
+		enum starpu_access_mode mode0 = select_random_mode();
+		enum starpu_access_mode mode1 = select_random_mode();
+
+		if (mode0 == STARPU_R && mode1 == STARPU_R)
+			task->cl = &cl_r_r;
+		else if (mode0 == STARPU_R && mode1 == STARPU_W)
+			task->cl = &cl_r_w;
+		else if (mode0 == STARPU_R && mode1 == STARPU_RW)
+			task->cl = &cl_r_rw;
+		else if (mode0 == STARPU_W && mode1 == STARPU_R)
+			task->cl = &cl_w_r;
+		else if (mode0 == STARPU_W && mode1 == STARPU_W)
+			task->cl = &cl_w_w;
+		else if (mode0 == STARPU_W && mode1 == STARPU_RW)
+			task->cl = &cl_w_rw;
+		else if (mode0 == STARPU_RW && mode1 == STARPU_R)
+			task->cl = &cl_rw_r;
+		else if (mode0 == STARPU_RW && mode1 == STARPU_W)
+			task->cl = &cl_rw_w;
+		else if (mode0 == STARPU_RW && mode1 == STARPU_RW)
+			task->cl = &cl_rw_rw;
 
 		task->callback_func = callback;
 		task->callback_arg = NULL;
 
 		int ret = starpu_task_submit(task);
-		if (ret == -ENODEV)
-			goto enodev;
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 
-	pthread_mutex_lock(&mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	if (!finished)
-		pthread_cond_wait(&cond, &mutex);
-	pthread_mutex_unlock(&mutex);
+		_STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
+	starpu_data_unregister(v_handle);
+	starpu_data_unregister(v_handle2);
 	starpu_free(v);
 	starpu_free(v2);
 	starpu_shutdown();
 
-	return 0;
+	return EXIT_SUCCESS;
 
 enodev:
+	starpu_data_unregister(v_handle);
+	starpu_data_unregister(v_handle2);
+	starpu_free(v);
+	starpu_free(v2);
+	starpu_shutdown();
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
-	return 77;
+	return STARPU_TEST_SKIPPED;
 }

+ 139 - 0
tests/datawizard/gpu_register.c

@@ -0,0 +1,139 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include <starpu_cuda.h>
+#include "../helper.h"
+#include "scal.h"
+
+int main(int argc, char **argv)
+{
+	int ret;
+#ifdef STARPU_USE_CUDA
+#if CUDART_VERSION >= 4000
+	unsigned *foo_gpu;
+	unsigned *foo;
+	starpu_data_handle_t handle;
+	int n, i, size, pieces;
+	int devid;
+	unsigned workerid;
+	int chosen = -1;
+	cudaError_t cures;
+#endif
+#endif
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_CUDA
+#if CUDART_VERSION >= 4000 /* We need thread-safety of CUDA */
+	/* TODO OpenCL, too */
+	for (workerid = 0; workerid < starpu_worker_get_count(); workerid++) {
+		if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER) {
+			chosen = workerid;
+			break;
+		}
+	}
+
+	if (chosen == -1)
+		return STARPU_TEST_SKIPPED;
+
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_load_opencl_from_file("tests/datawizard/scal_opencl.cl", &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+#endif
+
+	n = starpu_worker_get_count();
+	size = 10 * n;
+
+	devid = starpu_worker_get_devid(chosen);
+	cudaSetDevice(devid);
+	cudaMalloc((void**)&foo_gpu, size * sizeof(*foo_gpu));
+
+	foo = calloc(size, sizeof(*foo));
+	for (i = 0; i < size; i++)
+		foo[i] = i;
+
+	cures = cudaMemcpy(foo_gpu, foo, size * sizeof(*foo_gpu), cudaMemcpyHostToDevice);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	starpu_vector_data_register(&handle, starpu_worker_get_memory_node(chosen), (uintptr_t)foo_gpu, size, sizeof(*foo_gpu));
+
+	/* Broadcast the data to force in-place partitioning */
+	for (i = 0; i < n; i++)
+		starpu_data_prefetch_on_node(handle, starpu_worker_get_memory_node(i), 0);
+
+	/* Even with just one worker, split in at least two */
+	if (n == 1)
+		pieces = 2;
+	else
+		pieces = n;
+
+	struct starpu_data_filter f =
+	{
+		.filter_func = starpu_block_filter_func_vector,
+		.nchildren = pieces,
+	};
+
+	starpu_data_partition(handle, &f);
+
+	for (i = 0; i < pieces; i++) {
+		struct starpu_task *task = starpu_task_create();
+
+		task->handles[0] = starpu_data_get_sub_data(handle, 1, i);
+		task->cl = &scal_codelet;
+		task->execute_on_a_specific_worker = 1;
+		task->workerid = i%n;
+
+		ret = starpu_task_submit(task);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+
+	starpu_data_unpartition(handle, starpu_worker_get_memory_node(chosen));
+	starpu_data_unregister(handle);
+
+	cudaSetDevice(devid);
+	cures = cudaMemcpy(foo, foo_gpu, size * sizeof(*foo_gpu), cudaMemcpyDeviceToHost);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	starpu_shutdown();
+
+	for (i = 0; i < size; i++) {
+		if (foo[i] != i*2) {
+			fprintf(stderr,"value %d is %d instead of %d\n", i, foo[i], 2*i);
+			return EXIT_FAILURE;
+		}
+	}
+
+        return EXIT_SUCCESS;
+
+enodev:
+	starpu_data_unregister(handle);
+#endif
+#endif
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+}

+ 23 - 17
tests/datawizard/handle_to_pointer.c

@@ -19,6 +19,7 @@
 
 #include <starpu.h>
 #include <stdlib.h>
+#include "../helper.h"
 
 static void cpu_task(void **buffers, void *args)
 {
@@ -27,7 +28,7 @@ static void cpu_task(void **buffers, void *args)
 	size_t size;
 
 	numbers = (int *) STARPU_VECTOR_GET_PTR(buffers[0]);
-	starpu_unpack_cl_args (args, &size);
+	starpu_codelet_unpack_args (args, &size);
 
 	for(i = 0; i < size; i++)
 	{
@@ -43,7 +44,7 @@ static void cuda_task(void **buffers, void *args)
 	size_t size;
 
 	numbers = (int *)STARPU_VECTOR_GET_PTR(buffers[0]);
-	starpu_unpack_cl_args (args, &size);
+	starpu_codelet_unpack_args (args, &size);
 
 	for(i = 0; i < size; i++)
 	{
@@ -52,41 +53,44 @@ static void cuda_task(void **buffers, void *args)
 }
 #endif
 
-static starpu_codelet cl = {
+static struct starpu_codelet cl =
+{
 	.where = STARPU_CPU | STARPU_CUDA,
-	.cpu_func = cpu_task,
+	.cpu_funcs = {cpu_task, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = cuda_task,
+	.cuda_funcs = {cuda_task, NULL},
 #endif
-	.nbuffers = 1
+	.nbuffers = 1,
+	.modes = {STARPU_W}
 };
 
 int main(int argc, char *argv[])
 {
-	int err;
+	int err, ret;
 	size_t i;
 	int *pointer;
-	starpu_data_handle handle;
+	starpu_data_handle_t handle;
 	static const size_t count = 123;
 
-	starpu_init(NULL);
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 
 	err = starpu_malloc((void **)&pointer, count * sizeof(int));
-	assert((err == 0) && (pointer != NULL));
+	STARPU_ASSERT((err == 0) && (pointer != NULL));
 
 	starpu_variable_data_register(&handle, 0, (uintptr_t)pointer,
 				      sizeof(int));
-	assert(starpu_handle_to_pointer(handle, 0) == pointer);
+	STARPU_ASSERT(starpu_handle_to_pointer(handle, 0) == pointer);
 	starpu_data_unregister(handle);
 
 	starpu_vector_data_register(&handle, 0, (uintptr_t)pointer,
 				    count, sizeof(int));
-	assert(starpu_handle_to_pointer(handle, 0) == pointer);
+	STARPU_ASSERT(starpu_handle_to_pointer(handle, 0) == pointer);
 	starpu_data_unregister(handle);
 
 	starpu_matrix_data_register(&handle, 0, (uintptr_t)pointer, 0,
 				    count, 1, sizeof(int));
-	assert(starpu_handle_to_pointer(handle, 0) == pointer);
+	STARPU_ASSERT(starpu_handle_to_pointer(handle, 0) == pointer);
 	starpu_data_unregister(handle);
 
 	starpu_free(pointer);
@@ -95,24 +99,26 @@ int main(int argc, char *argv[])
 	/* Lazy allocation.  */
 	starpu_vector_data_register(&handle, -1, 0 /* NULL */,
 				    count, sizeof(int));
-	assert(starpu_handle_to_pointer(handle, 0) == NULL);
+	STARPU_ASSERT(starpu_handle_to_pointer(handle, 0) == NULL);
 
 	/* Pass the handle to a task.  */
-	starpu_insert_task(&cl,
+	err = starpu_insert_task(&cl,
 			   STARPU_W, handle,
 			   STARPU_VALUE, &count, sizeof(count),
 			   0);
+	if (err == -ENODEV)
+		return STARPU_TEST_SKIPPED;
 
 	/* Acquire the handle, forcing a local allocation.  */
 	starpu_data_acquire(handle, STARPU_R);
 
 	/* Make sure we have a local pointer to it.  */
 	pointer = (int *) starpu_handle_to_pointer(handle, 0);
-	assert(pointer != NULL);
+	STARPU_ASSERT(pointer != NULL);
 	for(i = 0; i < count; i++)
 	{
 		int *numbers = (int *)pointer;
-		assert(numbers[i] == i);
+		STARPU_ASSERT(numbers[i] == i);
 	}
 	starpu_data_release(handle);
 

+ 102 - 0
tests/datawizard/in_place_partition.c

@@ -0,0 +1,102 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include "../helper.h"
+#include "scal.h"
+
+int main(int argc, char **argv)
+{
+	unsigned *foo;
+	starpu_data_handle_t handle;
+	int ret;
+	int n, i, size;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_load_opencl_from_file("tests/datawizard/scal_opencl.cl", &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+#endif
+
+	n = starpu_worker_get_count();
+	if (n == 1)
+	{
+		starpu_shutdown();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	size = 10 * n;
+
+	foo = (unsigned *) calloc(size, sizeof(*foo));
+	for (i = 0; i < size; i++)
+		foo[i] = i;
+
+	starpu_vector_data_register(&handle, 0, (uintptr_t)foo, size, sizeof(*foo));
+
+	/* Broadcast the data to force in-place partitioning */
+	for (i = 0; i < n; i++)
+		starpu_data_prefetch_on_node(handle, starpu_worker_get_memory_node(i), 0);
+
+	struct starpu_data_filter f =
+	{
+		.filter_func = starpu_block_filter_func_vector,
+		.nchildren = n,
+	};
+
+	starpu_data_partition(handle, &f);
+
+	for (i = 0; i < f.nchildren; i++) {
+		struct starpu_task *task = starpu_task_create();
+
+		task->handles[0] = starpu_data_get_sub_data(handle, 1, i);
+		task->cl = &scal_codelet;
+		task->execute_on_a_specific_worker = 1;
+		task->workerid = i;
+
+		ret = starpu_task_submit(task);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+
+	starpu_data_unpartition(handle, 0);
+	starpu_data_unregister(handle);
+	starpu_shutdown();
+
+	ret = EXIT_SUCCESS;
+	for (i = 0; i < size; i++) {
+		if (foo[i] != i*2) {
+			FPRINTF(stderr,"value %d is %d instead of %d\n", i, foo[i], 2*i);
+			ret = EXIT_FAILURE;
+		}
+	}
+
+        return ret;
+
+enodev:
+	starpu_data_unregister(handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+}

+ 78 - 31
tests/datawizard/increment_redux.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,7 +15,9 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <config.h>
 #include <starpu.h>
+#include "../helper.h"
 
 #ifdef STARPU_USE_CUDA
 #include <starpu_cuda.h>
@@ -24,9 +26,10 @@
 #include <starpu_opencl.h>
 #endif
 
+#warning memory leak
 
 static unsigned var = 0;
-static starpu_data_handle handle;
+static starpu_data_handle_t handle;
 
 /*
  *	Reduction methods
@@ -35,6 +38,8 @@ static starpu_data_handle handle;
 #ifdef STARPU_USE_CUDA
 static void redux_cuda_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *src = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
@@ -53,6 +58,8 @@ static void redux_cuda_kernel(void *descr[], void *arg)
 
 static void neutral_cuda_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	/* This is a dummy technique of course */
@@ -65,6 +72,8 @@ static void neutral_cuda_kernel(void *descr[], void *arg)
 #ifdef STARPU_USE_OPENCL
 static void redux_opencl_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned h_dst, h_src;
 
 	cl_mem d_dst = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
@@ -79,18 +88,20 @@ static void redux_opencl_kernel(void *descr[], void *arg)
 
 	h_dst += h_src;
 
-	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL); 
+	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
 }
 
 static void neutral_opencl_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned h_dst = 0;
 	cl_mem d_dst = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	cl_command_queue queue;
 	starpu_opencl_get_current_queue(&queue);
 
-	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL); 
+	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
 }
 #endif
 
@@ -98,6 +109,8 @@ static void neutral_opencl_kernel(void *descr[], void *arg)
 
 static void redux_cpu_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *src = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
 	*dst = *dst + *src;
@@ -105,31 +118,35 @@ static void redux_cpu_kernel(void *descr[], void *arg)
 
 static void neutral_cpu_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	*dst = 0;
 }
 
-static starpu_codelet redux_cl = {
+static struct starpu_codelet redux_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 #ifdef STARPU_USE_CUDA
-	.cuda_func = redux_cuda_kernel,
+	.cuda_funcs = {redux_cuda_kernel, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = redux_opencl_kernel,
+	.opencl_funcs = {redux_opencl_kernel, NULL},
 #endif
-	.cpu_func = redux_cpu_kernel,
+	.cpu_funcs = {redux_cpu_kernel, NULL},
 	.nbuffers = 2
 };
 
-static starpu_codelet neutral_cl = {
+static struct starpu_codelet neutral_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
 #ifdef STARPU_USE_CUDA
-	.cuda_func = neutral_cuda_kernel,
+	.cuda_funcs = {neutral_cuda_kernel, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = neutral_opencl_kernel,
+	.opencl_funcs = {neutral_opencl_kernel, NULL},
 #endif
-	.cpu_func = neutral_cpu_kernel,
+	.cpu_funcs = {neutral_cpu_kernel, NULL},
 	.nbuffers = 1
 };
 
@@ -141,6 +158,8 @@ static starpu_codelet neutral_cl = {
 /* dummy OpenCL implementation */
 static void increment_opencl_kernel(void *descr[], void *cl_arg __attribute__((unused)))
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	cl_mem d_token = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned h_token;
 
@@ -149,7 +168,7 @@ static void increment_opencl_kernel(void *descr[], void *cl_arg __attribute__((u
 
 	clEnqueueReadBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
 	h_token++;
-	clEnqueueWriteBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL); 
+	clEnqueueWriteBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
 }
 #endif
 
@@ -157,6 +176,8 @@ static void increment_opencl_kernel(void *descr[], void *cl_arg __attribute__((u
 #ifdef STARPU_USE_CUDA
 static void increment_cuda_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned host_token;
 
@@ -173,25 +194,33 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 
 static void increment_cpu_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	*tokenptr = *tokenptr + 1;
 }
 
-static starpu_codelet increment_cl = {
+static struct starpu_codelet increment_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 #ifdef STARPU_USE_CUDA
-	.cuda_func = increment_cuda_kernel,
+	.cuda_funcs = {increment_cuda_kernel, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = increment_opencl_kernel,
+	.opencl_funcs = {increment_opencl_kernel, NULL},
 #endif
-	.cpu_func = increment_cpu_kernel,
-	.nbuffers = 1
+	.cpu_funcs = {increment_cpu_kernel, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_REDUX}
 };
 
 int main(int argc, char **argv)
 {
-	starpu_init(NULL);
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	starpu_variable_data_register(&handle, 0, (uintptr_t)&var, sizeof(unsigned));
 
@@ -208,26 +237,44 @@ int main(int argc, char **argv)
 		for (t = 0; t < ntasks; t++)
 		{
 			struct starpu_task *task = starpu_task_create();
-	
+
 			task->cl = &increment_cl;
-	
-			task->buffers[0].mode = STARPU_REDUX;
-			task->buffers[0].handle = handle;
-	
-			int ret = starpu_task_submit(task);
-			STARPU_ASSERT(!ret);
+			task->handles[0] = handle;
 
+			int ret = starpu_task_submit(task);
+			if (ret == -ENODEV) goto enodev;
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}
 
-		starpu_data_acquire(handle, STARPU_R);
-		STARPU_ASSERT(var == ntasks*(loop + 1));
+		ret = starpu_data_acquire(handle, STARPU_R);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+		if (var != ntasks * (loop+1))
+		{
+			starpu_data_release(handle);
+			starpu_data_unregister(handle);
+			goto err;
+		}
 		starpu_data_release(handle);
 	}
 
 	starpu_data_unregister(handle);
-	STARPU_ASSERT(var == ntasks*nloops);
-	
+	if (var != ntasks * nloops)
+		goto err;
+
+	starpu_shutdown();
+
+	return EXIT_SUCCESS;
+
+enodev:
+	starpu_data_unregister(handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+
+err:
 	starpu_shutdown();
+	STARPU_RETURN(EXIT_FAILURE);
 
-	return 0;
 }

+ 255 - 0
tests/datawizard/increment_redux_lazy.c

@@ -0,0 +1,255 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "../helper.h"
+
+#ifdef STARPU_USE_CUDA
+#include <starpu_cuda.h>
+#endif
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#endif
+
+#warning memory leak
+
+static starpu_data_handle_t handle;
+
+/*
+ *	Reduction methods
+ */
+
+#ifdef STARPU_USE_CUDA
+static void redux_cuda_kernel(void *descr[], void *arg)
+{
+	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned *src = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	unsigned host_dst, host_src;
+
+	/* This is a dummy technique of course */
+	cudaMemcpy(&host_src, src, sizeof(unsigned), cudaMemcpyDeviceToHost);
+	cudaMemcpy(&host_dst, dst, sizeof(unsigned), cudaMemcpyDeviceToHost);
+	cudaThreadSynchronize();
+
+	host_dst += host_src;
+
+	cudaMemcpy(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice);
+	cudaThreadSynchronize();
+}
+
+static void neutral_cuda_kernel(void *descr[], void *arg)
+{
+	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+
+	/* This is a dummy technique of course */
+	unsigned host_dst = 0;
+	cudaMemcpy(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice);
+	cudaThreadSynchronize();
+}
+#endif
+
+#ifdef STARPU_USE_OPENCL
+static void redux_opencl_kernel(void *descr[], void *arg)
+{
+	unsigned h_dst, h_src;
+
+	cl_mem d_dst = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
+	cl_mem d_src = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	cl_command_queue queue;
+	starpu_opencl_get_current_queue(&queue);
+
+	/* This is a dummy technique of course */
+	clEnqueueReadBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
+	clEnqueueReadBuffer(queue, d_src, CL_TRUE, 0, sizeof(unsigned), (void *)&h_src, 0, NULL, NULL);
+
+	h_dst += h_src;
+
+	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
+}
+
+static void neutral_opencl_kernel(void *descr[], void *arg)
+{
+	unsigned h_dst = 0;
+	cl_mem d_dst = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
+
+	cl_command_queue queue;
+	starpu_opencl_get_current_queue(&queue);
+
+	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
+}
+#endif
+
+
+
+static void redux_cpu_kernel(void *descr[], void *arg)
+{
+	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned *src = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
+	*dst = *dst + *src;
+}
+
+static void neutral_cpu_kernel(void *descr[], void *arg)
+{
+	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	*dst = 0;
+}
+
+static struct starpu_codelet redux_cl =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {redux_cuda_kernel, NULL},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {redux_opencl_kernel, NULL},
+#endif
+	.cpu_funcs = {redux_cpu_kernel, NULL},
+	.nbuffers = 2
+};
+
+static struct starpu_codelet neutral_cl =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {neutral_cuda_kernel, NULL},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {neutral_opencl_kernel, NULL},
+#endif
+	.cpu_funcs = {neutral_cpu_kernel, NULL},
+	.nbuffers = 1
+};
+
+/*
+ *	Increment codelet
+ */
+
+#ifdef STARPU_USE_OPENCL
+/* dummy OpenCL implementation */
+static void increment_opencl_kernel(void *descr[], void *cl_arg __attribute__((unused)))
+{
+	cl_mem d_token = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned h_token;
+
+	cl_command_queue queue;
+	starpu_opencl_get_current_queue(&queue);
+
+	clEnqueueReadBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
+	h_token++;
+	clEnqueueWriteBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
+}
+#endif
+
+
+#ifdef STARPU_USE_CUDA
+static void increment_cuda_kernel(void *descr[], void *arg)
+{
+	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned host_token;
+
+	/* This is a dummy technique of course */
+	cudaMemcpy(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost);
+	cudaThreadSynchronize();
+
+	host_token++;
+
+	cudaMemcpy(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice);
+	cudaThreadSynchronize();
+}
+#endif
+
+static void increment_cpu_kernel(void *descr[], void *arg)
+{
+	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	*tokenptr = *tokenptr + 1;
+}
+
+static struct starpu_codelet increment_cl =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {increment_cuda_kernel, NULL},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {increment_opencl_kernel, NULL},
+#endif
+	.cpu_funcs = {increment_cpu_kernel, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_REDUX}
+};
+
+int main(int argc, char **argv)
+{
+	int ret;
+	unsigned *var;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_variable_data_register(&handle, -1, (uintptr_t)NULL, sizeof(unsigned));
+
+	starpu_data_set_reduction_methods(handle, &redux_cl, &neutral_cl);
+
+	unsigned ntasks = 1024;
+	unsigned nloops = 16;
+
+	unsigned loop;
+	unsigned t;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		for (t = 0; t < ntasks; t++)
+		{
+			struct starpu_task *task = starpu_task_create();
+
+			task->cl = &increment_cl;
+			task->handles[0] = handle;
+
+			int ret = starpu_task_submit(task);
+			if (ret == -ENODEV) goto enodev;
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		}
+
+		ret = starpu_data_acquire(handle, STARPU_R);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+		var = (unsigned*) starpu_variable_get_local_ptr(handle);
+		STARPU_ASSERT(*var == ntasks*(loop + 1));
+		starpu_data_release(handle);
+	}
+
+	ret = starpu_data_acquire(handle, STARPU_R);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+	var = (unsigned*) starpu_variable_get_local_ptr(handle);
+	STARPU_ASSERT(*var == ntasks*nloops);
+	starpu_data_release(handle);
+	starpu_data_unregister(handle);
+
+	starpu_shutdown();
+
+	STARPU_RETURN(EXIT_SUCCESS);
+
+enodev:
+	starpu_data_unregister(handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	STARPU_RETURN(STARPU_TEST_SKIPPED);
+}

+ 102 - 30
tests/datawizard/increment_redux_v2.c

@@ -14,7 +14,9 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <config.h>
 #include <starpu.h>
+#include "../helper.h"
 
 #ifdef STARPU_USE_CUDA
 #include <starpu_cuda.h>
@@ -23,9 +25,10 @@
 #include <starpu_opencl.h>
 #endif
 
+#warning memory leak
 
 static unsigned var = 0;
-static starpu_data_handle handle;
+static starpu_data_handle_t handle;
 
 /*
  *	Reduction methods
@@ -34,6 +37,8 @@ static starpu_data_handle handle;
 #ifdef STARPU_USE_CUDA
 static void redux_cuda_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *src = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
@@ -52,6 +57,8 @@ static void redux_cuda_kernel(void *descr[], void *arg)
 
 static void neutral_cuda_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	/* This is a dummy technique of course */
@@ -64,6 +71,8 @@ static void neutral_cuda_kernel(void *descr[], void *arg)
 #ifdef STARPU_USE_OPENCL
 static void redux_opencl_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned h_dst, h_src;
 
 	cl_mem d_dst = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
@@ -78,18 +87,20 @@ static void redux_opencl_kernel(void *descr[], void *arg)
 
 	h_dst += h_src;
 
-	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL); 
+	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
 }
 
 static void neutral_opencl_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned h_dst = 0;
 	cl_mem d_dst = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	cl_command_queue queue;
 	starpu_opencl_get_current_queue(&queue);
 
-	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL); 
+	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
 }
 #endif
 
@@ -97,6 +108,8 @@ static void neutral_opencl_kernel(void *descr[], void *arg)
 
 static void redux_cpu_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *src = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
 	*dst = *dst + *src;
@@ -104,31 +117,35 @@ static void redux_cpu_kernel(void *descr[], void *arg)
 
 static void neutral_cpu_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	*dst = 0;
 }
 
-static starpu_codelet redux_cl = {
+static struct starpu_codelet redux_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 #ifdef STARPU_USE_CUDA
-	.cuda_func = redux_cuda_kernel,
+	.cuda_funcs = {redux_cuda_kernel, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = redux_opencl_kernel,
+	.opencl_funcs = {redux_opencl_kernel, NULL},
 #endif
-	.cpu_func = redux_cpu_kernel,
+	.cpu_funcs = {redux_cpu_kernel, NULL},
 	.nbuffers = 2
 };
 
-static starpu_codelet neutral_cl = {
+static struct starpu_codelet neutral_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
 #ifdef STARPU_USE_CUDA
-	.cuda_func = neutral_cuda_kernel,
+	.cuda_funcs = {neutral_cuda_kernel, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = neutral_opencl_kernel,
+	.opencl_funcs = {neutral_opencl_kernel, NULL},
 #endif
-	.cpu_func = neutral_cpu_kernel,
+	.cpu_funcs = {neutral_cpu_kernel, NULL},
 	.nbuffers = 1
 };
 
@@ -140,6 +157,8 @@ static starpu_codelet neutral_cl = {
 /* dummy OpenCL implementation */
 static void increment_opencl_kernel(void *descr[], void *cl_arg __attribute__((unused)))
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	cl_mem d_token = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned h_token;
 
@@ -148,7 +167,7 @@ static void increment_opencl_kernel(void *descr[], void *cl_arg __attribute__((u
 
 	clEnqueueReadBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
 	h_token++;
-	clEnqueueWriteBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL); 
+	clEnqueueWriteBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
 }
 #endif
 
@@ -156,6 +175,8 @@ static void increment_opencl_kernel(void *descr[], void *cl_arg __attribute__((u
 #ifdef STARPU_USE_CUDA
 static void increment_cuda_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned host_token;
 
@@ -172,25 +193,47 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 
 static void increment_cpu_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	*tokenptr = *tokenptr + 1;
 }
 
-static starpu_codelet increment_cl = {
+static struct starpu_codelet increment_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 #ifdef STARPU_USE_CUDA
-	.cuda_func = increment_cuda_kernel,
+	.cuda_funcs = {increment_cuda_kernel, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = increment_opencl_kernel,
+	.opencl_funcs = {increment_opencl_kernel, NULL},
 #endif
-	.cpu_func = increment_cpu_kernel,
-	.nbuffers = 1
+	.cpu_funcs = {increment_cpu_kernel, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+static struct starpu_codelet increment_cl_redux =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {increment_cuda_kernel, NULL},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {increment_opencl_kernel, NULL},
+#endif
+	.cpu_funcs = {increment_cpu_kernel, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_REDUX}
 };
 
 int main(int argc, char **argv)
 {
-	starpu_init(NULL);
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	starpu_variable_data_register(&handle, 0, (uintptr_t)&var, sizeof(unsigned));
 
@@ -207,26 +250,55 @@ int main(int argc, char **argv)
 		for (t = 0; t < ntasks; t++)
 		{
 			struct starpu_task *task = starpu_task_create();
-	
-			task->cl = &increment_cl;
-	
-			task->buffers[0].mode = (t % 10 == 0)?STARPU_RW:STARPU_REDUX;
-			task->buffers[0].handle = handle;
-	
-			int ret = starpu_task_submit(task);
-			STARPU_ASSERT(!ret);
 
+			if (t % 10 == 0)
+			{
+				task->cl = &increment_cl;
+			}
+			else
+			{
+				task->cl = &increment_cl_redux;
+			}
+			task->handles[0] = handle;
+
+			int ret = starpu_task_submit(task);
+			if (ret == -ENODEV) goto enodev;
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}
 
-		starpu_data_acquire(handle, STARPU_R);
-		STARPU_ASSERT(var == ntasks*(loop + 1));
+		ret = starpu_data_acquire(handle, STARPU_R);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+		if (var != ntasks *(loop+1))
+		{
+			_STARPU_DEBUG("%d != %d\n", var, ntasks*(loop+1));
+			starpu_data_release(handle);
+			starpu_data_unregister(handle);
+			goto err;
+		}
 		starpu_data_release(handle);
 	}
 
 	starpu_data_unregister(handle);
-	STARPU_ASSERT(var == ntasks*nloops);
+	if (var != ntasks *nloops)
+	{
+		_STARPU_DEBUG("%d != %d\n", var, ntasks*nloops);
+		goto err;
+	}
 	
+
 	starpu_shutdown();
 
-	return 0;
+	return EXIT_SUCCESS;
+
+enodev:
+	starpu_data_unregister(handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+
+err:
+	starpu_shutdown();
+	STARPU_RETURN(EXIT_FAILURE);
 }
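Note (editorial): the hunks above switch this test to explicit STARPU_RW/STARPU_REDUX access modes, but they do not show where the reduction codelets are attached to the data handle. In StarPU 1.0 that wiring is normally done right after registering the variable, with starpu_data_set_reduction_methods(); a minimal sketch using the codelets defined above (the call site itself is an assumption, not part of this diff):

	/* Sketch, not part of this diff: attach the reduction and
	 * initialization codelets to 'handle' (registered in main()),
	 * so that STARPU_REDUX accesses made through increment_cl_redux
	 * are combined with redux_cl and initialized with neutral_cl. */
	starpu_data_set_reduction_methods(handle, &redux_cl, &neutral_cl);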

+ 70 - 0
tests/datawizard/interfaces/bcsr/bcsr_cuda.cu

@@ -0,0 +1,70 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+#include <starpu_cuda.h>
+#include "../test_interfaces.h"
+
+extern struct test_config bcsr_config;
+
+__global__ void bcsr_cuda(int *nzval, uint32_t nnz, int *err, int factor)
+{
+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
+
+	if (i >= nnz)
+		return;
+
+	if (nzval[i] != i*factor)
+		*err = 1;
+	else
+		nzval[i] = -nzval[i];
+}
+
+extern "C" void test_bcsr_cuda_func(void *buffers[], void *args)
+{
+	int factor;
+	int *ret;
+	int *val;
+	cudaError_t error;
+	uint32_t nnz = STARPU_BCSR_GET_NNZ(buffers[0]);
+	unsigned threads_per_block = 64;
+	unsigned nblocks = (nnz + threads_per_block-1) / threads_per_block;
+
+	factor = *(int *) args;
+	//val = (int *) starpu_bcsr_get_local_nzval((starpu_data_handle_t)buffers[0]);
+	val = (int *) STARPU_BCSR_GET_NZVAL(buffers[0]);
+
+	error = cudaMalloc(&ret, sizeof(int));
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+	error = cudaMemcpy(ret,
+			   &bcsr_config.copy_failed,
+			   sizeof(int),
+			   cudaMemcpyHostToDevice);
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+        bcsr_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>
+		(val, nnz, ret, factor);
+
+	error = cudaMemcpy(&bcsr_config.copy_failed,
+			   ret,
+			   sizeof(int),
+			   cudaMemcpyDeviceToHost);
+	
+	cudaFree(ret);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}

+ 198 - 0
tests/datawizard/interfaces/bcsr/bcsr_interface.c

@@ -0,0 +1,198 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <config.h>
+#include <starpu.h>
+#include "../test_interfaces.h"
+#include "../../../helper.h"
+
+/*
+ * XXX : These values should not be changed. If you really understand all that
+ * BCSR stuff, feel free to write a better example :)
+ */
+
+/* Size of the matrix */
+#define WIDTH          4
+#define HEIGHT         4
+#define SIZE           (WIDTH * HEIGHT)
+
+/* Size of the blocks */
+#define R              2
+#define C              2
+#define BLOCK_SIZE     (R*C)
+
+/* The matrix is simply 0 1 2... There are SIZE-1 non zero values... */
+#define NNZ            (SIZE-1)
+
+/* ... and SIZE/BLOCK_SIZE non zero blocks */
+#define NNZ_BLOCKS     (SIZE/BLOCK_SIZE)
+
+
+#ifdef STARPU_USE_CPU
+static void test_bcsr_cpu_func(void *buffers[], void *args);
+#endif /* !STARPU_USE_CPU */
+#ifdef STARPU_USE_CUDA
+extern void test_bcsr_cuda_func(void *buffers[], void *_args);
+#endif
+#ifdef STARPU_USE_OPENCL
+extern void test_bcsr_opencl_func(void *buffers[], void *args);
+#endif
+
+
+static int nzval[NNZ];
+static int nzval2[NNZ];
+
+static uint32_t colind[NNZ_BLOCKS];
+static uint32_t colind2[NNZ_BLOCKS];
+
+static uint32_t rowptr[1+WIDTH/R];
+static uint32_t rowptr2[1+WIDTH/R];
+
+static starpu_data_handle_t bcsr_handle;
+static starpu_data_handle_t bcsr2_handle;
+
+
+struct test_config bcsr_config =
+{
+#ifdef STARPU_USE_CPU
+	.cpu_func      = test_bcsr_cpu_func,
+#endif /* !STARPU_USE_CPU */
+#ifdef STARPU_USE_CUDA
+	.cuda_func     = test_bcsr_cuda_func,
+#endif /* !STARPU_USE_CUDA */
+#ifdef STARPU_USE_OPENCL
+	.opencl_func   = test_bcsr_opencl_func,
+#endif /* !STARPU_USE_OPENCL */
+	.handle        = &bcsr_handle,
+	.dummy_handle  = &bcsr2_handle,
+	.copy_failed   = 0,
+	.name          = "bcsr_interface"
+};
+
+static void
+register_data(void)
+{
+	int i;
+
+	for (i = 0; i < NNZ; i++)
+		nzval[i] = i;
+
+	colind[0] = 0;
+	colind[1] = 2;
+	colind[2] = 0;
+	colind[3] = 2;
+
+	rowptr[0] = 0;
+	rowptr[1] = 2;
+	rowptr[2] = 4;
+	
+	starpu_bcsr_data_register(&bcsr_handle,
+				  0,
+				  NNZ_BLOCKS,
+				  HEIGHT/R,
+				  (uintptr_t) nzval,
+				  colind,
+				  rowptr,
+				  0,
+				  R,
+				  C,
+				  sizeof(nzval[0]));
+
+	starpu_bcsr_data_register(&bcsr2_handle,
+				  0,
+				  NNZ_BLOCKS,
+				  HEIGHT/R,
+				  (uintptr_t) nzval2,
+				  colind2,
+				  rowptr2,
+				  0,
+				  R,
+				  C,
+				  sizeof(nzval2[0]));
+}
+
+static void
+unregister_data(void)
+{
+	starpu_data_unregister(bcsr_handle);
+	starpu_data_unregister(bcsr2_handle);
+}
+
+static void
+test_bcsr_cpu_func(void *buffers[], void *args)
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	int *val;
+	int factor;
+	int i;
+
+	uint32_t nnz = STARPU_BCSR_GET_NNZ(buffers[0]);
+	val = (int *) STARPU_BCSR_GET_NZVAL(buffers[0]);
+	factor = *(int *) args;
+
+	for (i = 0; i < nnz; i++)
+	{
+		if (val[i] != i * factor)
+		{
+			bcsr_config.copy_failed = 1;
+			return;
+		}
+		val[i] *= -1;
+	}
+
+	/* Check colind */
+	uint32_t *col = STARPU_BCSR_GET_COLIND(buffers[0]);
+	for (i = 0; i < NNZ_BLOCKS; i++)
+		if (col[i] != colind[i])
+			bcsr_config.copy_failed = 1;
+
+	/* Check rowptr */
+	uint32_t *row = STARPU_BCSR_GET_ROWPTR(buffers[0]);
+	for (i = 0; i < 1 + WIDTH/R; i++)
+		if (row[i] != rowptr[i])
+			bcsr_config.copy_failed = 1;
+}
+
+int
+main(void)
+{
+	data_interface_test_summary *summary;
+	struct starpu_conf conf =
+	{
+		.ncpus   = -1,
+		.ncuda   = 2,
+		.nopencl = 1
+	};
+
+	if (starpu_init(&conf) == -ENODEV)
+		return STARPU_TEST_SKIPPED;
+
+	register_data();
+
+	summary = run_tests(&bcsr_config);
+	if (!summary)
+		exit(EXIT_FAILURE);
+
+	unregister_data();
+
+	starpu_shutdown();
+
+	data_interface_test_summary_print(stderr, summary);
+
+	return data_interface_test_summary_success(summary);
+}
+
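Note (editorial): bcsr_interface.c only declares the per-device test functions and fills in the test_config structure; the codelet that dispatches to them is built by the shared driver behind ../test_interfaces.h, which is not part of this excerpt. For reference, grouping these functions in the codelet style used earlier in this commit would look roughly like the sketch below (bcsr_test_cl and its access mode are illustrative, not taken from the diff):

	/* Illustrative only: the real codelet is created by the generic
	 * interface test driver, not by this file. */
	static struct starpu_codelet bcsr_test_cl =
	{
		.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
	#ifdef STARPU_USE_CPU
		.cpu_funcs = {test_bcsr_cpu_func, NULL},
	#endif
	#ifdef STARPU_USE_CUDA
		.cuda_funcs = {test_bcsr_cuda_func, NULL},
	#endif
	#ifdef STARPU_USE_OPENCL
		.opencl_funcs = {test_bcsr_opencl_func, NULL},
	#endif
		.nbuffers = 1,
		.modes = {STARPU_RW}
	};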

+ 130 - 0
tests/datawizard/interfaces/bcsr/bcsr_opencl.c

@@ -0,0 +1,130 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <config.h>
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include "../test_interfaces.h"
+
+#define KERNEL_LOCATION "tests/datawizard/interfaces/bcsr/bcsr_opencl_kernel.cl"
+extern struct test_config bcsr_config;
+static struct starpu_opencl_program opencl_program;
+
+void
+test_bcsr_opencl_func(void *buffers[], void *args)
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	int id, devid, ret;
+	int factor = *(int *) args;
+
+        cl_int             err;
+	cl_kernel          kernel;
+	cl_command_queue   queue;
+	cl_event           event;
+
+	ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION, &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+
+	uint32_t nnz = STARPU_BCSR_GET_NNZ(buffers[0]);
+	cl_mem nzval = (cl_mem)STARPU_BCSR_GET_NZVAL(buffers[0]);
+
+	cl_context context;
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+	starpu_opencl_get_context(devid, &context);
+
+	cl_mem fail = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+		sizeof(int), &bcsr_config.copy_failed, &err);
+
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+
+	err = starpu_opencl_load_kernel(&kernel,
+					&queue,
+					&opencl_program,
+					"test_bcsr_opencl",
+					devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	int nargs;
+	nargs = starpu_opencl_set_kernel_args(&err, &kernel,
+					      sizeof(nzval), &nzval,
+					      sizeof(nnz), &nnz,
+					      sizeof(fail), &fail,
+					      sizeof(factor), &factor,
+					      0);
+
+	if (nargs != 4)
+	{
+		fprintf(stderr, "Failed to set argument #%d\n", err);
+		STARPU_OPENCL_REPORT_ERROR(err);
+	}
+			
+	{
+		size_t global = nnz;
+		size_t local;
+                size_t s;
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+
+                err = clGetKernelWorkGroupInfo (kernel,
+						device,
+						CL_KERNEL_WORK_GROUP_SIZE,
+						sizeof(local),
+						&local,
+						&s);
+                if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+
+                if (local > global)
+			local = global;
+
+		err = clEnqueueNDRangeKernel(queue,
+					kernel,
+					1,
+					NULL,
+					&global,
+					&local,
+					0,
+					NULL,
+					&event);
+
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	err = clEnqueueReadBuffer(queue,
+				  fail,
+				  CL_TRUE,
+				  0, 
+				  sizeof(int),
+				  &bcsr_config.copy_failed,
+				  0,
+				  NULL,
+				  NULL);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+        starpu_opencl_unload_opencl(&opencl_program);
+}

+ 29 - 0
tests/datawizard/interfaces/bcsr/bcsr_opencl_kernel.cl

@@ -0,0 +1,29 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+__kernel void test_bcsr_opencl(__global int *val,
+			       unsigned int nx,
+			       __global int *err,
+			       int factor)
+{
+        const int i = get_global_id(0);
+        if (i >=  nx)
+		return;
+
+	if (val[i] != i * factor)
+		*err = 1;
+	else
+		val[i] = - val[i];
+}

+ 80 - 0
tests/datawizard/interfaces/block/block_cuda.cu

@@ -0,0 +1,80 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+#include <starpu_cuda.h>
+#include "../test_interfaces.h"
+
+extern struct test_config block_config;
+
+static __global__ void block_cuda(int *block,
+				  int nx, int ny, int nz,
+				  unsigned ldy, unsigned ldz,
+				  float factor, int *err)
+{
+        int i, j, k;
+	int val = 0;
+
+        for (k = 0; k < nz ;k++)
+	{
+                for (j = 0; j < ny ;j++)
+		{
+                        for(i = 0; i < nx ;i++)
+			{
+				if (block[(k*ldz)+(j*ldy)+i] != factor * val)
+				{
+					*err = 1;
+					return;
+				}
+				else
+				{
+					block[(k*ldz)+(j*ldy)+i] *= -1;
+					val++;
+				}
+			}
+                }
+        }
+}
+
+extern "C" void test_block_cuda_func(void *buffers[], void *args)
+{
+	cudaError_t error;
+	int *ret;
+
+	error = cudaMalloc(&ret, sizeof(int));
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+	error = cudaMemcpy(ret, &block_config.copy_failed, sizeof(int), cudaMemcpyHostToDevice);
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+	int nx = STARPU_BLOCK_GET_NX(buffers[0]);
+	int ny = STARPU_BLOCK_GET_NY(buffers[0]);
+	int nz = STARPU_BLOCK_GET_NZ(buffers[0]);
+        unsigned ldy = STARPU_BLOCK_GET_LDY(buffers[0]);
+        unsigned ldz = STARPU_BLOCK_GET_LDZ(buffers[0]);
+	int *block = (int *) STARPU_BLOCK_GET_PTR(buffers[0]);
+	int factor = *(int*) args;
+
+        block_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>
+		(block, nx, ny, nz, ldy, ldz, factor, ret);
+	error = cudaMemcpy(&block_config.copy_failed, ret, sizeof(int), cudaMemcpyDeviceToHost);
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+	cudaFree(ret);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}

+ 163 - 0
tests/datawizard/interfaces/block/block_interface.c

@@ -0,0 +1,163 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <config.h>
+#include <starpu.h>
+#include "../test_interfaces.h"
+#include "../../../helper.h"
+
+#define NX 16
+#define NY NX
+#define NZ NX
+
+/* Prototypes */
+static void register_data(void);
+static void unregister_data(void);
+static void test_block_cpu_func(void *buffers[], void *args);
+#ifdef STARPU_USE_CUDA
+extern void test_block_cuda_func(void *buffers[], void *_args);
+#endif
+#ifdef STARPU_USE_OPENCL
+extern void test_block_opencl_func(void *buffers[], void *args);
+#endif
+
+
+static starpu_data_handle_t block_handle;
+static starpu_data_handle_t block2_handle;
+
+struct test_config block_config =
+{
+	.cpu_func      = test_block_cpu_func,
+#ifdef STARPU_USE_CUDA
+	.cuda_func     = test_block_cuda_func,
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_func   = test_block_opencl_func,
+#endif
+	.handle        = &block_handle,
+	.dummy_handle  = &block2_handle,
+	.copy_failed   = 0,
+	.name          = "block_interface"
+};
+
+static int block[NX*NY*NZ];
+static int block2[NX*NY*NZ];
+
+static void
+register_data(void)
+{
+	/* Initializing data */
+	int val = 0;
+	int i, j, k;
+	for (k = 0; k < NZ; k++)
+		for (j = 0; j < NY; j++)
+			for (i = 0; i < NX; i++)
+                                block[(k*NX*NY)+(j*NX)+i] = val++;
+
+	/* Registering data */
+	starpu_block_data_register(&block_handle,
+                                    0,
+                                    (uintptr_t)block,
+				    NX,
+				    NX * NY,
+				    NX,
+				    NY,
+				    NZ,
+				    sizeof(block[0]));
+	starpu_block_data_register(&block2_handle,
+                                    0,
+                                    (uintptr_t)block2,
+				    NX,
+				    NX * NY,
+				    NX,
+				    NY,
+				    NZ,
+				    sizeof(block2[0]));
+}
+
+static void
+unregister_data(void)
+{
+	starpu_data_unregister(block_handle);
+	starpu_data_unregister(block2_handle);
+}
+
+static void test_block_cpu_func(void *buffers[], void *args)
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	int factor = *(int*)args;
+	int nx = STARPU_BLOCK_GET_NX(buffers[0]);
+	int ny = STARPU_BLOCK_GET_NY(buffers[0]);
+	int nz = STARPU_BLOCK_GET_NZ(buffers[0]);
+        unsigned ldy = STARPU_BLOCK_GET_LDY(buffers[0]);
+        unsigned ldz = STARPU_BLOCK_GET_LDZ(buffers[0]);
+	int *block = (int *) STARPU_BLOCK_GET_PTR(buffers[0]);
+	unsigned int i, j, k;
+	int val = 0;
+	block_config.copy_failed = 0;
+	for (k = 0; k < nz; k++)
+	{
+		for (j = 0; j < ny; j++)
+		{
+			for (i = 0; i < nx; i++)
+			{
+                                if (block[(k*ldz)+(j*ldy)+i] != factor * val)
+				{
+					block_config.copy_failed = 1;
+					return;
+				}
+				else
+				{
+					block[(k*ldz)+(j*ldy)+i] *= -1;
+					val++;
+				}
+			}
+		}
+	}
+}
+
+int
+main(void)
+{
+	data_interface_test_summary *summary;
+	struct starpu_conf conf =
+	{
+		.ncpus   = -1,
+		.ncuda   = 2,
+		.nopencl = 1
+	};
+
+	if (starpu_init(&conf) == -ENODEV)
+		goto enodev;
+
+	register_data();
+
+	summary = run_tests(&block_config);
+	if (!summary)
+		exit(EXIT_FAILURE);
+
+	unregister_data();
+
+	starpu_shutdown();
+
+	data_interface_test_summary_print(stderr, summary);
+
+	return data_interface_test_summary_success(summary);
+
+enodev:
+	return STARPU_TEST_SKIPPED;
+}
+

+ 120 - 0
tests/datawizard/interfaces/block/block_opencl.c

@@ -0,0 +1,120 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <config.h>
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include "../test_interfaces.h"
+
+#define KERNEL_LOCATION "tests/datawizard/interfaces/block/block_opencl_kernel.cl"
+extern struct test_config block_config;
+static struct starpu_opencl_program opencl_program;
+
+void
+test_block_opencl_func(void *buffers[], void *args)
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	int id, devid, ret;
+	int factor = *(int *) args;
+
+        cl_int             err;
+	cl_kernel          kernel;
+	cl_command_queue   queue;
+	cl_event           event;
+
+	ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION, &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+
+	int nx = STARPU_BLOCK_GET_NX(buffers[0]);
+	int ny = STARPU_BLOCK_GET_NY(buffers[0]);
+	int nz = STARPU_BLOCK_GET_NZ(buffers[0]);
+        unsigned ldy = STARPU_BLOCK_GET_LDY(buffers[0]);
+        unsigned ldz = STARPU_BLOCK_GET_LDZ(buffers[0]);
+	cl_mem block = (cl_mem) STARPU_BLOCK_GET_DEV_HANDLE(buffers[0]);
+
+	cl_context context;
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+	starpu_opencl_get_context(devid, &context);
+
+	cl_mem fail = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+		sizeof(int), &block_config.copy_failed, &err);
+
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+
+	err = starpu_opencl_load_kernel(&kernel,
+					&queue,
+					&opencl_program,
+					"block_opencl",
+					devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	int nargs;
+	nargs = starpu_opencl_set_kernel_args(&err, &kernel,
+					      sizeof(block), &block,
+					      sizeof(nx), &nx,
+					      sizeof(ny), &ny,
+					      sizeof(nz), &nz,
+					      sizeof(ldy), &ldy,
+					      sizeof(ldz), &ldz,
+					      sizeof(factor), &factor,
+					      sizeof(fail), &fail,
+					      0);
+
+	if (nargs != 8)
+	{
+		fprintf(stderr, "Failed to set argument #%d\n", nargs);
+		STARPU_OPENCL_REPORT_ERROR(err);
+	}
+			
+	{
+		size_t global = nx * ny * nz;
+		err = clEnqueueNDRangeKernel(queue,
+					     kernel,
+					     1,
+					     NULL,
+					     &global,
+					     NULL,
+					     0,
+					     NULL,
+					     &event);
+
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	err = clEnqueueReadBuffer(queue,
+				  fail,
+				  CL_TRUE,
+				  0, 
+				  sizeof(int),
+				  &block_config.copy_failed,
+				  0,
+				  NULL,
+				  NULL);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+        starpu_opencl_unload_opencl(&opencl_program);
+}

+ 46 - 0
tests/datawizard/interfaces/block/block_opencl_kernel.cl

@@ -0,0 +1,46 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+__kernel void block_opencl(__global int *block,
+			   int nx, int ny, int nz,
+			   int ldy, int ldz,
+			   int factor, __global int *err)
+{
+        const int id = get_global_id(0);
+	if (id > 0)
+		return;
+
+	unsigned int i, j, k;
+	int val = 0;
+	for (k = 0; k < nz; k++)
+	{
+		for (j = 0; j < ny; j++)
+		{
+			for (i = 0; i < nx; i++)
+			{
+                                if (block[(k*ldz)+(j*ldy)+i] != factor * val)
+				{
+					*err = 1;
+					return;
+				}
+				else
+				{
+					block[(k*ldz)+(j*ldy)+i] *= -1;
+					val++;
+				}
+			}
+		}
+	}
+}

+ 106 - 0
tests/datawizard/interfaces/copy_interfaces.c

@@ -0,0 +1,106 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "../../helper.h"
+#include <datawizard/coherency.h>
+
+static int check_copy(starpu_data_handle_t handle, char *header)
+{
+	void *old_interface, *new_interface;
+	starpu_data_handle_t new_handle;
+	int ret=0;
+
+	starpu_data_register_same(&new_handle, handle);
+
+	if (!getenv("STARPU_SSILENT") && new_handle->ops->display)
+	{
+		fprintf(stderr, "%s: ", header);
+		new_handle->ops->display(new_handle, stderr);
+		fprintf(stderr, "\n");
+	}
+
+	old_interface = starpu_data_get_interface_on_node(handle, 0);
+	new_interface = starpu_data_get_interface_on_node(new_handle, 0);
+
+	if (new_handle->ops->compare(old_interface, new_interface) == 0)
+	{
+		FPRINTF(stderr, "Error when copying %s data\n", header);
+		assert(0);
+		ret = 1;
+	}
+	starpu_data_unregister(handle);
+	starpu_data_unregister(new_handle);
+	return ret;
+}
+
+int main(int argc, char **argv)
+{
+	int ret;
+	starpu_data_handle_t handle;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	{
+		int x=42;
+		starpu_variable_data_register(&handle, 0, (uintptr_t)&x, sizeof(x));
+		ret = check_copy(handle, "variable");
+	}
+
+	if (ret == 0)
+	{
+		int xx[] = {12, 23, 45};
+		starpu_vector_data_register(&handle, 0, (uintptr_t)xx, 3, sizeof(xx[0]));
+		ret = check_copy(handle, "vector");
+	}
+
+	if (ret == 0)
+	{
+		int NX=3;
+		int NY=2;
+		int matrix[NX][NY];
+		starpu_matrix_data_register(&handle, 0, (uintptr_t)matrix, NX, NX, NY, sizeof(matrix[0]));
+		ret = check_copy(handle, "matrix");
+	}
+
+	if (ret == 0)
+	{
+		int NX=3;
+		int NY=2;
+		int NZ=4;
+		int block[NX*NY*NZ];
+		starpu_block_data_register(&handle, 0, (uintptr_t)block, NX, NX*NY, NX, NY, NZ, sizeof(block[0]));
+		ret = check_copy(handle, "block");
+	}
+
+	if (ret == 0)
+	{
+		uint32_t nnz = 2;
+		unsigned nrow = 5;
+		float nzvalA[20];
+		uint32_t colind[1];
+		uint32_t rowptr[2];
+		starpu_csr_data_register(&handle, 0, nnz, nrow, (uintptr_t)nzvalA, colind, rowptr, 0, sizeof(float));
+		ret = check_copy(handle, "csr");
+	}
+
+	starpu_shutdown();
+	return ret;
+}
+
+
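Note (editorial): check_copy() relies on starpu_data_register_same(), which registers a new handle carrying the same interface parameters as an existing one, and on the interface's compare() method to verify that those parameters match. A minimal stand-alone sketch of the same call outside the test harness (variable names are illustrative):

	#include <starpu.h>

	int main(void)
	{
		starpu_data_handle_t vec, same;
		int data[128];

		if (starpu_init(NULL) != 0)
			return 1;

		starpu_vector_data_register(&vec, 0, (uintptr_t)data, 128, sizeof(data[0]));
		/* 'same' gets the same interface parameters (length, element
		 * size) as 'vec'; check_copy() above compares them with the
		 * interface's compare() method. */
		starpu_data_register_same(&same, vec);

		starpu_data_unregister(same);
		starpu_data_unregister(vec);
		starpu_shutdown();
		return 0;
	}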

+ 68 - 0
tests/datawizard/interfaces/csr/csr_cuda.cu

@@ -0,0 +1,68 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+#include <starpu_cuda.h>
+#include "../test_interfaces.h"
+
+extern struct test_config csr_config;
+
+__global__ void csr_cuda(int *nzval, uint32_t nnz, int *err, int factor)
+{
+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
+
+	if (i >= nnz)
+		return;
+
+	if (nzval[i] != (i+1)*factor)
+		*err = 1;
+	else
+		nzval[i] = -nzval[i];
+}
+
+extern "C" void test_csr_cuda_func(void *buffers[], void *args)
+{
+	int factor;
+	int *ret;
+	int *val;
+	cudaError_t error;
+	uint32_t nnz = STARPU_CSR_GET_NNZ(buffers[0]);
+	unsigned threads_per_block = 64;
+	unsigned nblocks = (nnz + threads_per_block-1) / threads_per_block;
+
+	factor = *(int *) args;
+	val = (int *) STARPU_CSR_GET_NZVAL(buffers[0]);
+
+	error = cudaMalloc(&ret, sizeof(int));
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+	error = cudaMemcpy(ret,
+			   &csr_config.copy_failed,
+			   sizeof(int),
+			   cudaMemcpyHostToDevice);
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+        csr_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>> (val, nnz, ret, factor);
+
+	error = cudaMemcpy(&csr_config.copy_failed,
+			   ret,
+			   sizeof(int),
+			   cudaMemcpyDeviceToHost);
+	
+	cudaFree(ret);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}

+ 170 - 0
tests/datawizard/interfaces/csr/csr_interface.c

@@ -0,0 +1,170 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <config.h>
+#include <starpu.h>
+#include "../test_interfaces.h"
+#include "../../../helper.h"
+
+#define WIDTH  8
+#define HEIGHT 4
+#define SIZE   (WIDTH * HEIGHT)
+#define NNZ    (SIZE-1)
+
+#ifdef STARPU_USE_CPU
+static void test_csr_cpu_func(void *buffers[], void *args);
+#endif /* !STARPU_USE_CPU */
+#ifdef STARPU_USE_CUDA
+extern void test_csr_cuda_func(void *buffers[], void *_args);
+#endif
+#ifdef STARPU_USE_OPENCL
+extern void test_csr_opencl_func(void *buffers[], void *args);
+#endif
+
+
+static int nzval[NNZ];
+static int nzval2[NNZ];
+
+static uint32_t colind[NNZ];
+static uint32_t colind2[NNZ];
+
+static uint32_t rowptr[HEIGHT+1];
+static uint32_t rowptr2[HEIGHT+1];
+
+static starpu_data_handle_t csr_handle;
+static starpu_data_handle_t csr2_handle;
+
+struct test_config csr_config =
+{
+#ifdef STARPU_USE_CPU
+	.cpu_func      = test_csr_cpu_func,
+#endif /* ! STARPU_USE_CPU */
+#ifdef STARPU_USE_CUDA
+	.cuda_func     = test_csr_cuda_func,
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_func   = test_csr_opencl_func,
+#endif
+	.handle        = &csr_handle,
+	.dummy_handle  = &csr2_handle,
+	.copy_failed   = 0,
+	.name          = "csr_interface"
+};
+
+static void
+register_data(void)
+{
+	int i;
+	for (i = 1; i < SIZE; i++)
+	{
+		nzval[i-1] = i;
+		nzval2[i-1] = 42;
+
+		colind[i-1] = i % WIDTH;
+		colind2[i-1] = colind[i];
+	}
+
+	rowptr[0] = 1;
+	rowptr2[0] = 1;
+	for (i = 1; i < HEIGHT; i++)
+	{
+		rowptr[i] = i * WIDTH;
+		rowptr2[i] = rowptr[i];
+	}
+	rowptr[HEIGHT] = NNZ + 1;
+	rowptr2[HEIGHT] = rowptr[HEIGHT];
+
+	starpu_csr_data_register(&csr_handle,
+				 0,
+				 NNZ,
+				 HEIGHT,
+				 (uintptr_t) nzval,
+				 colind,
+				 rowptr,
+				 0,
+				 sizeof(nzval[0]));
+	starpu_csr_data_register(&csr2_handle,
+				 0,
+				 NNZ,
+				 HEIGHT,
+				 (uintptr_t) nzval2,
+				 colind2,
+				 rowptr2,
+				 0,
+				 sizeof(nzval2[0]));
+}
+
+static void
+unregister_data(void)
+{
+	starpu_data_unregister(csr_handle);
+	starpu_data_unregister(csr2_handle);
+}
+
+static void
+test_csr_cpu_func(void *buffers[], void *args)
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	int *val;
+	int factor;
+	int i;
+
+	uint32_t nnz = STARPU_CSR_GET_NNZ(buffers[0]);
+	val = (int *) STARPU_CSR_GET_NZVAL(buffers[0]);
+	factor = *(int *) args;
+
+	for (i = 0; i < nnz; i++)
+	{
+		if (val[i] != (i+1) * factor)
+		{
+			csr_config.copy_failed = 1;
+			return;
+		}
+		val[i] *= -1;
+	}
+}
+
+int
+main(void)
+{
+	data_interface_test_summary *summary;
+	struct starpu_conf conf =
+	{
+		.ncpus   = -1,
+		.ncuda   = 2,
+		.nopencl = 1
+	};
+
+	if (starpu_init(&conf) == -ENODEV)
+		goto enodev;
+
+	register_data();
+
+	summary = run_tests(&csr_config);
+	if (!summary)
+		exit(EXIT_FAILURE);
+
+	unregister_data();
+
+	starpu_shutdown();
+
+	data_interface_test_summary_print(stderr, summary);
+
+	return data_interface_test_summary_success(summary);
+
+enodev:
+	return STARPU_TEST_SKIPPED;
+}

+ 130 - 0
tests/datawizard/interfaces/csr/csr_opencl.c

@@ -0,0 +1,130 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <config.h>
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include "../test_interfaces.h"
+
+#define KERNEL_LOCATION "tests/datawizard/interfaces/csr/csr_opencl_kernel.cl"
+extern struct test_config csr_config;
+static struct starpu_opencl_program opencl_program;
+
+void
+test_csr_opencl_func(void *buffers[], void *args)
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	int id, devid, ret;
+	int factor = *(int *) args;
+
+        cl_int             err;
+	cl_kernel          kernel;
+	cl_command_queue   queue;
+	cl_event           event;
+
+	ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION, &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+
+	uint32_t nnz = STARPU_CSR_GET_NNZ(buffers[0]);
+	cl_mem nzval = (cl_mem)STARPU_CSR_GET_NZVAL(buffers[0]);
+
+	cl_context context;
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+	starpu_opencl_get_context(devid, &context);
+
+	cl_mem fail = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+		sizeof(int), &csr_config.copy_failed, &err);
+
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+
+	err = starpu_opencl_load_kernel(&kernel,
+					&queue,
+					&opencl_program,
+					"test_csr_opencl",
+					devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	int nargs;
+	nargs = starpu_opencl_set_kernel_args(&err, &kernel,
+					      sizeof(nzval), &nzval,
+					      sizeof(nnz), &nnz,
+					      sizeof(fail), &fail,
+					      sizeof(factor), &factor,
+					      0);
+
+	if (nargs != 4)
+	{
+		fprintf(stderr, "Failed to set argument #%d\n", err);
+		STARPU_OPENCL_REPORT_ERROR(err);
+	}
+			
+	{
+		size_t global = nnz;
+		size_t local;
+                size_t s;
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+
+                err = clGetKernelWorkGroupInfo (kernel,
+						device,
+						CL_KERNEL_WORK_GROUP_SIZE,
+						sizeof(local),
+						&local,
+						&s);
+                if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+
+                if (local > global)
+			local = global;
+
+		err = clEnqueueNDRangeKernel(queue,
+					kernel,
+					1,
+					NULL,
+					&global,
+					&local,
+					0,
+					NULL,
+					&event);
+
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	err = clEnqueueReadBuffer(queue,
+				  fail,
+				  CL_TRUE,
+				  0, 
+				  sizeof(int),
+				  &csr_config.copy_failed,
+				  0,
+				  NULL,
+				  NULL);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+        starpu_opencl_unload_opencl(&opencl_program);
+}

+ 29 - 0
tests/datawizard/interfaces/csr/csr_opencl_kernel.cl

@@ -0,0 +1,29 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+__kernel void test_csr_opencl(__global int *val,
+			      unsigned int nx,
+			      __global int *err,
+			      int factor)
+{
+        const int i = get_global_id(0);
+        if (i >=  nx)
+		return;
+
+	if (val[i] != (i+1) * factor)
+		*err = 1;
+	else
+		val[i] = - val[i];
+}

+ 71 - 0
tests/datawizard/interfaces/matrix/matrix_cuda.cu

@@ -0,0 +1,71 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+#include <starpu_cuda.h>
+#include "../test_interfaces.h"
+
+extern struct test_config matrix_config;
+
+__global__ void matrix_cuda(int *val, unsigned n, int *err, int factor)
+{
+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
+
+	if (i >= n)
+		return;
+
+	if (val[i] != i*factor)
+		*err = 1;
+	else
+		val[i] = -val[i];
+}
+
+extern "C" void test_matrix_cuda_func(void *buffers[], void *args)
+{
+	int factor;
+	int *ret;
+	int *val;
+	cudaError_t error;
+	unsigned int nx, ny, n;
+
+	nx = STARPU_MATRIX_GET_NX(buffers[0]);
+	ny = STARPU_MATRIX_GET_NY(buffers[0]);
+	n = nx * ny;
+	unsigned threads_per_block = 64;
+	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
+	factor = *(int *) args;
+	val = (int *) STARPU_MATRIX_GET_PTR(buffers[0]);
+
+	error = cudaMalloc(&ret, sizeof(int));
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+	error = cudaMemcpy(ret,
+			   &matrix_config.copy_failed,
+			   sizeof(int),
+			   cudaMemcpyHostToDevice);
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+        matrix_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(val, n, ret, factor);
+
+	error = cudaMemcpy(&matrix_config.copy_failed,
+			   ret,
+			   sizeof(int),
+			   cudaMemcpyDeviceToHost);
+	
+	cudaFree(ret);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}

+ 145 - 0
tests/datawizard/interfaces/matrix/matrix_interface.c

@@ -0,0 +1,145 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <config.h>
+#include <starpu.h>
+#include "../test_interfaces.h"
+#include "../../../helper.h"
+
+#define WIDTH  16
+#define HEIGHT 16
+
+#ifdef STARPU_USE_CPU
+static void test_matrix_cpu_func(void *buffers[], void *args);
+#endif /* !STARPU_USE_CPU */
+#ifdef STARPU_USE_CUDA
+extern void test_matrix_cuda_func(void *buffers[], void *_args);
+#endif
+#ifdef STARPU_USE_OPENCL
+extern void test_matrix_opencl_func(void *buffers[], void *args);
+#endif
+
+
+static starpu_data_handle_t matrix_handle;
+static starpu_data_handle_t matrix2_handle;
+
+struct test_config matrix_config =
+{
+#ifdef STARPU_USE_CPU
+	.cpu_func      = test_matrix_cpu_func,
+#endif /* ! STARPU_USE_CPU */
+#ifdef STARPU_USE_CUDA
+	.cuda_func     = test_matrix_cuda_func,
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_func   = test_matrix_opencl_func,
+#endif
+	.handle        = &matrix_handle,
+	.dummy_handle  = &matrix2_handle,
+	.copy_failed   = 0,
+	.name          = "matrix_interface"
+};
+
+static int matrix[WIDTH * HEIGHT];
+static int matrix2[WIDTH * HEIGHT];
+
+static void
+register_data(void)
+{
+	int i;
+	int size = WIDTH * HEIGHT;
+	for (i = 0; i < size; i++)
+		matrix[i] = i;
+
+	starpu_matrix_data_register(&matrix_handle,
+				    0,
+				    (uintptr_t) matrix,
+				    WIDTH, /* ld */
+				    WIDTH,
+				    HEIGHT,
+				    sizeof(matrix[0]));
+	starpu_matrix_data_register(&matrix2_handle,
+				    0,
+				    (uintptr_t) matrix2,
+				    WIDTH, /* ld */
+				    WIDTH,
+				    HEIGHT,
+				    sizeof(matrix[0]));
+}
+
+static void
+unregister_data(void)
+{
+	starpu_data_unregister(matrix_handle);
+	starpu_data_unregister(matrix2_handle);
+}
+
+static void
+test_matrix_cpu_func(void *buffers[], void *args)
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	int *val;
+	int factor;
+	int i;
+	unsigned int nx, ny;
+
+	nx = STARPU_MATRIX_GET_NX(buffers[0]);
+	ny = STARPU_MATRIX_GET_NY(buffers[0]);
+	val = (int *) STARPU_MATRIX_GET_PTR(buffers[0]);
+	factor = *(int *) args;
+
+	for (i = 0; i < nx*ny; i++)
+	{
+		if (val[i] != i * factor)
+		{
+			matrix_config.copy_failed = 1;
+			return;
+		}
+		val[i] *= -1;
+	}
+}
+
+int
+main(void)
+{
+	data_interface_test_summary *summary;
+	struct starpu_conf conf =
+	{
+		.ncpus   = -1,
+		.ncuda   = 2,
+		.nopencl = 1
+	};
+
+	if (starpu_init(&conf) == -ENODEV)
+		goto enodev;
+
+	register_data();
+
+	summary = run_tests(&matrix_config);
+	if (!summary)
+		exit(EXIT_FAILURE);
+
+	unregister_data();
+
+	starpu_shutdown();
+
+	data_interface_test_summary_print(stderr, summary);
+
+	return data_interface_test_summary_success(summary);
+
+enodev:
+	return STARPU_TEST_SKIPPED;
+}

+ 129 - 0
tests/datawizard/interfaces/matrix/matrix_opencl.c

@@ -0,0 +1,129 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <config.h>
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include "../test_interfaces.h"
+
+#define KERNEL_LOCATION "tests/datawizard/interfaces/matrix/matrix_opencl_kernel.cl"
+
+extern struct test_config matrix_config;
+static struct starpu_opencl_program matrix_program;
+
+void test_matrix_opencl_func(void *buffers[], void *args)
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	int id, devid, factor, ret;
+	unsigned int n;
+
+        cl_int             err;
+	cl_kernel          kernel;
+	cl_command_queue   queue;
+	cl_event           event;
+	cl_context         context;
+	cl_mem             val, fail;
+
+	ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION,
+						  &matrix_program,
+						  NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+
+	factor = *(int *)args;
+	n = STARPU_MATRIX_GET_NX(buffers[0]);
+	n*= STARPU_MATRIX_GET_NY(buffers[0]);
+	val = (cl_mem)STARPU_MATRIX_GET_DEV_HANDLE(buffers[0]);
+
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+	starpu_opencl_get_context(devid, &context);
+
+	err = starpu_opencl_load_kernel(&kernel,
+					&queue,
+					&matrix_program,
+					"matrix_opencl",
+					devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	fail = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+		sizeof(int), &matrix_config.copy_failed, &err);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	/* Setting args */
+	int nargs;
+	nargs = starpu_opencl_set_kernel_args(&err, &kernel,
+					sizeof(val), &val,
+					sizeof(n), &n,
+					sizeof(fail), &fail,
+					sizeof(factor), &factor,
+					0);
+	if (nargs != 4)
+		STARPU_OPENCL_REPORT_ERROR(err);
+	{
+		size_t global=n;
+		size_t local;
+                size_t s;
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+
+                err = clGetKernelWorkGroupInfo (kernel,
+						device,
+						CL_KERNEL_WORK_GROUP_SIZE,
+						sizeof(local),
+						&local,
+						&s);
+                if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+
+                if (local > global)
+			local = global;
+
+		err = clEnqueueNDRangeKernel(queue,
+					kernel,
+					1,
+					NULL,
+					&global,
+					&local,
+					0,
+					NULL,
+					&event);
+
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	err = clEnqueueReadBuffer(queue,
+				  fail,
+				  CL_TRUE,
+				  0, 
+				  sizeof(int),
+				  &matrix_config.copy_failed,
+				  0,
+				  NULL,
+				  NULL);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+        starpu_opencl_unload_opencl(&matrix_program);
+}
+

+ 31 - 0
tests/datawizard/interfaces/matrix/matrix_opencl_kernel.cl

@@ -0,0 +1,31 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+__kernel void matrix_opencl(__global int *val,
+				 unsigned int nx,
+				 __global int *err,
+				 int factor)
+{
+        const int i = get_global_id(0);
+	if (i >= nx)
+		return;
+
+	if (val[i] != i * factor)
+		*err = i;
+	else
+		val[i] *= -1;
+}
+

+ 0 - 0
tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets.c


Some files were not shown because too many files changed in this diff.